In [43]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from unidecode import unidecode
from imblearn.under_sampling import NearMiss
import seaborn as sns
import re
import datetime
from sklearn import preprocessing
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/uf-dataset/UFCG.csv


# Loading UFCG dataset!

In [44]:
data= pd.read_csv("/kaggle/input/uf-dataset/UFCG.csv")
#Seting columns names
data.columns = ["Notification state", "Notification municipality",
                "Are you a health professional?", "Ethnicity", "Case evolution",
                "Test Type", "Residence Status", "Total Results", "Notification Date",
                "IgA Result", "CBO","Symptom- Throat Pain", "Symptom- Dyspnea","Symptom- Fever",
                "Symptom- Cough", "Symptom- Others","Symptom- Headache","Symptom- Taste Disorders",
                "Symptom- Olfactory Disorders", "Symptom- Coryza","Symptom- Asymptomatic", "CEP",
                "Safety Professional", "Result (PCR / Rapids)", "Race / Color", "Serological Test",
                "Gender", "Foreigner", "IgM Result", "Test Status", "Test Date (Serological)", "IgG Result",
                "Conditions- Decompensated chronic respiratory diseases", "Conditions- Chronic heart diseases",
                "Conditions- Diabetes","Conditions- Chronic kidney diseases in advanced stage (grades 3, 4 or 5)",
                "Conditions- Immunosuppression", "Conditions- Pregnant", "Conditions- Carrier of chromosomal diseases or state of immunological fragility",
                "Conditions- Postpartum (up to 45 days after delivery)", "Conditions- Obesity","neighborhood",
                "PCR / Rapid test date","description of symptoms", "closing date", "final classification",
                "municipality of residence","Symptom onset date","Mother's Full Name", "Cell Phone"]
ori_len=len(data)
print("Original dataset len:", len(data))

Original dataset len: 55676


  interactivity=interactivity, compiler=compiler, result=result)


In [45]:
data=data.fillna('n')
#Dropping repeated rows
data=data.drop_duplicates(subset=data.columns,keep='first')
no_dupl=len(data)
print("deleted duplicated rows:",ori_len - no_dupl)


deleted duplicated rows: 251


# Filtering rows marked only as completed('Concluído') test,final classification as confirmed or discarded, no asymptomatic patient and only patients with test result defined as positive or negative!

In [46]:
dt=data[(data["Test Status"]=='Concluído')]
dt_len1=len(dt)
print("removed rows of test status not conluid:",no_dupl - dt_len1)

dt=dt[(dt["final classification"]!='n') & (dt["final classification"]!='Síndrome Gripal Não Especificada ')]
dt_len2=len(dt)
print("removed rows of final classification iqual to null and unspecified flu syndrome:",dt_len1 - dt_len2)

dt=dt[dt["Symptom- Asymptomatic"]=='Não']
dt_len3=len(dt)
print("removed rows of Asymptomatic patients:",dt_len2 - dt_len3)

dt=dt[(dt["Result (PCR / Rapids)"]!='n') & (dt["Result (PCR / Rapids)"]!='Inconclusivo ou Indeterminado')]
dt_len4=len(dt)
print("removed rows of test result iqual to null and inconclusive/indeterminate :",dt_len3 - dt_len4)


removed rows of test status not conluid: 4506
removed rows of final classification iqual to null and unspecified flu syndrome: 8365
removed rows of Asymptomatic patients: 1443
removed rows of test result iqual to null and inconclusive/indeterminate : 58


# Selecting columns and parting dataset!


we will only use pcr and antibody tests!

In [47]:

test_pcr=dt[(dt["Test Type"]=='RT-PCR')] 
test_rapid=dt[(dt["Test Type"]=='TESTE RÁPIDO - ANTICORPO')|
                  (dt["Test Type"]=='TESTE RÁPIDO - ANTÍGENO')]

columns=["Symptom- Throat Pain", "Symptom- Dyspnea","Symptom- Fever",
        "Symptom- Cough","Symptom- Headache","Symptom- Taste Disorders",
        "Symptom- Olfactory Disorders", "Symptom- Coryza","description of symptoms",
         "Gender","Are you a health professional?","Result (PCR / Rapids)"]
test_pcr=test_pcr[columns]
test_rapid=test_rapid[columns]
both_test_len=len(test_pcr)+len(test_rapid)
print("pcr test len:",len(test_pcr),'----',"rapid test len:",len(test_rapid))
print("removed rows of another´s types of test:",dt_len4 - both_test_len)

pcr test len: 5691 ---- rapid test len: 34591
removed rows of another´s types of test: 771


# Resolving inconsistency on the symptom columns,since, most symptoms columns are marked as no(não), but in the symptom description column, the same patient has the symptoms!

example: the column Symptom- Headache is marked as no(Não), but,the column description of symptoms,
shows that the patient has a headache(DOR DE CABEÇA)

In [48]:

test_pcr.loc[55644]

Symptom- Throat Pain                                                   Não
Symptom- Dyspnea                                                       Não
Symptom- Fever                                                         Não
Symptom- Cough                                                         Não
Symptom- Headache                                                      Não
Symptom- Taste Disorders                                               Não
Symptom- Olfactory Disorders                                           Não
Symptom- Coryza                                                        Não
description of symptoms           DOR DE CABEÇA, TONTURA, FRAQUEZA, CORIZA
Gender                                                            Feminino
Are you a health professional?                                         Não
Result (PCR / Rapids)                                             Positivo
Name: 55644, dtype: object

Before applying the algorithm to update the symptom columns, we performed an analysis of the most frequent symptoms described in the symptom description column!

In [49]:
def update_symptoms_columns(result):
    Lsymt=data["description of symptoms"]
    k=0
    l=0
    for symt in Lsymt:
        #removing accentuation
        symt=unidecode(symt)
        symt=symt.upper()
        #removing special symbols
        symt=re.split( r'[),;.+/(]',symt)
            
        for s in symt:
                s=s.split()
                
                if('SAUDE' in s):
                    result.iloc[k:k+1,10:11]='Sim'
                
                if(('CEFALEIA' in s) | ('CABECA' in s)| ('CEFALEIAA' in s)| ('CEFAALEIA' in s)):
                    result.iloc[k:k+1,4:5]='Sim'

                if('Febre' in s):
                    result.iloc[:,2:3]='Sim'

                if(('TOSE' in s) | ('TOSSE' in s)| ('TOSSINDO' in s)):
                    result.iloc[k:k+1,3:4]='Sim'
                

                if('CORIZA' in s)| ('CORIZE' in s)| ('CORISA' in s): 
                    result.iloc[k:k+1,7:8]='Sim'
                    
                if(('OLFATO' in s)|('ANOSMIA' in s)|('ANSMIA' in s)|('CHEIRO' in s)):
                    result.iloc[k:k+1,6:7]='Sim'

                if(('PALADAR' in s)|('AGEUSIA' in s)|('AUGESIA' in s)|
                   ('APETITE' in s)|('SABOR' in s)|('INAPETENCIA' in s)):
                    result.iloc[k:k+1,5:6]='Sim'

                if(('RESPIRATORIO' in s)|('RESPIRACAO' in s)|
                     ('RESPIRAR' in s) |('AR' in s)| ('RESPIRATORIA'in s)|
                     ('ASMATICA'in s)|('RESP' in s)|('DISPNEIA' in s)):
                    result.iloc[k:k+1,1:2]='Sim'      
               
        k+=1
        
    return result


test_pcr=update_symptoms_columns(test_pcr)
test_rapid=update_symptoms_columns(test_rapid)


#Deleting column "description of symptoms"
del test_pcr["description of symptoms"]
del test_rapid["description of symptoms"]



# Drop rows with all the symptoms marked as no(Não)!

In [50]:
def drop_rows_without_no_sym(dt_covid):
    dt_drop=dt_covid[(dt_covid["Symptom- Throat Pain"]=="Não") & (dt_covid["Symptom- Dyspnea"]=="Não")
        & (dt_covid["Symptom- Fever"]=="Não") & (dt_covid["Symptom- Cough"]=="Não") 
        & (dt_covid["Symptom- Headache"]=="Não") & (dt_covid["Symptom- Taste Disorders"]=="Não")
        & (dt_covid["Symptom- Olfactory Disorders"]=="Não") & (dt_covid["Symptom- Coryza"]=="Não")]
    dt_covid=dt_covid.drop(dt_drop.index)
    return dt_covid

test_pcr=drop_rows_without_no_sym(test_pcr)
test_rapid=drop_rows_without_no_sym(test_rapid) 
 

both_test_len2=len(test_pcr)+len(test_rapid)
print("pcr test len:",len(test_pcr),'----',"rapid test len:",len(test_rapid))

print("removed rows of Asymptomatic patients:",both_test_len-both_test_len2)

pcr test len: 4430 ---- rapid test len: 26026
removed rows of Asymptomatic patients: 9826


# Pre-processing dates!

In [51]:
def split_dates(date):
    lista_date=[]
   
    for k in date:
    
        lista_date.append(k.split('/'))
    return lista_date

def create_colunm_days_after_onset(lista_date_teste,lista_date_sin,dt_covid):
    dt_covid['days after onset']=None
    lista_days=[]
    
    for l in range(len(lista_date_teste)):
        year1=int(lista_date_teste[l][2])
        year2=int(lista_date_sin[l][2])
        
        month1=int(lista_date_teste[l][1])
        month2=int(lista_date_sin[l][1])
        
        day1=int(lista_date_teste[l][0])
        day2=int(lista_date_sin[l][0])
        if(month1>12):
            old_day1=day1
            day1=month1
            month1=old_day1
            
        if(month2>12):
            old_day2=day2
            day2=month2
            month2=old_day2
            
        #convert dates to days
        d0 = datetime.date(year1,month1,day1)
        d1 = datetime.date(year2,month2,day2)
        delta = d0 - d1
        lista_days.append(delta.days)
    #set days after onset values
    dt_covid['days after onset']=lista_days
    Standardisation = preprocessing.StandardScaler()
    #Drop dates columns
    del dt_covid["Symptom onset date"]
    del dt_covid["PCR / Rapid test date"]
    return dt_covid

# Pre-processing Gender!
we eliminated rows with Gender marked as undefined(Indefinido)!

In [52]:
def pre_processing_gender(dt_covid):
    dt_covid=dt_covid[dt_covid['Gender']!='Indefinido']
    return dt_covid
test_pcr=pre_processing_gender(test_pcr)
test_rapid=pre_processing_gender(test_rapid)

both_test_len3=len(test_pcr)+len(test_rapid)
print("pcr test len:",len(test_pcr),'----',"rapid test len:",len(test_rapid))
print("removed rows of undefined gender :",both_test_len2-both_test_len3)

pcr test len: 4430 ---- rapid test len: 25999
removed rows of undefined gender : 27


In [53]:

def call_function(data):
    data=data[(data["PCR / Rapid test date"]!='n') & (data["Symptom onset date"]!='n')].copy()
    lista_date_sin=split_dates(data["Symptom onset date"])
    lista_date_teste=split_dates(data["PCR / Rapid test date"])
    data=create_colunm_days_after_onset(lista_date_teste,lista_date_sin,data)
    
    return data
"""
test_antibody=call_function(test_antibody)

#get just antibody test with days after onset between, 8- days
test_antibody=test_antibody[(test_antibody['days after onset']>=8)]
del test_antibody['days after onset']
test_antigen=call_function(test_antigen)

#get just antigen test with days after onset between, 2-7 days
test_antigen=test_antigen[(test_antigen['days after onset']>=2) & (test_antigen['days after onset']<=7)]
del test_antigen['days after onset']


test_pcr=call_function(test_pcr)

#get just pcr test with days after onset between, 0- days
test_pcr=test_pcr[(test_pcr['days after onset']>=0)]
del test_pcr['days after onset']

both_test_len4=len(test_pcr)+len(test_antibody)+len(test_antigen)
print("pcr test len:",len(test_pcr),'----',"antibody test len:",len(test_antibody),'----',
      "antigen test len:",len(test_antigen))
print("removed lines:",both_test_len3-both_test_len4)
"""


'\ntest_antibody=call_function(test_antibody)\n\n#get just antibody test with days after onset between, 8- days\ntest_antibody=test_antibody[(test_antibody[\'days after onset\']>=8)]\ndel test_antibody[\'days after onset\']\ntest_antigen=call_function(test_antigen)\n\n#get just antigen test with days after onset between, 2-7 days\ntest_antigen=test_antigen[(test_antigen[\'days after onset\']>=2) & (test_antigen[\'days after onset\']<=7)]\ndel test_antigen[\'days after onset\']\n\n\ntest_pcr=call_function(test_pcr)\n\n#get just pcr test with days after onset between, 0- days\ntest_pcr=test_pcr[(test_pcr[\'days after onset\']>=0)]\ndel test_pcr[\'days after onset\']\n\nboth_test_len4=len(test_pcr)+len(test_antibody)+len(test_antigen)\nprint("pcr test len:",len(test_pcr),\'----\',"antibody test len:",len(test_antibody),\'----\',\n      "antigen test len:",len(test_antigen))\nprint("removed lines:",both_test_len3-both_test_len4)\n'

# Resolving inconsistency "Result (PCR / Rapids)"column,since,there are patients with equals fields values and class("Result (PCR / Rapids)" marked as positive and negative!

This inconsistency occurs because there are patients with only demographic information that differentiate them,but, we are not using !

In [54]:
def part_dataset(dt_covid,sub):
    
    #part dataset with only class("Result (PCR / Rapids)") marked as negative
    neg=dt_covid[dt_covid["Result (PCR / Rapids)"]=='Negativo']
    
    #part dataset with only class("Result (PCR / Rapids)") marked as positive
    pos=dt_covid[dt_covid["Result (PCR / Rapids)"]=='Positivo']
    del neg["Result (PCR / Rapids)"]
    del pos["Result (PCR / Rapids)"]
    
    #part dataset with only unique values fom dataset pos
    pos_uniq=pos.drop_duplicates(subset=sub)
    
    neg_array=neg.iloc[:,:].values
    pos_uniq_array=pos_uniq.iloc[:,:].values
    pos_dupli_array=pos.iloc[:,:].values
    
    return neg,pos,neg_array, pos_uniq_array,pos_dupli_array

sub1=["Symptom- Throat Pain", "Symptom- Dyspnea","Symptom- Fever",
        "Symptom- Cough","Symptom- Headache","Symptom- Taste Disorders","Symptom- Olfactory Disorders",
        "Symptom- Coryza","Gender","Are you a health professional?"]

rapid_test=test_rapid.copy()
neg_pcr,pos_pcr,neg_array_pcr,pos_uniq_array_pcr,pos_dupli_array_pcr=part_dataset(test_pcr,sub1)
neg_ant,pos_ant,neg_array_ant,pos_uniq_array_ant,pos_dupli_array_ant=part_dataset(rapid_test,sub1)  



In [None]:
def resolve_class_conflict(d_ori,pos_uniq_array,l_comp):
    lpp=dict()
    dict_count=dict()
    #run all the unique values from positive dataset
    for k in range(len(pos_uniq_array)):
        p=0
        lista_index=[]
        #run all the values to compare if is equal to the unique values from positive dataset
        for i in range(len(l_comp)):

            comparison=pos_uniq_array[k:k+1,:]==l_comp[i:i+1,:]
             #if the values are equal
            if(comparison.all()):
                #count how many equal values exist!
                p+=1
                #add l_comp value index in lista_index!
                lista_index.append(d_ori.iloc[i:i+1,:].index[0])
        #hold the number of same values
        dict_count[k]=p
        #hold all the lists of index
        lpp[k]=lista_index
    return dict_count,lpp
#pcr
dict_count_pcr,lpp_pcr=resolve_class_conflict(pos_pcr,pos_uniq_array_pcr,pos_dupli_array_pcr)
dict_count_neg_pcr,lpn_pcr=resolve_class_conflict(neg_pcr,pos_uniq_array_pcr, neg_array_pcr)
#antibody
dict_count_ant,lpp_ant=resolve_class_conflict(pos_ant,pos_uniq_array_ant,pos_dupli_array_ant)
dict_count_neg_ant,lpn_ant=resolve_class_conflict(neg_ant,pos_uniq_array_ant, neg_array_ant)


In [None]:
def get_conflict_values_keys(dict_count_neg,dict_count):
    dnp=pd.DataFrame(index=dict_count_neg.keys())
    dnp['neg']=dict_count_neg.values()
    dnp['pos']=dict_count.values()
    dnp=dnp[(dnp['neg']>0) & (dnp['pos']>0)]
    #get just the keys which the total rows number of neg values is less than pos values
    neg_smaller=dnp[dnp['neg'] < dnp['pos']]
    #get just the keys which the total rows number of pos values is less than neg values
    pos_smaller=dnp[dnp['neg'] > dnp['pos']]
    #get just the keys which the total rows number of pos values is equal to neg values
    #pos_neg_equal=dnp[dnp['neg'] == dnp['pos']]
    return neg_smaller, pos_smaller
#pcr
neg_smaller_pcr,pos_smaller_pcr=get_conflict_values_keys(dict_count_neg_pcr,dict_count_pcr)

#antibody
neg_smaller_ant,pos_smaller_ant=get_conflict_values_keys(dict_count_neg_ant,dict_count_ant)



In [None]:
def get_index(data,l_data,l_index):
    
    for x in data.index:
        for k in l_data[x]:
            l_index.append(k)
    
       
    return l_index
#pcr
l_index_pcr=[]
l_index_pcr=get_index(neg_smaller_pcr,lpn_pcr,l_index_pcr)
l_index_pcr=get_index(pos_smaller_pcr,lpp_pcr,l_index_pcr)
#l_index_pcr=get_index(pos_neg_equal_pcr,lpn_pcr,l_index_pcr)
#l_index_pcr=get_index(pos_neg_equal_pcr,lpp_pcr,l_index_pcr)
#antibody
l_index_ant=[]
l_index_ant=get_index(neg_smaller_ant,lpn_ant,l_index_ant)
l_index_ant=get_index(pos_smaller_ant,lpp_ant,l_index_ant)
#l_index_ant=get_index(pos_neg_equal_ant,lpn_ant,l_index_ant)
#l_index_ant=get_index(pos_neg_equal_ant,lpp_ant,l_index_ant)


Drop inconsistency values!

In [None]:
test_pcr=test_pcr.drop(l_index_pcr)
rapid_test=rapid_test.drop(l_index_ant)

both_test_len5=len(test_pcr)+len(rapid_test)
print("pcr test len:",len(test_pcr),'----',"rapid test len:",len(rapid_test))
print("removed rows of inconsistency values:",both_test_len3-both_test_len5)


Scaling feature days after onset!

In [None]:
def scale_days(data):
    Standardisation = preprocessing.StandardScaler() 
  
    # Scaled feature 
    data['days after onset']= Standardisation.fit_transform(data.iloc[:, 11:12].values)
    return data
#test_pcr=scale_days(test_pcr)
#test_antibody=scale_days(test_antibody)

In [None]:
def convert_to_binary(dt_covid):
    dt_covid=dt_covid.replace(to_replace =["Sim"],value =0) 
    dt_covid=dt_covid.replace(to_replace =["Não"],value =1)
    dt_covid=dt_covid.replace(to_replace =["Positivo"],value=0) 
    dt_covid=dt_covid.replace(to_replace =["Negativo"],value=1)
    dt_covid=dt_covid.replace(to_replace =["Feminino"],value=0)
    dt_covid=dt_covid.replace(to_replace =["Masculino"],value=1)
    #drop none values
    dt_covid=dt_covid.replace(to_replace =["n"],value =None)
    dt_covid=dt_covid.dropna()
    return dt_covid
test_pcr=convert_to_binary(test_pcr)
rapid_test=convert_to_binary(rapid_test)
test_pcr['Gender']=test_pcr['Gender'].astype(int)
rapid_test['Gender']=rapid_test['Gender'].astype(int)

both_test_len6=len(test_pcr)+len(rapid_test)
print("pcr test len:",len(test_pcr),'----',"rapid test len:",len(rapid_test))
print("removed lines:",both_test_len5-both_test_len6)

# Unbalanced dataset

In [None]:
pcr_unb=test_pcr.copy()
rapid_unb=rapid_test.copy()
both_data_unb=pd.concat([pcr_unb,rapid_unb]).reset_index()
del both_data_unb['index']
both_data_unb.info()

In [None]:
def vizualize_class(x,y):
    sns.countplot(x=y)
#Unbalanced  dataset
vizualize_class(both_data_unb.iloc[:,:10],both_data_unb['Result (PCR / Rapids)'])

In [None]:
print('positive test',len(rapid_unb[rapid_unb['Result (PCR / Rapids)']==0]))
print('negative test',len(rapid_unb[rapid_unb['Result (PCR / Rapids)']==1]))

In [None]:
"""
teste pcr:
positive test 916
negative test 1863

teste rápido:
positive test 648
negative test 16594
"""

In [None]:
pcr_unb_state=data.loc[pcr_unb.index.values]
pcr_unb_state['Notification state'].value_counts()

In [None]:
rapid_unb_state=data.loc[rapid_unb.index.values]
rapid_unb_state['Notification state'].value_counts()

# Matching the classes!

In [None]:
def match_classes(dt_covid):
    y=dt_covid["Result (PCR / Rapids)"]

    x=dt_covid.iloc[:,:10]
    x['index']=dt_covid.index
    nr = NearMiss()
    x,y=nr.fit_sample(x,y)
    return x,y
x_pcr,y_pcr=match_classes(test_pcr)
x_rapid,y_rapid=match_classes(rapid_test)
x_rapid['Class']=y_rapid
data_rapid=x_rapid.copy()
x_pcr['Class']=y_pcr
data_pcr=x_pcr.copy()


In [None]:
pcr_state=data.loc[data_pcr['index'].values]
pcr_state['Notification state'].value_counts()

In [None]:
rapid_state=data.loc[data_rapid['index'].values]
rapid_state['Notification state'].value_counts()

In [None]:
#antibody
vizualize_class(x_rapid,y_rapid)

Createing both dataset !

In [None]:
both_data=pd.concat([data_pcr,data_rapid]).reset_index()
del both_data['index']
del data_pcr['index']
del data_rapid['index']

#both_data_gender=pd.concat([data_fe,data_ma]).reset_index()
#del both_data_gender['index']

Dataset informations!

In [None]:
both_data.info()

In [None]:
data_rapid.info()

In [None]:
data_pcr.info()

In [None]:
#export dataset
data_rapid.to_csv('rapid_balanced.csv',index=False)
data_pcr.to_csv('pcr_balanced.csv',index=False)
del both_data['level_0']
both_data.to_csv('both_test_balanced.csv',index=False)

rapid_unb['Class']=rapid_unb['Result (PCR / Rapids)'].values
del rapid_unb['Result (PCR / Rapids)']

pcr_unb['Class']=pcr_unb['Result (PCR / Rapids)'].values
del pcr_unb['Result (PCR / Rapids)']

both_data_unb['Class']=both_data_unb['Result (PCR / Rapids)'].values
del both_data_unb['Result (PCR / Rapids)']

rapid_unb.to_csv('rapid_unbalanced.csv',index=False)
pcr_unb.to_csv('pcr_unbalanced.csv',index=False)
both_data_unb.to_csv('both_test_unbalanced.csv',index=False)
