# <font color='Crimson'><b>CORRELATIONS</b></font>

#### <font color='indianred'><b>ON TRAINING SET</b></font>

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency
from scipy.stats import kruskal
import seaborn as sns

In [None]:
# Load the training features; the first CSV column is used as the DataFrame index.
X_training = pd.read_csv('03_X_training.csv', index_col=0)  # ',' is read_csv's default separator
# Preview the first rows (rendered as the notebook cell output).
X_training.head()

In [None]:
# Load the training outcome; the first CSV column is used as the DataFrame index.
y_training = pd.read_csv('03_y_training.csv', index_col=0)  # ',' is read_csv's default separator
# Preview the first rows (rendered as the notebook cell output).
y_training.head()

In [None]:
# Build the training set: keep only the useful features of X_training
# (the columns at positions 0, 3, 4, 5 and 8 are discarded) ...
training = X_training.drop(columns=X_training.columns[[0, 3, 4, 5, 8]])
# ... and append the outcome as the 'CANDIDEMIA' column (pandas aligns it on the index).
training['CANDIDEMIA'] = y_training
# Preview the merged training set.
training.head()

In [None]:
# Correlations between the (categorical) outcome and the other features.
# Work on an independent copy so that later dtype changes do not touch `training`.
training_copy = training.copy(deep=True)

In [None]:
# Outcome vs. categorical features:
# cast gender, previous colonization and the outcome to pandas' 'category' dtype in one call.
training_copy = training_copy.astype({
    'SESSO': 'category',
    '30gg': 'category',
    'CANDIDEMIA': 'category',
})

In [None]:
# Contingency table: gender (rows) against the outcome (columns).
pd.crosstab(index=training_copy['SESSO'], columns=training_copy['CANDIDEMIA'])

In [None]:
# Chi-square test of independence between gender and the outcome.
gender_vs_outcome = pd.crosstab(index=training_copy['SESSO'], columns=training_copy['CANDIDEMIA'])
gender_vs_outcome_test = chi2_contingency(gender_vs_outcome)
print('Candidemia and gender: p = ', gender_vs_outcome_test.pvalue)

In [None]:
# Contingency table: previous colonization ('30gg', rows) against the outcome (columns).
pd.crosstab(index=training_copy['30gg'], columns=training_copy['CANDIDEMIA'])

In [None]:
# Chi-square test of independence between previous colonization and the outcome.
colonization_vs_outcome = pd.crosstab(index=training_copy['30gg'], columns=training_copy['CANDIDEMIA'])
colonization_vs_outcome_test = chi2_contingency(colonization_vs_outcome)
print('Candidemia and previous colonization: p = ', colonization_vs_outcome_test.pvalue)

In [None]:
# Outcome vs. continuous features:
# split the training set into the two outcome groups (CANDIDEMIA == 1.0 and == 0.0).
candidemia = training.loc[training['CANDIDEMIA'] == 1.0]
bacteraemia = training.loc[training['CANDIDEMIA'] == 0.0]

In [None]:
# Kruskal-Wallis test of every continuous feature between the two outcome groups
# (`candidemia` vs. `bacteraemia`).
#
# Each pair is (label printed in the report, column holding the feature).
# The labels are kept verbatim so that the printed output is unchanged.
continuous_features = (
    ('age', 'ETA'),
    ('basophils', 'BASOFILI'),
    ('eosinophils', 'EOSINOFILI'),
    ('linfocites', 'LINFOCITI'),
    ('monocytes', 'MONOCITI'),
    ('neutrophils', 'NEUTROFILI'),
    ('hematocrit', 'EMATOCRITO'),
    ('hemoglobin', 'EMOGLOBINA'),
    ('white blood cells', 'GLOBULI_B'),
    ('red globides', 'GLOBULI_R'),
    ('platelets', 'PIASTRINE'),
    ('aPTT', 'APTT'),
    ('INR', 'INR'),
    ('prothrombin', 'TEMPO_PROTROMB'),
    ('uric acid', 'ACIDO_URICO'),
    ('ALP', 'ALP'),
    ('ALT', 'ALT'),
    ('AST', 'AST'),
    ('total bilirubin', 'BILIRUBINA_TOT'),
    ('creatinine', 'CREATININA'),
    ('GGT', 'GGT'),
    ('LAD', 'LAD'),
    ('urea', 'UREA'),
    ('glucose', 'GLUCOSIO'),
    ('albumin', 'ALBUMINA'),
    ('PCR', 'PCR'),
    ('total proteins', 'PROTEINE_TOT'),
)
for feature_label, feature_column in continuous_features:
    # nan_policy='omit' discards missing values before the test. It is applied to
    # every feature, age included: with the default policy ('propagate') a single
    # missing value would turn the p-value into nan.
    feature_p_value = kruskal(
        candidemia[feature_column],
        bacteraemia[feature_column],
        nan_policy='omit',
    ).pvalue
    print(f'Candidemia and {feature_label}: p = ', feature_p_value)

In [None]:
# Correlations between categorical features.
# Contingency table: gender (rows) against previous colonization (columns).
pd.crosstab(index=training_copy['SESSO'], columns=training_copy['30gg'])

In [None]:
# Chi-square test of independence between gender and previous colonization.
gender_vs_colonization = pd.crosstab(index=training_copy['SESSO'], columns=training_copy['30gg'])
gender_vs_colonization_test = chi2_contingency(gender_vs_colonization)
print('Gender and previous colonization: p = ', gender_vs_colonization_test.pvalue)

In [None]:
# Correlations between continuous features.
# Discard the columns at positions 0, 2, 3 and 30 of `training`
# (presumably the non-continuous ones -- verify against the column order).
training_continuous = training.drop(columns=training.columns[[0, 2, 3, 30]])
# Pairwise correlation matrix (DataFrame.corr default method), rounded to two decimals.
rounded_corr_matrix = training_continuous.corr().round(decimals=2)
# Annotated heatmap of the rounded matrix.
heatmap = sns.heatmap(data=rounded_corr_matrix, annot=True)
heatmap.set_title(label='Correlation Heatmap', fontdict={'fontsize': 12}, pad=12)

#### <font color='indianred'><b>WITH BDG AND PCT</b></font>

In [None]:
# Load the complete dataset; the first CSV column is used as the DataFrame index.
df_complete1 = pd.read_csv('02_1_Dataset.csv', index_col=0)  # ',' is read_csv's default separator
# Preview the first rows (rendered as the notebook cell output).
df_complete1.head()

In [None]:
# Add the corresponding BDG and PCT values to the training set.
# They are taken from the columns at positions 40 and 42 of the complete dataset
# (expected to be B_D_GLUCANO and PROCALCITONINA -- verify against the column order).
df_complete1_BDG_PCT = df_complete1.iloc[:, [40, 42]]
# Right join on the index: every row of `training` is kept, with BDG/PCT prepended.
df_complete1_training = pd.merge(
    df_complete1_BDG_PCT,
    training,
    how='right',
    left_index=True,
    right_index=True,
)
df_complete1_training.head()

In [None]:
# BDG / PCT vs. continuous features.
df_complete1_training_copy = df_complete1_training.copy()
# Discard the columns at positions 2, 4, 5 and 32
# (presumably the non-continuous ones -- verify against the column order).
df_complete1_training_continuous = df_complete1_training_copy.drop(
    columns=df_complete1_training_copy.columns[[2, 4, 5, 32]]
)
# Correlation matrix, rounded to two decimals.
rounded_corr_matrix_bis = df_complete1_training_continuous.corr().round(decimals=2)
# For each marker, list its correlation with every column, strongest positive first.
for marker_label, marker_column in (('BGD', 'B_D_GLUCANO'), ('PCT', 'PROCALCITONINA')):
    marker_correlations = rounded_corr_matrix_bis[marker_column].sort_values(ascending=False)
    print(f'Continuous features and {marker_label}: rho = \n', marker_correlations)

In [None]:
# BDG / PCT vs. categorical features.
# Split the rows by each binary feature (value 1.0 vs. value 0.0).
males = df_complete1_training_copy.loc[df_complete1_training_copy['SESSO'] == 1.0]
females = df_complete1_training_copy.loc[df_complete1_training_copy['SESSO'] == 0.0]
previous_candida = df_complete1_training_copy.loc[df_complete1_training_copy['30gg'] == 1.0]
non_previous_candida = df_complete1_training_copy.loc[df_complete1_training_copy['30gg'] == 0.0]
candidemia = df_complete1_training_copy.loc[df_complete1_training_copy['CANDIDEMIA'] == 1.0]
bacteraemia = df_complete1_training_copy.loc[df_complete1_training_copy['CANDIDEMIA'] == 0.0]

# Kruskal-Wallis test of each marker between the two groups of every split;
# missing marker values are omitted.
group_comparisons = (
    ('gender', males, females),
    ('previous colonization', previous_candida, non_previous_candida),
    ('candidemia', candidemia, bacteraemia),
)
marker_columns = (('BGD', 'B_D_GLUCANO'), ('PCT', 'PROCALCITONINA'))
for grouping_label, first_group, second_group in group_comparisons:
    for marker_label, marker_column in marker_columns:
        marker_p_value = kruskal(
            first_group[marker_column],
            second_group[marker_column],
            nan_policy='omit',
        ).pvalue
        print(f'{marker_label} and {grouping_label}: p = ', marker_p_value)

In [None]:
# No dataset needs to be exported for the following analyses.