In [1]:
# Importing Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


table_full = pd.read_csv('table_full.csv')


In [2]:
def draw_graphs(col, y_max, graph_type, figSize):
    
    table_full[col].plot(kind=graph_type, figsize=figSize,title=col, grid=True)
    
    if graph_type == 'density' or graph_type == 'kde':
        plt.vlines(table_full[col].mean(),
                      ymin=0.0, ymax=y_max,
                      colors='red')

        plt.vlines(table_full[col].median(),
                     ymin=0.0, ymax=y_max,
                      colors='green')
            
    plt.savefig('figures/' + col + '.pdf')
    plt.close()

    
def draw_graphs_pairs(col1, col2, graph_type, figSize):
    table_full[[col1, col2]].plot(kind=graph_type, title=col1 + ' and '+ col2, figsize=figSize, grid=True)
    
    plt.savefig('figures/' + col1 + '_' + col2 + '.pdf')
    plt.close()
    




draw_graphs('total_cases', 4.3e-7, 'density', (3,3))
draw_graphs('total_deaths', 2.3e-5, 'density', (3,3))
draw_graphs('pop2020', 4.6e-6, 'density', (3,3))

cols = ['median_age', 'total_cases', 'total_deaths', 'current_health_expenditure_average', 'life_expectancy', 'extreme_poverty', 'pop2020']
cols_pairs = ['male_smokers', 'female_smokers']

for i in cols:
    draw_graphs(i, 1, 'box', (3,3))

In [3]:
from scipy.stats import pearsonr
from scipy.stats import mannwhitneyu
from sklearn.preprocessing import RobustScaler

# Creting Tests which are Pearson test for correlation and mannWhitenyU test for looking statistical differences between columns 
# and columns has outliars which we have several of them. Functions returns Dataframe list of all p values in all cols.

def look_pearson(col):
    test_results = pd.DataFrame(columns=['Column', 'Correlation', 'p value'])
    
    for i in list(table_full.columns.values):
        if i != 'countries' and i != 'region' and i != col:
            res = pearsonr(table_full[col], table_full[i])
            test_results = test_results.append({'Column': i, 'Correlation': res[0], 'p value': res[1]}, ignore_index=True)
    
    return test_results

def look_mannwhitney(col):
    test_results = pd.DataFrame(columns=['Column', 'Stats', 'p value'])
    robustScaler = RobustScaler()

    for i in list(table_full.columns.values):
        if i != 'countries' and i != 'region' and i != col:
            # Just in case use robust scaler if varibles has different scale between and has outliars
            df = table_full[[col, i]]
            data_df = robustScaler.fit_transform(df)
            df = pd.DataFrame(data_df, columns=[col, i])

            res = mannwhitneyu(df[col], df[i])
            test_results = test_results.append({'Column': i, 'Stats': res[0], 'p value': res[1]}, ignore_index=True)
            
    return test_results


In [17]:
df_mannwhitney = look_mannwhitney('total_cases_per_million')
df_pearson = look_pearson('total_cases_per_million')

df_pearson[(df_pearson['Correlation'] > 0.5)]

Unnamed: 0,Column,Correlation,p value
3,total_deaths_per_million,0.782718,1.591573e-22
6,median_age,0.574104,2.299061e-10
7,aged_65_older,0.534326,6.136034e-09
8,aged_70_older,0.543233,3.051567e-09
12,female_smokers,0.601112,1.900498e-11
15,life_expectancy,0.570549,3.139003e-10
16,human_development_index,0.604087,1.423482e-11
23,gdpPerCapita,0.50988,3.774056e-08
29,hf_score,0.533204,6.690819e-09
34,pf_rol,0.504661,5.462135e-08


In [38]:

df_mannwhitney = look_mannwhitney('total_cases')
df_pearson = look_pearson('total_cases')

print(df_mannwhitney[(df_mannwhitney['Column'] == 'current_health_expenditure_average') | (df_mannwhitney['Column'] == 'extreme_poverty')].to_latex())
print('')
print(df_pearson[(df_pearson['Column'] == 'current_health_expenditure_average') | (df_pearson['Column'] == 'extreme_poverty')].to_latex())

df_mannwhitney = look_mannwhitney('life_expectancy')
df_pearson = look_pearson('life_expectancy')

print(df_mannwhitney[(df_mannwhitney['Column'] == 'current_health_expenditure_average')].to_latex())
print('')
print(df_pearson[(df_pearson['Column'] == 'current_health_expenditure_average')].to_latex())

\begin{tabular}{llrr}
\toprule
{} &                              Column &   Stats &   p value \\
\midrule
9  &                     extreme\_poverty &  4435.0 &  0.021070 \\
28 &  current\_health\_expenditure\_average &  4940.5 &  0.197742 \\
\bottomrule
\end{tabular}


\begin{tabular}{llrr}
\toprule
{} &                              Column &  Correlation &   p value \\
\midrule
9  &                     extreme\_poverty &    -0.085089 &  0.392789 \\
28 &  current\_health\_expenditure\_average &     0.280074 &  0.004166 \\
\bottomrule
\end{tabular}

\begin{tabular}{llrr}
\toprule
{} &                              Column &   Stats &   p value \\
\midrule
28 &  current\_health\_expenditure\_average &  3867.5 &  0.000393 \\
\bottomrule
\end{tabular}


\begin{tabular}{llrr}
\toprule
{} &                              Column &  Correlation &       p value \\
\midrule
28 &  current\_health\_expenditure\_average &       0.6336 &  6.830093e-13 \\
\bottomrule
\end{tabular}



In [31]:
table_full = pd.read_csv('table_full.csv')

progRank = table_full['Progression RANK'].values

from sklearn.preprocessing import KBinsDiscretizer
# Creating Y
est = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='uniform')
progreesinRank = progRank.reshape(-1,1)
est.fit(progreesinRank)
progRank = est.transform(progreesinRank).astype(np.int64)

In [38]:
from scipy.stats import ttest_ind

for col in table_full.columns:
    stat, p = ttest_ind(progRank, table_full[col].values)
    print(col, ' : ', p)


Unnamed: 0  :  [1.52114184e-41]
total_cases  :  [0.00144638]
total_deaths  :  [0.00024713]
total_cases_per_million  :  [1.97835589e-20]
total_deaths_per_million  :  [3.56232805e-17]
reproduction_rate  :  [1.18976216e-30]
stringency_index  :  [4.43296321e-114]
median_age  :  [6.28112058e-91]
aged_65_older  :  [1.33137916e-37]
aged_70_older  :  [4.87961692e-33]
extreme_poverty  :  [2.08851749e-07]
cardiovasc_death_rate  :  [1.47892747e-52]
diabetes_prevalence  :  [2.47977313e-46]
female_smokers  :  [9.66241213e-22]
male_smokers  :  [8.47345334e-63]
hospital_beds_per_thousand  :  [2.52472243e-22]
life_expectancy  :  [5.63474911e-182]
human_development_index  :  [4.01080163e-12]
Score A  :  [2.55054162e-54]
Sco Exa  :  [9.14029479e-12]
Score 2020  :  [5.65550701e-55]
Progression RANK  :  [0.38171788]
imfGDP  :  [0.00224232]
unGDP  :  [0.00202197]
gdpPerCapita  :  [9.17261146e-15]
pop2020  :  [0.00127071]
area  :  [5.39927253e-05]
Density  :  [0.00303978]
GrowthRate  :  [5.62548393e-29]
cur

TypeError: unsupported operand type(s) for /: 'str' and 'int'