# Exploratory Data Analysis (EDA)

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import glob
import qgrid

## Loading the Dataset

In [2]:
def load_csv_dataframes(path):
    '''Loads all csv files in a folder and stores them in a list of dataframes.
    Returns list of dataframes.
    [attributes]
    path: path of folder with csvs inside
    '''
    all_files = glob.glob(path + "/*.csv")
    dataframes_list=[]
    file_count=len(all_files)
    for i in range(file_count):
        temp_df = pd.read_csv(all_files[i])
        dataframes_list.append(temp_df)
        
    return dataframes_list

In [3]:
####1. Load relevant csv files. 
### INPUT -- folder_path for file

# Update folder_path to grab all csv files in folder
folder_path = r"C:\Users\Barbara\OneDrive\Documents\GitHub\IH_Berliners\Data\1. Files to merge\combined_data_merged.csv"

df= pd.read_csv(folder_path)#
#df.drop('Unnamed: 0', inplace=True, axis=1)

In [4]:
df

Unnamed: 0,country,adults (ages 15+) and children (0-14 years) living with hiv,adults (ages 15+) and children (ages 0-14) newly infected with hiv,adults (ages 15+) living with hiv,adults (ages 15+) newly infected with hiv,age dependency ratio (% of working-age population),"age dependency ratio, old","age dependency ratio, young","age population, age 0, female, interpolated","age population, age 0, male, interpolated",...,protein_spices,protein_starchy roots,protein_stimulants,protein_sugar & sweeteners,protein_treenuts,protein_vegetal products,protein_vegetable oils,protein_vegetables,protein_miscellaneous,mortality_rate_covid
0,afghanistan,6900.0,1000.0,6700.0,1000.0,86.954464,4.616073,82.338391,470554.0,498844.0,...,0.1664,0.1941,0.5546,0.0000,0.1387,40.2477,0.0000,1.1370,0.0462,4.352069
1,albania,,,,,44.806973,17.950766,26.856206,21438.0,23178.0,...,0.0000,0.8867,0.2635,0.0042,0.2677,22.2552,0.0084,3.2456,0.0544,1.717095
2,algeria,8800.0,500.0,8600.0,500.0,52.617579,9.062490,43.555089,459353.0,476141.0,...,0.1776,1.4638,0.4628,0.0000,0.2745,36.1694,0.0269,3.1267,0.1399,2.677922
3,angola,320000.0,26000.0,290000.0,22000.0,99.855751,4.623604,95.232147,502641.0,510719.0,...,0.0000,5.1941,0.1017,0.0092,0.0092,34.7782,0.0092,0.8133,0.0924,2.367660
4,antigua and barbuda,,,,,45.650381,10.434645,35.215736,734.0,744.0,...,0.3438,0.4666,0.4113,0.0000,0.0737,16.8161,0.0430,1.6024,0.2947,2.430556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,venezuela,110000.0,5600.0,110000.0,5300.0,52.374932,9.549329,42.825603,289792.0,303680.0,...,0.0000,1.1195,0.3287,0.0178,0.0000,27.4545,0.0533,1.0129,0.0267,
159,vietnam,260000.0,14000.0,250000.0,14000.0,42.511440,9.605993,32.905446,733682.0,816544.0,...,0.5221,0.2333,0.5444,0.0056,0.3277,29.5617,0.0000,3.7216,0.0389,
160,yemen,9200.0,1000.0,8800.0,1000.0,75.595147,4.893864,70.701283,392662.0,409629.0,...,0.0657,0.2066,0.3193,0.0000,0.0188,39.9831,0.0000,0.5448,0.0564,
161,zambia,1200000.0,60000.0,1100000.0,55000.0,95.402326,5.683539,89.718788,295178.0,301320.0,...,0.0846,1.2863,0.0762,0.0000,0.0000,40.1117,0.0000,0.8039,0.0592,


## Structured Based <span class="mark">Data Exploration</span>

<b> Displaying the first 5 rows

In [5]:
df.head()

Unnamed: 0,country,adults (ages 15+) and children (0-14 years) living with hiv,adults (ages 15+) and children (ages 0-14) newly infected with hiv,adults (ages 15+) living with hiv,adults (ages 15+) newly infected with hiv,age dependency ratio (% of working-age population),"age dependency ratio, old","age dependency ratio, young","age population, age 0, female, interpolated","age population, age 0, male, interpolated",...,protein_spices,protein_starchy roots,protein_stimulants,protein_sugar & sweeteners,protein_treenuts,protein_vegetal products,protein_vegetable oils,protein_vegetables,protein_miscellaneous,mortality_rate_covid
0,afghanistan,6900.0,1000.0,6700.0,1000.0,86.954464,4.616073,82.338391,470554.0,498844.0,...,0.1664,0.1941,0.5546,0.0,0.1387,40.2477,0.0,1.137,0.0462,4.352069
1,albania,,,,,44.806973,17.950766,26.856206,21438.0,23178.0,...,0.0,0.8867,0.2635,0.0042,0.2677,22.2552,0.0084,3.2456,0.0544,1.717095
2,algeria,8800.0,500.0,8600.0,500.0,52.617579,9.06249,43.555089,459353.0,476141.0,...,0.1776,1.4638,0.4628,0.0,0.2745,36.1694,0.0269,3.1267,0.1399,2.677922
3,angola,320000.0,26000.0,290000.0,22000.0,99.855751,4.623604,95.232147,502641.0,510719.0,...,0.0,5.1941,0.1017,0.0092,0.0092,34.7782,0.0092,0.8133,0.0924,2.36766
4,antigua and barbuda,,,,,45.650381,10.434645,35.215736,734.0,744.0,...,0.3438,0.4666,0.4113,0.0,0.0737,16.8161,0.043,1.6024,0.2947,2.430556


<b> Displaying the last 5 rows

In [6]:
df.tail()

Unnamed: 0,country,adults (ages 15+) and children (0-14 years) living with hiv,adults (ages 15+) and children (ages 0-14) newly infected with hiv,adults (ages 15+) living with hiv,adults (ages 15+) newly infected with hiv,age dependency ratio (% of working-age population),"age dependency ratio, old","age dependency ratio, young","age population, age 0, female, interpolated","age population, age 0, male, interpolated",...,protein_spices,protein_starchy roots,protein_stimulants,protein_sugar & sweeteners,protein_treenuts,protein_vegetal products,protein_vegetable oils,protein_vegetables,protein_miscellaneous,mortality_rate_covid
158,venezuela,110000.0,5600.0,110000.0,5300.0,52.374932,9.549329,42.825603,289792.0,303680.0,...,0.0,1.1195,0.3287,0.0178,0.0,27.4545,0.0533,1.0129,0.0267,
159,vietnam,260000.0,14000.0,250000.0,14000.0,42.51144,9.605993,32.905446,733682.0,816544.0,...,0.5221,0.2333,0.5444,0.0056,0.3277,29.5617,0.0,3.7216,0.0389,
160,yemen,9200.0,1000.0,8800.0,1000.0,75.595147,4.893864,70.701283,392662.0,409629.0,...,0.0657,0.2066,0.3193,0.0,0.0188,39.9831,0.0,0.5448,0.0564,
161,zambia,1200000.0,60000.0,1100000.0,55000.0,95.402326,5.683539,89.718788,295178.0,301320.0,...,0.0846,1.2863,0.0762,0.0,0.0,40.1117,0.0,0.8039,0.0592,
162,zimbabwe,1400000.0,64000.0,1300000.0,59000.0,80.391033,5.347744,75.043289,262428.0,264810.0,...,0.0488,0.5467,0.2636,0.0,0.1367,38.6508,0.0293,0.5955,0.0586,


<b> Displaying the Number of Variables and Observations

In [7]:
#In this Dataframe, there are 162 rows X 256 columns
df.shape

(163, 277)

<b> Display the Variable Names and Data Types

In [8]:
qgrid.show_grid(df.dtypes, grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

<b> Count the number of Non-Missing Values for each variable

In [9]:
qgrid.show_grid(df.count(), grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

<b> Descriptive Statistics

In [10]:
#Statistical information of all numerical features in the dataframe
qgrid.show_grid(df.describe(), grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

<b> Displaying the Complete Summary of the Dataset

In [11]:
qgrid.show_grid(df.info(verbose = True), grid_options={'forceFitColumns': False})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 277 columns):
 #    Column                                                                           Dtype  
---   ------                                                                           -----  
 0    country                                                                          object 
 1    adults (ages 15+) and children (0-14 years) living with hiv                      float64
 2    adults (ages 15+) and children (ages 0-14) newly infected with hiv               float64
 3    adults (ages 15+) living with hiv                                                float64
 4    adults (ages 15+) newly infected with hiv                                        float64
 5    age dependency ratio (% of working-age population)                               float64
 6    age dependency ratio, old                                                        float64
 7    age dependency ratio, young      

TypeError: data_frame must be DataFrame or Series, not <class 'NoneType'>

<b> Null vaues

In [None]:
qgrid.show_grid(df.isnull().sum(), grid_options={'forceFitColumns': False})

# Data Visualizations

## Target Variable

In [None]:
plt.scatter(df.index,df['mortality_rate_covid'])
plt.title("Univariate scatter plot")
plt.xlabel("Observation number")
plt.ylabel("mortality_rate_covid")
plt.show()

## Correlation matrix

### Deleting correlated variables (90% treshold)

In [None]:
def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr().abs()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] >= threshold) and (corr_matrix.columns[j] not in col_corr):
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset

    return dataset

In [None]:
corr90 = correlation(df, 0.9)

<b> Dataframe with dropped columns

In [None]:
corr90

In [None]:
#corr90.columns

### Correlation matrix

In [None]:
corr_matrix=df.corr()

qgrid.show_grid(corr_matrix.abs(), grid_options={'forceFitColumns': False})

In [None]:
plt.figure(figsize=(9, 8))
sns.distplot(df['mortality_rate_covid'], color='g', bins=100, hist_kws={'alpha': 0.4});

In [None]:
df_num = df.select_dtypes(include = ['float64', 'int64'])
df_num.head()

In [None]:
df_num.hist(figsize=(40, 35), bins=50, xlabelsize=15, ylabelsize=10);

In [None]:
numeric=df._get_numeric_data()
col_names=numeric.columns
for i in col_names:
    plt.figure(figsize=(9,5))
    sns.boxplot(x=i,data=df)

<b> Removing outliers

In [None]:
def remove_outlier_IQR(df):
    Q1=df.quantile(0.25)
    Q3=df.quantile(0.75)
    IQR=Q3-Q1
    df_final=df[~((df<(Q1-1.5*IQR)) | (df>(Q3+1.5*IQR)))]
    return df_final

In [None]:
df_without_outliers = remove_outlier_IQR(numeric)
df_without_outliers

<b> Replaicng NANs with mean and multuple columns

In [None]:
column_means = numeric.mean()
df = df_without_outliers.fillna(column_means)

### Boxplots with data cleaned

In [None]:
df=df._get_numeric_data()
col_names=numeric.columns
for i in col_names:
    plt.figure(figsize=(9,5))
    sns.boxplot(x=i,data=df)

### Scatter plots with Cleaned Dataframe

In [None]:
df=df._get_numeric_data()
col_names=numeric.columns
for i in col_names:
    plt.figure(figsize=(9,5))
    sns.scatterplot(data=df, x=i,y=df['covid_deaths'] )

age population, age 0, female, interpolated
inmunization, dpt(%children ages 12-23 months) ???
inmunization, hib3(%children ages 12-23 months) 
inmunization, measles(%children ages 12-23 months) 

In [None]:
####1. Load relevant csv files. 
### INPUT -- folder_path for file
# Update folder_path to grab all csv files in folder
folder_path = r"Data\\6.gni and comparative death rates\nutrition_demographic_covid_continent_3.csv"
df1= pd.read_csv(folder_path)

In [None]:
gni_higher = df1[df1['gni_real']== 'high_gni']
df2=gni_higher._get_numeric_data()
col_names=numeric.columns
for i in col_names:
    plt.figure(figsize=(9,5))
    sns.scatterplot(data=df2, x=i,y=df2['covid_deaths'] )

In [None]:
gni_higher = df1[df1['gni_real']== 'low_gni']
df2=gni_higher._get_numeric_data()
col_names=numeric.columns
for i in col_names:
    plt.figure(figsize=(9,5))
    sns.scatterplot(data=df2, x=i,y=df2['covid_deaths'] )

In [None]:
####1. Load relevant csv files. 
### INPUT -- folder_path for file
# Update folder_path to grab all csv files in folder
folder_path = r"C:\Users\Barbara\OneDrive\Documents\GitHub\IH_Berliners\Data\4. Updated Covid numbers\nutrition_demographic_covid.csv"
df3= pd.read_csv(folder_path)


In [None]:
df4=df3._get_numeric_data()
col_names=df4.columns
for i in col_names:
    plt.figure(figsize=(9,5))
    sns.scatterplot(data=df4, x=i,y=df4['covid_fatality_rate'] )

In [None]:

df4.columns