# Exploratory Data Analysis (EDA)

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import glob
import qgrid

## Loading the Dataset

In [2]:
def load_csv_dataframes(path):
    '''Loads all csv files in a folder and stores them in a list of dataframes.
    Returns list of dataframes.
    [attributes]
    path: path of folder with csvs inside
    '''
    all_files = glob.glob(path + "/*.csv")
    dataframes_list=[]
    file_count=len(all_files)
    for i in range(file_count):
        temp_df = pd.read_csv(all_files[i])
        dataframes_list.append(temp_df)
        
    return dataframes_list

In [3]:
####1. Load relevant csv files. 
### INPUT -- folder_path for file

# Update folder_path to grab all csv files in folder
folder_path = r"Data\\5. Updated with continent codes\nutrition_demographic_covid_continent.csv"

df= pd.read_csv(folder_path)

In [4]:
df

Unnamed: 0,continent,country,covid_confirmed_cases,covid_deaths,covid_fatality_rate,age dependency ratio (% of working-age population),"age dependency ratio, old","age dependency ratio, young","age population, age 0, female, interpolated","age population, age 0, male, interpolated",...,protein_spices,protein_starchy roots,protein_stimulants,protein_sugar & sweeteners,protein_treenuts,protein_vegetal products,protein_vegetable oils,protein_vegetables,protein_miscellaneous,gni_category
0,asia,afghanistan,158084,7356,0.000226,86.954464,4.616073,82.338391,470554.0,498844.0,...,0.1664,0.1941,0.5546,0.0000,0.1387,40.2477,0.0000,1.1370,0.0462,low_gni
1,europe,albania,210224,3217,0.001113,44.806973,17.950766,26.856206,21438.0,23178.0,...,0.0000,0.8867,0.2635,0.0042,0.2677,22.2552,0.0084,3.2456,0.0544,mid_gni
2,africa,algeria,218432,6276,0.000158,52.617579,9.062490,43.555089,459353.0,476141.0,...,0.1776,1.4638,0.4628,0.0000,0.2745,36.1694,0.0269,3.1267,0.1399,mid_gni
3,africa,angola,81593,1770,0.000071,99.855751,4.623604,95.232147,502641.0,510719.0,...,0.0000,5.1941,0.1017,0.0092,0.0092,34.7782,0.0092,0.8133,0.0924,mid_gni
4,north america,antigua and barbuda,4283,119,0.001296,45.650381,10.434645,35.215736,734.0,744.0,...,0.3438,0.4666,0.4113,0.0000,0.0737,16.8161,0.0430,1.6024,0.2947,mid_gni
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,asia,yemen,10126,1984,0.000074,75.595147,4.893864,70.701283,392662.0,409629.0,...,0.0657,0.2066,0.3193,0.0000,0.0188,39.9831,0.0000,0.5448,0.0564,high_gni
158,africa,zambia,254274,3734,0.000230,95.402326,5.683539,89.718788,295178.0,301320.0,...,0.0846,1.2863,0.0762,0.0000,0.0000,40.1117,0.0000,0.8039,0.0592,low_gni
159,africa,zimbabwe,213258,5004,0.000321,80.391033,5.347744,75.043289,262428.0,264810.0,...,0.0488,0.5467,0.2636,0.0000,0.1367,38.6508,0.0293,0.5955,0.0586,low_gni
160,oceania,new caledonia,63515,313,0.001147,47.920177,15.042344,32.877833,1990.0,2053.0,...,0.0967,0.7431,0.4591,0.0000,0.1087,20.8240,0.0362,1.7640,1.1841,high_gni


## Structured Based Data Exploration

<b> Displaying the first 5 rows

In [5]:
df.head()

Unnamed: 0,continent,country,covid_confirmed_cases,covid_deaths,covid_fatality_rate,age dependency ratio (% of working-age population),"age dependency ratio, old","age dependency ratio, young","age population, age 0, female, interpolated","age population, age 0, male, interpolated",...,protein_spices,protein_starchy roots,protein_stimulants,protein_sugar & sweeteners,protein_treenuts,protein_vegetal products,protein_vegetable oils,protein_vegetables,protein_miscellaneous,gni_category
0,asia,afghanistan,158084,7356,0.000226,86.954464,4.616073,82.338391,470554.0,498844.0,...,0.1664,0.1941,0.5546,0.0,0.1387,40.2477,0.0,1.137,0.0462,low_gni
1,europe,albania,210224,3217,0.001113,44.806973,17.950766,26.856206,21438.0,23178.0,...,0.0,0.8867,0.2635,0.0042,0.2677,22.2552,0.0084,3.2456,0.0544,mid_gni
2,africa,algeria,218432,6276,0.000158,52.617579,9.06249,43.555089,459353.0,476141.0,...,0.1776,1.4638,0.4628,0.0,0.2745,36.1694,0.0269,3.1267,0.1399,mid_gni
3,africa,angola,81593,1770,7.1e-05,99.855751,4.623604,95.232147,502641.0,510719.0,...,0.0,5.1941,0.1017,0.0092,0.0092,34.7782,0.0092,0.8133,0.0924,mid_gni
4,north america,antigua and barbuda,4283,119,0.001296,45.650381,10.434645,35.215736,734.0,744.0,...,0.3438,0.4666,0.4113,0.0,0.0737,16.8161,0.043,1.6024,0.2947,mid_gni


<b> Displaying the last 5 rows

In [6]:
df.tail()

Unnamed: 0,continent,country,covid_confirmed_cases,covid_deaths,covid_fatality_rate,age dependency ratio (% of working-age population),"age dependency ratio, old","age dependency ratio, young","age population, age 0, female, interpolated","age population, age 0, male, interpolated",...,protein_spices,protein_starchy roots,protein_stimulants,protein_sugar & sweeteners,protein_treenuts,protein_vegetal products,protein_vegetable oils,protein_vegetables,protein_miscellaneous,gni_category
157,asia,yemen,10126,1984,7.4e-05,75.595147,4.893864,70.701283,392662.0,409629.0,...,0.0657,0.2066,0.3193,0.0,0.0188,39.9831,0.0,0.5448,0.0564,high_gni
158,africa,zambia,254274,3734,0.00023,95.402326,5.683539,89.718788,295178.0,301320.0,...,0.0846,1.2863,0.0762,0.0,0.0,40.1117,0.0,0.8039,0.0592,low_gni
159,africa,zimbabwe,213258,5004,0.000321,80.391033,5.347744,75.043289,262428.0,264810.0,...,0.0488,0.5467,0.2636,0.0,0.1367,38.6508,0.0293,0.5955,0.0586,low_gni
160,oceania,new caledonia,63515,313,0.001147,47.920177,15.042344,32.877833,1990.0,2053.0,...,0.0967,0.7431,0.4591,0.0,0.1087,20.824,0.0362,1.764,1.1841,high_gni
161,oceania,french polynesia,732268,649,0.002295,42.229689,10.738501,31.491187,2952.0,3000.0,...,0.0255,0.9036,0.3471,0.0,0.1174,17.0044,0.0153,1.2558,1.1129,high_gni


<b> Displaying the Number of Variables and Observations

In [7]:
#In this Dataframe, there are 162 rows X 256 columns
df.shape

(162, 256)

<b> Display the Variable Names and Data Types

In [8]:
qgrid.show_grid(df.dtypes, grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

<b> Count the number of Non-Missing Values for each variable

In [9]:
qgrid.show_grid(df.count(), grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

<b> Descriptive Statistics

In [10]:
#Statistical information of all numerical features in the dataframe
qgrid.show_grid(df.describe(), grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

<b> Displaying the Complete Summary of the Dataset

In [11]:
qgrid.show_grid(df.info(verbose = True), grid_options={'forceFitColumns': False})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 256 columns):
 #    Column                                                                     Dtype  
---   ------                                                                     -----  
 0    continent                                                                  object 
 1    country                                                                    object 
 2    covid_confirmed_cases                                                      int64  
 3    covid_deaths                                                               int64  
 4    covid_fatality_rate                                                        float64
 5    age dependency ratio (% of working-age population)                         float64
 6    age dependency ratio, old                                                  float64
 7    age dependency ratio, young                                                float64
 8  

TypeError: data_frame must be DataFrame or Series, not <class 'NoneType'>

<b> Null vaues

In [None]:
qgrid.show_grid(df.isnull().sum(), grid_options={'forceFitColumns': False})

# Data Visualizations

## Target Variable

In [None]:
plt.scatter(df.index,df['covid_fatality_rate'])
plt.title("Univariate scatter plot")
plt.xlabel("Observation number")
plt.ylabel("Covid fatality rate")
plt.show()

## Correlation matrix

### Deleting correlated variables (90% treshold)

In [None]:
def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr().abs()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] >= threshold) and (corr_matrix.columns[j] not in col_corr):
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset

    return dataset

In [None]:
corr90 = correlation(df, 0.9)

<b> Dataframe with dropped columns

In [None]:
corr90

In [None]:
#corr90.columns

### Correlation matrix

In [None]:
corr_matrix=df.corr()

qgrid.show_grid(corr_matrix.abs(), grid_options={'forceFitColumns': False})

In [None]:
plt.figure(figsize=(9, 8))
sns.distplot(df['covid_fatality_rate'], color='g', bins=100, hist_kws={'alpha': 0.4});

In [None]:
df_num = df.select_dtypes(include = ['float64', 'int64'])
df_num.head()

In [None]:
df_num.hist(figsize=(40, 35), bins=50, xlabelsize=15, ylabelsize=10);

In [None]:
df_num=df.select_dtypes(include=['float64','int64'])
df_num
col_name=df_num.columns
for i in col_name:
    plt.figure(figsize=(8,5))
    sns.scatterplot(x='covid_deaths',y=i, data=df,x_bins=1000)

In [None]:
corr = df.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(40, 30))
    ax = sns.heatmap(corr, mask=mask,cmap='coolwarm', vmin=-1,vmax=1,annot=True, square=True)

In [None]:
# Create the plot
fig, ax = plt.subplots()

# Plot the histogram with hist() function
ax.hist(df, edgecolor = "black", bins = 5)

# Label axes and set title
ax.set_title("Title")
ax.set_xlabel("X_Label")
ax.set_ylabel("Y_Label")