# Exploratory Data Analysis (EDA)

## Importing Libraries

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import glob
import qgrid

## Loading the Dataset

In [6]:
def load_csv_dataframes(path):
    '''Loads all csv files in a folder and stores them in a list of dataframes.
    Returns list of dataframes.
    [attributes]
    path: path of folder with csvs inside
    '''
    all_files = glob.glob(path + "/*.csv")
    dataframes_list=[]
    file_count=len(all_files)
    for i in range(file_count):
        temp_df = pd.read_csv(all_files[i])
        dataframes_list.append(temp_df)
        
    return dataframes_list

In [18]:
####1. Load relevant csv files. 
### INPUT -- folder_path for file

# Update folder_path to grab all csv files in folder
folder_path = r"Data\3. Full merge no nan\full_merge_no_nans.csv"

df= pd.read_csv(folder_path)

In [19]:
df

Unnamed: 0,country,age dependency ratio (% of working-age population),"age dependency ratio, old","age dependency ratio, young","age population, age 0, female, interpolated","age population, age 0, male, interpolated","age population, age 01, female, interpolated","age population, age 01, male, interpolated","age population, age 02, female, interpolated","age population, age 02, male, interpolated",...,protein_spices,protein_starchy roots,protein_stimulants,protein_sugar & sweeteners,protein_treenuts,protein_vegetal products,protein_vegetable oils,protein_vegetables,protein_miscellaneous,mortality_rate_covid
0,afghanistan,86.954464,4.616073,82.338391,470554,498844,479012,505348,484823,509720,...,0.1664,0.1941,0.5546,0.0000,0.1387,40.2477,0.0000,1.1370,0.0462,4.352069
1,albania,44.806973,17.950766,26.856206,21438,23178,19177,20669,17406,18721,...,0.0000,0.8867,0.2635,0.0042,0.2677,22.2552,0.0084,3.2456,0.0544,1.717095
2,algeria,52.617579,9.062490,43.555089,459353,476141,460749,476982,456245,471814,...,0.1776,1.4638,0.4628,0.0000,0.2745,36.1694,0.0269,3.1267,0.1399,2.677922
3,angola,99.855751,4.623604,95.232147,502641,510719,485224,491924,468369,473922,...,0.0000,5.1941,0.1017,0.0092,0.0092,34.7782,0.0092,0.8133,0.0924,2.367660
4,antigua and barbuda,45.650381,10.434645,35.215736,734,744,727,736,722,730,...,0.3438,0.4666,0.4113,0.0000,0.0737,16.8161,0.0430,1.6024,0.2947,2.430556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,uzbekistan,49.650534,6.970327,42.680207,318305,332916,317447,331971,314294,328606,...,0.0160,1.2509,0.1604,0.0000,0.1657,29.9781,0.0053,3.9985,0.0321,0.785628
143,vanuatu,68.719678,7.084707,61.634972,3215,3455,3330,3566,3400,3634,...,0.1177,5.7347,0.0863,0.0000,0.0157,30.1247,0.0078,1.5847,0.1020,0.000000
144,vietnam,42.511440,9.605993,32.905446,733682,816544,737160,821987,736193,821086,...,0.5221,0.2333,0.5444,0.0056,0.3277,29.5617,0.0000,3.7216,0.0389,1.763224
145,zambia,95.402326,5.683539,89.718788,295178,301320,289170,294786,282804,287939,...,0.0846,1.2863,0.0762,0.0000,0.0000,40.1117,0.0000,0.8039,0.0592,1.365849


## Structured Based Data Exploration

<b> Displaying the first 5 rows

In [30]:
df.head()

Unnamed: 0,country,age dependency ratio (% of working-age population),"age dependency ratio, old","age dependency ratio, young","age population, age 0, female, interpolated","age population, age 0, male, interpolated","age population, age 01, female, interpolated","age population, age 01, male, interpolated","age population, age 02, female, interpolated","age population, age 02, male, interpolated",...,protein_spices,protein_starchy roots,protein_stimulants,protein_sugar & sweeteners,protein_treenuts,protein_vegetal products,protein_vegetable oils,protein_vegetables,protein_miscellaneous,mortality_rate_covid
0,afghanistan,86.954464,4.616073,82.338391,470554,498844,479012,505348,484823,509720,...,0.1664,0.1941,0.5546,0.0,0.1387,40.2477,0.0,1.137,0.0462,4.352069
1,albania,44.806973,17.950766,26.856206,21438,23178,19177,20669,17406,18721,...,0.0,0.8867,0.2635,0.0042,0.2677,22.2552,0.0084,3.2456,0.0544,1.717095
2,algeria,52.617579,9.06249,43.555089,459353,476141,460749,476982,456245,471814,...,0.1776,1.4638,0.4628,0.0,0.2745,36.1694,0.0269,3.1267,0.1399,2.677922
3,angola,99.855751,4.623604,95.232147,502641,510719,485224,491924,468369,473922,...,0.0,5.1941,0.1017,0.0092,0.0092,34.7782,0.0092,0.8133,0.0924,2.36766
4,antigua and barbuda,45.650381,10.434645,35.215736,734,744,727,736,722,730,...,0.3438,0.4666,0.4113,0.0,0.0737,16.8161,0.043,1.6024,0.2947,2.430556


<b> Displaying the last 5 rows

In [21]:
df.tail()

Unnamed: 0,country,age dependency ratio (% of working-age population),"age dependency ratio, old","age dependency ratio, young","age population, age 0, female, interpolated","age population, age 0, male, interpolated","age population, age 01, female, interpolated","age population, age 01, male, interpolated","age population, age 02, female, interpolated","age population, age 02, male, interpolated",...,protein_spices,protein_starchy roots,protein_stimulants,protein_sugar & sweeteners,protein_treenuts,protein_vegetal products,protein_vegetable oils,protein_vegetables,protein_miscellaneous,mortality_rate_covid
142,uzbekistan,49.650534,6.970327,42.680207,318305,332916,317447,331971,314294,328606,...,0.016,1.2509,0.1604,0.0,0.1657,29.9781,0.0053,3.9985,0.0321,0.785628
143,vanuatu,68.719678,7.084707,61.634972,3215,3455,3330,3566,3400,3634,...,0.1177,5.7347,0.0863,0.0,0.0157,30.1247,0.0078,1.5847,0.102,0.0
144,vietnam,42.51144,9.605993,32.905446,733682,816544,737160,821987,736193,821086,...,0.5221,0.2333,0.5444,0.0056,0.3277,29.5617,0.0,3.7216,0.0389,1.763224
145,zambia,95.402326,5.683539,89.718788,295178,301320,289170,294786,282804,287939,...,0.0846,1.2863,0.0762,0.0,0.0,40.1117,0.0,0.8039,0.0592,1.365849
146,zimbabwe,80.391033,5.347744,75.043289,262428,264810,256385,258392,249923,251613,...,0.0488,0.5467,0.2636,0.0,0.1367,38.6508,0.0293,0.5955,0.0586,3.815931


<b> Displaying the Number of Variables and Observations

In [23]:
#In this Dataframe, there are 147 rows X 255 columns
df.shape

(147, 255)

<b> Display the Variable Names and Data Types

In [27]:
qgrid.show_grid(df.dtypes, grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

<b> Count the number of Non-Missing Values for each variable

In [29]:
qgrid.show_grid(df.count(), grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

<b> Descriptive Statistics

In [34]:
#Statistical information of all numerical features in the dataframe
qgrid.show_grid(df.describe(include = 'all'), grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

<b> Displaying the Complete Summary of the Dataset

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Columns: 255 entries, country to mortality_rate_covid
dtypes: float64(150), int64(104), object(1)
memory usage: 293.0+ KB


In [42]:
qgrid.show_grid(df.isnull().sum(), grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

# Data Visualizations

## Target Variable

In [43]:
plt.figure(figsize = (5,5))
sns.countplot(x=df['mortality_rate_covid']), palette = 'Greens', linewidth = 1)
plt.show()

SyntaxError: cannot assign to function call (3990097781.py, line 2)