In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
from src.eda.load_gold_data import load_gold_happiness_data
from src.eda.explore_gold_data import EDAExplorer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset via the project loader, then render its first five rows
# as a quick sanity check of the columns and values.
gold_df = load_gold_happiness_data()
display(gold_df.head(n=5))

Unnamed: 0,country_name,freedom_to_make_life_choices,generosity,healthy_life_expectancy,ladder_score,logged_gdp_per_capita,perceptions_of_corruption,regional_indicator,social_support,year,country,latitude,longitude
0,Afghanistan,0.718,0.168,50.8,3.724,7.37,0.882,South Asia,0.451,2008,AF,33.93911,67.709953
1,Afghanistan,0.679,0.19,51.2,4.402,7.54,0.85,South Asia,0.552,2009,AF,33.93911,67.709953
2,Afghanistan,0.6,0.121,51.6,4.758,7.647,0.707,South Asia,0.539,2010,AF,33.93911,67.709953
3,Afghanistan,0.496,0.162,51.92,3.832,7.62,0.731,South Asia,0.521,2011,AF,33.93911,67.709953
4,Afghanistan,0.531,0.236,52.24,3.783,7.705,0.776,South Asia,0.521,2012,AF,33.93911,67.709953


In [4]:
def inspect_df(df: pd.DataFrame = gold_df) -> None:
    """Report memory usage before and after downcasting, then show summary stats.

    The frame is modified IN PLACE: object columns become ``category``,
    ``int64`` columns become ``int32`` and ``float64`` columns become
    ``float32``. Later cells therefore see the downcast dtypes (and the
    reduced float precision that comes with ``float32``).

    Args:
        df: Frame to inspect and downcast. The default is bound to the
            ``gold_df`` object that exists when this cell is executed, so
            re-run this cell after re-loading ``gold_df``.

    Returns:
        None. The ``info`` reports are printed and the ``describe`` table
        is rendered with ``display``.
    """
    # DataFrame.info() prints its report and returns None; wrapping it in
    # display() only adds a stray "None" to the cell output.
    df.info(memory_usage="deep")

    # Source dtype -> narrower target dtype, applied in this order.
    downcasts = {
        "object": "category",
        "int64": "int32",
        "float64": "float32",
    }
    for source_dtype, target_dtype in downcasts.items():
        for col in df.select_dtypes(include=source_dtype).columns:
            df[col] = df[col].astype(target_dtype)

    # Second report shows the memory footprint after downcasting.
    df.info(memory_usage="deep")

    display(df.describe())


inspect_df()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2027 entries, 0 to 2026
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country_name                  2027 non-null   object 
 1   freedom_to_make_life_choices  2027 non-null   float64
 2   generosity                    2027 non-null   float64
 3   healthy_life_expectancy       2002 non-null   float64
 4   ladder_score                  2027 non-null   float64
 5   logged_gdp_per_capita         2027 non-null   float64
 6   perceptions_of_corruption     2001 non-null   float64
 7   regional_indicator            2027 non-null   object 
 8   social_support                2027 non-null   float64
 9   year                          2027 non-null   int64  
 10  country                       2020 non-null   object 
 11  latitude                      2027 non-null   float64
 12  longitude                     2027 non-null   float64
dtypes: 

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2027 entries, 0 to 2026
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   country_name                  2027 non-null   category
 1   freedom_to_make_life_choices  2027 non-null   float32 
 2   generosity                    2027 non-null   float32 
 3   healthy_life_expectancy       2002 non-null   float32 
 4   ladder_score                  2027 non-null   float32 
 5   logged_gdp_per_capita         2027 non-null   float32 
 6   perceptions_of_corruption     2001 non-null   float32 
 7   regional_indicator            2027 non-null   category
 8   social_support                2027 non-null   float32 
 9   year                          2027 non-null   int32   
 10  country                       2020 non-null   category
 11  latitude                      2027 non-null   float32 
 12  longitude                     2027 non-null   fl

None

Unnamed: 0,freedom_to_make_life_choices,generosity,healthy_life_expectancy,ladder_score,logged_gdp_per_capita,perceptions_of_corruption,social_support,year,latitude,longitude
count,2027.0,2027.0,2002.0,2027.0,2027.0,2001.0,2027.0,2027.0,2027.0,2027.0
mean,0.748622,-0.002234,63.709759,5.490372,9.392256,0.740497,0.814963,2013.817958,23.149408,20.849066
std,0.139418,0.16165,7.350222,1.10963,1.139696,0.189123,0.116263,4.517369,24.887794,56.921196
min,0.258,-0.335,32.299999,2.375,6.635,0.035,0.291,2005.0,-40.900558,-106.346771
25%,0.656,-0.118,59.259998,4.668,8.484,0.682,0.751,2010.0,7.946527,-1.561593
50%,0.769,-0.029,65.400002,5.409,9.487,0.799,0.837,2014.0,26.3351,21.824312
75%,0.861,0.089,68.830002,6.308,10.371,0.868,0.907,2018.0,42.602634,47.481766
max,0.985,0.698,77.099998,8.019,11.648,0.983,0.987,2021.0,64.963051,174.885971


In [None]:
# Run the exploratory passes over the gold dataset.
exp = EDAExplorer(df=gold_df)
exp.preview()
exp.missing(plot=True)

# Year and coordinates are excluded from the distribution plots.
exp.histograms(exclude=["year", "latitude", "longitude"])
exp.boxplots(exclude=["year", "latitude", "longitude"])
exp.correlations(method="pearson", top_k=15)

# Colour the map by region when a region-like column exists. The lookup must
# go through `gold_df`: there is no notebook-level `df`, so `df.columns`
# raises NameError on a fresh kernel. The loaded frame names this column
# `regional_indicator`; `region` is still checked first for compatibility.
region_col = next(
    (col for col in ("region", "regional_indicator") if col in gold_df.columns),
    None,
)
exp.geo_scatter(hue=region_col)



First 5 rows

  country_name  freedom_to_make_life_choices  generosity  \
0  Afghanistan                         0.718       0.168   
1  Afghanistan                         0.679       0.190   
2  Afghanistan                         0.600       0.121   
3  Afghanistan                         0.496       0.162   
4  Afghanistan                         0.531       0.236   

   healthy_life_expectancy  ladder_score  logged_gdp_per_capita  \
0                50.799999         3.724                  7.370   
1                51.200001         4.402                  7.540   
2                51.599998         4.758                  7.647   
3                51.919998         3.832                  7.620   
4                52.240002         3.783                  7.705   

   perceptions_of_corruption regional_indicator  social_support  year country  \
0                      0.882         South Asia           0.451  2008      AF   
1                      0.850         South Asia           0

AttributeError: 'EDAExplorer' object has no attribute 'missingness'