In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def read_csv(csv_url):
    """Load a CSV source into a DataFrame.

    Parameters
    ----------
    csv_url
        Anything ``pd.read_csv`` accepts as its first argument; it is
        forwarded unchanged.

    Returns
    -------
    pandas.DataFrame
        The table parsed with pandas' default options.
    """
    frame = pd.read_csv(csv_url)
    return frame

def is_null(df):
    """Count missing values per column, largest count first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; values are the number of null entries in
        that column, sorted in descending order.
    """
    missing_per_column = df.isna().sum()
    return missing_per_column.sort_values(ascending=False)

def outlier_thresholds(df, col_name, q1=0.05, q3=0.95):
    """Compute lower/upper outlier fences for one column.

    The fences are placed 1.5 times the inter-quantile range below the
    lower quantile and above the upper quantile.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    col_name : str
        Name of the column to evaluate.
    q1 : float, default 0.05
        Lower quantile. Despite the name, the default is the 5th
        percentile rather than the first quartile.
    q3 : float, default 0.95
        Upper quantile. Despite the name, the default is the 95th
        percentile rather than the third quartile.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)``.
    """
    column = df[col_name]
    lower_quantile = column.quantile(q1)
    upper_quantile = column.quantile(q3)

    # Distance between the two quantiles, widened by the usual 1.5 factor.
    margin = 1.5 * (upper_quantile - lower_quantile)

    return lower_quantile - margin, upper_quantile + margin

def get_outliers(df, col_name):
    """Return the values of ``col_name`` that fall outside the outlier fences.

    The fences come from ``outlier_thresholds`` called with its default
    quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    col_name : str
        Name of the column to scan.

    Returns
    -------
    pandas.Series
        The entries of ``df[col_name]`` strictly below the lower fence or
        strictly above the upper fence, keeping their original index.
    """
    lower_fence, upper_fence = outlier_thresholds(df, col_name)

    values = df[col_name]
    below = values < lower_fence
    above = values > upper_fence

    return df.loc[below | above, col_name]

# Min-max normalization: rescales a column into the range 0-1.
def min_max_norm(df, col):
    """Rescale ``df[col]`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    col : str
        Name of the numeric column to normalize.

    Returns
    -------
    pandas.Series
        ``(x - min) / (max - min)`` for every entry, with the original index.
        If every non-null value is identical (``max == min``), zeros are
        returned instead of the NaNs that a 0/0 division would produce.
        Null entries stay null.
    """
    values = df[col]
    lowest = values.min()
    span = values.max() - lowest

    if span == 0:
        # Constant column: dividing by a zero range would turn every value
        # into NaN, so map all values to the bottom of the range.
        return (values - lowest).astype(float)

    return (values - lowest) / span

def column_name_replace_space(columns):
    """Return the given column names with every space replaced by an underscore.

    Parameters
    ----------
    columns : iterable of str
        Column labels to clean.

    Returns
    -------
    list of str
        One cleaned label per input label, in the same order.
    """
    space, underscore = ' ', '_'
    return [label.replace(space, underscore) for label in columns]


In [4]:
# Load the female employment-to-population ratio dataset from the local data/ folder.
df_female_employment_ratio = read_csv('data/female-employment-to-population-ratio.csv')

In [5]:
# Check dtypes and completeness; in the recorded output only `Code` has
# missing values (3518 non-null of 3737 rows).
df_female_employment_ratio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3737 entries, 0 to 3736
Data columns (total 4 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   Entity                                                               3737 non-null   object 
 1   Code                                                                 3518 non-null   object 
 2   Year                                                                 3737 non-null   int64  
 3   Employment to population ratio, 15+, female (%) (national estimate)  3737 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 116.9+ KB


In [6]:
# Preview the first rows to confirm the column layout.
df_female_employment_ratio.head()

Unnamed: 0,Entity,Code,Year,"Employment to population ratio, 15+, female (%) (national estimate)"
0,Afghanistan,AFG,2008,42.83
1,Afghanistan,AFG,2012,15.48
2,Afghanistan,AFG,2014,23.18
3,Afghanistan,AFG,2017,18.2
4,Afghanistan,AFG,2020,13.73


In [11]:
# Load the fertility vs. female labor-force participation dataset from the local data/ folder.
df_fertility_female_labor_force = read_csv('data/fertility-and-female-labor-force-participation.csv')

In [12]:
# Check dtypes and completeness; in the recorded output the labor-force
# participation column is sparse (4560 non-null of 58984 rows).
df_fertility_female_labor_force.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58984 entries, 0 to 58983
Data columns (total 7 columns):
 #   Column                                                                                        Non-Null Count  Dtype  
---  ------                                                                                        --------------  -----  
 0   Entity                                                                                        58984 non-null  object 
 1   Code                                                                                          55039 non-null  object 
 2   Year                                                                                          58984 non-null  int64  
 3   Labor force participation rate, female (% of female population ages 15+) (national estimate)  4560 non-null   float64
 4   Fertility rate - Sex: all - Age: all - Variant: estimates                                     18360 non-null  float64
 5   Population (historical est

In [13]:
# Preview the first rows to confirm the column layout.
df_fertility_female_labor_force.head()

Unnamed: 0,Entity,Code,Year,"Labor force participation rate, female (% of female population ages 15+) (national estimate)",Fertility rate - Sex: all - Age: all - Variant: estimates,Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1979,6.83,7.6123,12986378.0,
2,Afghanistan,AFG,2008,43.79,6.3762,26427204.0,
3,Afghanistan,AFG,2012,16.01,5.8302,30466484.0,
4,Afghanistan,AFG,2014,25.78,5.5595,32716214.0,


In [14]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows Year reaching -10000, so the file
# contains long-run historical rows - presumably carried by the population
# estimates; confirm before filtering or plotting by year.
df_fertility_female_labor_force.describe()

Unnamed: 0,Year,"Labor force participation rate, female (% of female population ages 15+) (national estimate)",Fertility rate - Sex: all - Age: all - Variant: estimates,Population (historical estimates)
count,58984.0,4560.0,18360.0,58252.0
mean,1606.090601,46.813639,4.015418,49010820.0
std,1420.387218,15.365593,2.00074,292554900.0
min,-10000.0,1.93,0.7455,0.0
25%,1833.0,38.1075,2.17435,146084.0
50%,1902.0,48.86,3.61925,1388504.0
75%,1967.0,56.4,5.925125,6600998.0
max,2021.0,94.4,8.8637,7909295000.0


In [15]:
# Load the male vs. female share of employment in industry dataset from the local data/ folder.
df_share_employment = read_csv('data/share-of-male-vs-female-employment-in-industry.csv')

In [16]:
# Check dtypes and completeness; in the recorded output both
# industry-employment columns have 5829 non-null values of 58560 rows.
df_share_employment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58560 entries, 0 to 58559
Data columns (total 7 columns):
 #   Column                                                                          Non-Null Count  Dtype  
---  ------                                                                          --------------  -----  
 0   Entity                                                                          58560 non-null  object 
 1   Code                                                                            55047 non-null  object 
 2   Year                                                                            58560 non-null  int64  
 3   Employment in industry, male (% of male employment) (modeled ILO estimate)      5829 non-null   float64
 4   Employment in industry, female (% of female employment) (modeled ILO estimate)  5829 non-null   float64
 5   Population (historical estimates)                                               58252 non-null  float64
 6   Continent      

In [17]:
# Load the "women can take the same jobs as men" dataset.
# A forward slash is used so the path works on every OS and does not contain
# the invalid "\w" escape sequence that a backslash separator would create.
df_jobs = read_csv('data/women-can-take-the-same-jobs-as-men.csv')

In [18]:
# Check dtypes and completeness; in the recorded output no column has
# missing values (10017 non-null in each).
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10017 entries, 0 to 10016
Data columns (total 4 columns):
 #   Column                                                        Non-Null Count  Dtype 
---  ------                                                        --------------  ----- 
 0   Entity                                                        10017 non-null  object
 1   Code                                                          10017 non-null  object
 2   Year                                                          10017 non-null  int64 
 3   A woman can get a job in the same way as a man (1=yes; 0=no)  10017 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 313.2+ KB


In [19]:
# Preview the first rows to confirm the column layout.
df_jobs.head()

Unnamed: 0,Entity,Code,Year,A woman can get a job in the same way as a man (1=yes; 0=no)
0,Afghanistan,AFG,1970,1
1,Afghanistan,AFG,1971,1
2,Afghanistan,AFG,1972,1
3,Afghanistan,AFG,1973,1
4,Afghanistan,AFG,1974,1
