In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# Load 'national.csv' into a DataFrame
national_df = pd.read_csv('data/national.csv')

# Load 'HDI.csv' into another DataFrame
HDI_df = pd.read_csv('data/HDI.csv')

In [3]:
filtered_HDI_df = HDI_df.iloc[:, :4]
years_to_extract = ['1990', '1995', '2000', '2005', '2010']

# Iterate through columns in HDI_df
for col in HDI_df.columns:
    for year in years_to_extract:
        if col.endswith(f"({year})"):
            # Extract the column as a DataFrame
            new_column = HDI_df[[col]]

            # Concatenate the column to first4col_HDI_df
            filtered_HDI_df = pd.concat([filtered_HDI_df, new_column], axis=1)


In [4]:
filtered_HDI_df

Unnamed: 0,ISO3,Country,Human Development Groups,UNDP Developing Regions,Human Development Index (1990),Human Development Index (1995),Human Development Index (2000),Human Development Index (2005),Human Development Index (2010),Life Expectancy at Birth (1990),...,Carbon dioxide emissions per capita (production) (tonnes) (1990),Carbon dioxide emissions per capita (production) (tonnes) (1995),Carbon dioxide emissions per capita (production) (tonnes) (2000),Carbon dioxide emissions per capita (production) (tonnes) (2005),Carbon dioxide emissions per capita (production) (tonnes) (2010),Material footprint per capita (tonnes) (1990),Material footprint per capita (tonnes) (1995),Material footprint per capita (tonnes) (2000),Material footprint per capita (tonnes) (2005),Material footprint per capita (tonnes) (2010)
0,AFG,Afghanistan,Low,SA,0.27,0.31,0.34,0.40,0.45,45.97,...,0.21,0.07,0.04,0.05,0.29,2.33,1.81,2.28,2.02,1.77
1,AGO,Angola,Medium,SSA,,,0.38,0.45,0.51,41.89,...,0.43,0.79,0.58,0.98,1.24,2.44,2.08,2.74,3.39,4.16
2,ALB,Albania,High,ECA,0.65,0.63,0.68,0.71,0.75,73.14,...,1.66,0.66,0.96,1.36,1.51,6.63,5.47,9.44,12.11,11.92
3,AND,Andorra,Very High,,,,0.82,0.83,0.85,78.41,...,7.46,6.66,8.01,7.29,6.12,,,,,
4,ARE,United Arab Emirates,Very High,AS,0.73,0.76,0.80,0.82,0.83,71.90,...,28.28,28.99,35.67,24.95,21.13,64.75,53.11,42.84,42.46,54.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,WSM,Samoa,High,EAP,,0.66,0.68,0.70,0.71,67.66,...,0.54,0.67,0.82,0.94,0.99,,,,,
191,YEM,Yemen,Low,AS,0.38,0.41,0.45,0.49,0.51,58.70,...,0.81,0.69,0.83,0.98,1.00,3.12,3.35,2.96,4.14,3.81
192,ZAF,South Africa,High,SSA,0.63,0.66,0.63,0.63,0.68,63.38,...,8.50,8.73,8.41,8.69,9.12,11.17,12.60,10.58,9.11,9.83
193,ZMB,Zambia,Medium,SSA,0.41,0.41,0.42,0.47,0.53,47.93,...,0.30,0.24,0.17,0.19,0.23,3.18,3.98,2.54,2.93,2.92


In [5]:
filtered_national_df = national_df[(national_df['year'] >= 1990) & (national_df['year'] <= 2010)]
# Columns to drop
columns_to_drop = [
    'christianity_protestant', 'christianity_romancatholic', 'christianity_easternorthodox',
    'christianity_anglican', 'christianity_other', 'christianity_all',
    'judaism_orthodox', 'judaism_conservative', 'judaism_reform',
    'judaism_other', 'judaism_all', 'islam_sunni', 'islam_shi’a',
    'islam_ibadhi', 'islam_nationofislam', 'islam_alawite',
    'islam_ahmadiyya', 'islam_other', 'islam_all', 'buddhism_mahayana',
    'buddhism_theravada', 'buddhism_other', 'buddhism_all',
    'zoroastrianism_all', 'hinduism_all', 'sikhism_all', 'shinto_all',
    'baha’i_all', 'taoism_all', 'jainism_all', 'confucianism_all',
    'syncretism_all', 'animism_all', 'noreligion_all', 'otherreligion_all',
    'religion_all', 'state'
]

# Drop the specified columns
filtered_national_df = filtered_national_df.drop(columns=columns_to_drop)

# Reset index
filtered_national_df = filtered_national_df.reset_index(drop=True)

filtered_national_df

Unnamed: 0,year,code,population,protestant_percent,romancatholic_percent,easternorthodox_percent,anglican_percent,otherchristianity_percent,christianity_percent,orthodox_percent,...,jainism_percent,confucianism_percent,syncretism_percent,animism_percent,noreligion_percent,otherreligion_percent,religion_sumpercent,total_percent,dual_religion,source_code
0,1990,USA,249907000,0.53,0.26,0.03,0.02,0.02,0.86,0.00,...,0.00,0.00,0.00,0.00,0.10,0.00,1.00,1.00,0,25
1,1995,USA,262755000,0.47,0.27,0.03,0.02,0.04,0.82,0.00,...,0.00,0.00,0.00,0.00,0.13,0.00,1.00,1.00,0,99
2,2000,USA,278357000,0.41,0.27,0.02,0.02,0.07,0.79,0.00,...,0.00,0.00,0.00,0.00,0.16,0.00,1.00,1.00,0,25
3,2005,USA,295896000,0.40,0.26,0.02,0.02,0.07,0.77,0.00,...,0.00,0.00,0.00,0.00,0.17,0.00,1.00,1.00,0,99
4,2010,USA,312750000,0.38,0.25,0.02,0.02,0.07,0.75,0.00,...,0.00,0.00,0.00,0.01,0.19,0.00,1.00,1.00,0,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
923,1990,WSM,164000,0.60,0.15,0.00,0.00,0.22,0.97,0.00,...,0.00,0.00,0.00,0.00,0.01,0.00,1.00,1.00,0,7
924,1995,WSM,165000,0.60,0.16,0.00,0.00,0.22,0.98,0.00,...,0.00,0.00,0.00,0.00,0.01,0.00,1.00,1.00,0,1
925,2000,WSM,180000,0.77,0.20,0.00,0.00,0.00,0.97,0.00,...,0.00,0.00,0.00,0.00,0.01,0.02,0.98,1.00,0,83
926,2005,WSM,183000,0.77,0.20,0.00,0.00,0.00,0.97,0.00,...,0.00,0.00,0.00,0.00,0.01,0.02,0.98,1.00,0,83


In [6]:
pivoted_national_df = filtered_national_df.pivot(index='code', columns='year')

# Flatten the multi-index columns
pivoted_national_df.columns = [f'{col[0]} ({col[1]})' for col in pivoted_national_df.columns]

# Reset index to convert 'code' to a column
pivoted_national_df.reset_index(inplace=True)

pivoted_national_df

Unnamed: 0,code,population (1990),population (1995),population (2000),population (2005),population (2010),protestant_percent (1990),protestant_percent (1995),protestant_percent (2000),protestant_percent (2005),...,dual_religion (1990),dual_religion (1995),dual_religion (2000),dual_religion (2005),dual_religion (2010),source_code (1990),source_code (1995),source_code (2000),source_code (2005),source_code (2010)
0,AAB,59355.00,61609.00,63863.00,83000.00,86900.00,0.46,0.49,0.53,0.47,...,0.00,0.00,0.00,0.00,0.00,79.00,99.00,79.00,99.00,99.00
1,AFG,15754000.00,19661000.00,22720000.00,26000000.00,27000000.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,7.00,1.00,83.00,83.00,83.00
2,ALB,3256000.00,3609000.00,3113000.00,3142000.00,3195525.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,99.00,1.00,83.00,83.00,121.00
3,ALG,25022000.00,28060000.00,31471000.00,32906000.00,35666324.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,17.00,1.00,17.00,83.00,1.00
4,AND,,68000.00,78000.00,79000.00,85500.00,,0.01,0.00,0.00,...,,0.00,0.00,0.00,0.00,,2.00,83.00,83.00,83.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,YEM,11279000.00,15369000.00,18118000.00,20745000.00,23605382.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,7.00,99.00,83.00,99.00,83.00
194,YPR,2460000.00,,,,,0.00,,,,...,0.00,,,,,13.00,,,,
195,YUG,23818000.00,10547000.00,10640000.00,7780000.00,7292990.00,0.01,0.02,0.01,0.01,...,0.00,0.00,0.00,0.00,0.00,1.00,1.00,125.00,99.00,125.00
196,ZAM,8073000.00,8081000.00,9169000.00,11441000.00,13039656.00,0.24,0.27,0.29,0.29,...,0.00,0.00,0.00,0.00,0.00,7.00,1.00,13.00,83.00,83.00


In [7]:
merged_df = pd.merge(pivoted_national_df, filtered_HDI_df, left_on='code', right_on='ISO3', how='inner')
merged_df

Unnamed: 0,code,population (1990),population (1995),population (2000),population (2005),population (2010),protestant_percent (1990),protestant_percent (1995),protestant_percent (2000),protestant_percent (2005),...,Carbon dioxide emissions per capita (production) (tonnes) (1990),Carbon dioxide emissions per capita (production) (tonnes) (1995),Carbon dioxide emissions per capita (production) (tonnes) (2000),Carbon dioxide emissions per capita (production) (tonnes) (2005),Carbon dioxide emissions per capita (production) (tonnes) (2010),Material footprint per capita (tonnes) (1990),Material footprint per capita (tonnes) (1995),Material footprint per capita (tonnes) (2000),Material footprint per capita (tonnes) (2005),Material footprint per capita (tonnes) (2010)
0,AFG,15754000.00,19661000.00,22720000.00,26000000.00,27000000.00,0.00,0.00,0.00,0.00,...,0.21,0.07,0.04,0.05,0.29,2.33,1.81,2.28,2.02,1.77
1,ALB,3256000.00,3609000.00,3113000.00,3142000.00,3195525.00,0.00,0.00,0.00,0.00,...,1.66,0.66,0.96,1.36,1.51,6.63,5.47,9.44,12.11,11.92
2,AND,,68000.00,78000.00,79000.00,85500.00,,0.01,0.00,0.00,...,7.46,6.66,8.01,7.29,6.12,,,,,
3,ARG,32527000.00,34768000.00,37032000.00,38592000.00,40399992.00,0.02,0.02,0.09,0.06,...,3.44,3.67,3.86,4.15,4.57,11.08,12.69,14.36,13.39,14.59
4,ARM,,3759000.00,3520000.00,3218000.00,3245781.00,,0.00,0.01,0.01,...,2.53,1.07,1.14,1.47,1.48,,3.09,3.86,6.02,6.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,USA,249907000.00,262755000.00,278357000.00,295896000.00,312750000.00,0.53,0.47,0.41,0.40,...,20.28,20.45,21.34,20.80,18.37,28.34,30.06,34.24,38.07,28.56
83,UZB,,22690000.00,24318000.00,26167000.00,28160570.00,,0.00,0.00,0.00,...,5.07,4.50,4.88,4.41,3.62,,5.73,4.87,5.04,5.32
84,VEN,19325000.00,20712000.00,24170000.00,26577000.00,28834000.00,0.01,0.01,0.15,0.15,...,6.21,6.06,6.28,6.20,6.47,6.88,10.73,9.33,8.99,14.77
85,WSM,164000.00,165000.00,180000.00,183000.00,188861.00,0.60,0.60,0.77,0.77,...,0.54,0.67,0.82,0.94,0.99,,,,,


In [8]:
columns = merged_df.columns.tolist()

if 'Country' in merged_df.columns:
    # Get list of columns
    columns = merged_df.columns.tolist()

    # Move 'Country' column to the second position
    columns.insert(1, columns.pop(columns.index('Country')))

    # Reorder the DataFrame columns
    merged_df = merged_df[columns]

In [9]:
merged_df.to_csv("data/merged_df.csv", index=True)

In [10]:
# Count the number of NaNs in each column
nan_counts = merged_df.isnull().sum()

# Sort the columns by the number of NaNs (ascending order)
sorted_nan_counts = nan_counts.sort_values()

print(sorted_nan_counts)

code                                                                 0
Life Expectancy at Birth (2010)                                      0
Life Expectancy at Birth, male (1995)                                0
Adolescent Birth Rate (births per 1,000 women ages 15-19) (1990)     0
Life Expectancy at Birth, male (2000)                                0
                                                                    ..
Inequality-adjusted Human Development Index (2010)                  32
Coefficient of human inequality (2010)                              32
HDI female (1990)                                                   37
HDI male (1990)                                                     37
Gender Development Index (1990)                                     37
Length: 336, dtype: int64


In [11]:
 merged_df[merged_df['code']=='AFG']['Expected Years of Schooling (2000)']

0   5.53
Name: Expected Years of Schooling (2000), dtype: float64

In [12]:
selected_columns = [
    'code',
    'Country',
    'population (2010)',
    'noreligion_percent (2010)',
    'Human Development Index (2010)',
    'Expected Years of Schooling (2000)',
    'Gross National Income Per Capita (2010)'
]

# Subset the DataFrame with selected columns
selected_data = merged_df[selected_columns]
selected_data

Unnamed: 0,code,Country,population (2010),noreligion_percent (2010),Human Development Index (2010),Expected Years of Schooling (2000),Gross National Income Per Capita (2010)
0,AFG,Afghanistan,27000000.00,0.00,0.45,5.53,1938.32
1,ALB,Albania,3195525.00,0.15,0.75,10.59,10643.06
2,AND,Andorra,85500.00,0.08,0.85,10.80,48410.27
3,ARG,Argentina,40399992.00,0.12,0.83,15.54,22734.97
4,ARM,Armenia,3245781.00,0.03,0.75,11.19,9745.43
...,...,...,...,...,...,...,...
82,USA,United States,312750000.00,0.19,0.91,15.45,54265.34
83,UZB,Uzbekistan,28160570.00,0.02,0.67,10.81,4943.03
84,VEN,Venezuela,28834000.00,0.04,0.76,10.59,18702.90
85,WSM,Samoa,188861.00,0.01,0.71,12.00,6053.16


In [13]:
selected_columns = [
    'population (2010)',
    'noreligion_percent (2010)',
    'Human Development Index (2010)',
    'Expected Years of Schooling (2010)',
    'Gross National Income Per Capita (2010)'
]

# Subset the DataFrame with selected columns
selected_data = merged_df[selected_columns]

# Calculate mean, median, and mode
mean_values = selected_data.mean()
median_values = selected_data.median()
mode_values = selected_data.mode().iloc[0]  

# Calculate standard deviation and variance
std_deviation = selected_data.std()

# Display the calculated statistics
statistics_df = pd.DataFrame({
    'Mean': mean_values,
    'Median': median_values,
    'Mode': mode_values,
    'Standard Deviation': std_deviation,
})

statistics_df

Unnamed: 0,Mean,Median,Mode,Standard Deviation
population (2010),55719074.0,9410422.5,10067.0,194979018.07
noreligion_percent (2010),0.08,0.03,0.0,0.12
Human Development Index (2010),0.7,0.71,0.91,0.15
Expected Years of Schooling (2010),12.83,12.82,6.02,2.83
Gross National Income Per Capita (2010),17759.51,10165.18,1046.88,20575.47


In [14]:
selected_columns = [
    'population (2010)',
    'noreligion_percent (2010)',
    'Human Development Index (2010)',
    'Expected Years of Schooling (2010)',
    'Gross National Income Per Capita (2010)'
]

# Subset the DataFrame with selected columns
merged_df[selected_columns]

Unnamed: 0,population (2010),noreligion_percent (2010),Human Development Index (2010),Expected Years of Schooling (2010),Gross National Income Per Capita (2010)
0,27000000.00,0.00,0.45,9.00,1938.32
1,3195525.00,0.15,0.75,13.00,10643.06
2,85500.00,0.08,0.85,11.67,48410.27
3,40399992.00,0.12,0.83,16.97,22734.97
4,3245781.00,0.03,0.75,13.09,9745.43
...,...,...,...,...,...
82,312750000.00,0.19,0.91,16.00,54265.34
83,28160570.00,0.02,0.67,11.47,4943.03
84,28834000.00,0.04,0.76,13.64,18702.90
85,188861.00,0.01,0.71,12.21,6053.16


In [15]:
selected_countries = ['Afghanistan', 'Australia', 'Norway', 'Saudi Arabia', 'India']

# Filter the dataset for selected countries
selected_df = merged_df[merged_df['Country'].isin(selected_countries)]

# Calculate mean or median for specific years for selected countries
mean_1990_selected = selected_df.filter(like='(1990)').mean()
median_1995_selected = selected_df.filter(like='(1995)').median()

print(mean_1990_selected[['protestant_percent (1990)', 'Human Development Index (1990)']])
print(median_1995_selected[['protestant_percent (1995)', 'Human Development Index (1995)']])
    


protestant_percent (1990)        0.19
Human Development Index (1990)   0.62
dtype: float64
protestant_percent (1995)        0.02
Human Development Index (1995)   0.71
dtype: float64


In [16]:
columns_of_interest = ["population (2010)","Human Development Index (2010)", "Expected Years of Schooling (2010)", "noreligion_percent (2010)", "Gross National Income Per Capita (2010)"]

for column in columns_of_interest:
    # Calculate Z-scores for the selected column
    z_scores = (merged_df[column] - merged_df[column].mean()) / merged_df[column].std()
    
    # Define a threshold for identifying outliers (e.g., Z-score > 3)
    threshold = 3
    
    # Identify outliers based on the threshold
    outliers = merged_df[z_scores.abs() > threshold]
    
    # Display potential outliers for each column
    print(f"Potential Outliers for {column}:")
    print(outliers["noreligion_percent (2010)"])
    print("\n")

Potential Outliers for population (2010):
15   0.33
37   0.00
Name: noreligion_percent (2010), dtype: float64


Potential Outliers for Human Development Index (2010):
Series([], Name: noreligion_percent (2010), dtype: float64)


Potential Outliers for Expected Years of Schooling (2010):
5   0.15
Name: noreligion_percent (2010), dtype: float64


Potential Outliers for noreligion_percent (2010):
27   0.69
63   0.64
Name: noreligion_percent (2010), dtype: float64


Potential Outliers for Gross National Income Per Capita (2010):
49   0.05
64   0.02
Name: noreligion_percent (2010), dtype: float64




In [17]:
merged_df

Unnamed: 0,code,Country,population (1990),population (1995),population (2000),population (2005),population (2010),protestant_percent (1990),protestant_percent (1995),protestant_percent (2000),...,Carbon dioxide emissions per capita (production) (tonnes) (1990),Carbon dioxide emissions per capita (production) (tonnes) (1995),Carbon dioxide emissions per capita (production) (tonnes) (2000),Carbon dioxide emissions per capita (production) (tonnes) (2005),Carbon dioxide emissions per capita (production) (tonnes) (2010),Material footprint per capita (tonnes) (1990),Material footprint per capita (tonnes) (1995),Material footprint per capita (tonnes) (2000),Material footprint per capita (tonnes) (2005),Material footprint per capita (tonnes) (2010)
0,AFG,Afghanistan,15754000.00,19661000.00,22720000.00,26000000.00,27000000.00,0.00,0.00,0.00,...,0.21,0.07,0.04,0.05,0.29,2.33,1.81,2.28,2.02,1.77
1,ALB,Albania,3256000.00,3609000.00,3113000.00,3142000.00,3195525.00,0.00,0.00,0.00,...,1.66,0.66,0.96,1.36,1.51,6.63,5.47,9.44,12.11,11.92
2,AND,Andorra,,68000.00,78000.00,79000.00,85500.00,,0.01,0.00,...,7.46,6.66,8.01,7.29,6.12,,,,,
3,ARG,Argentina,32527000.00,34768000.00,37032000.00,38592000.00,40399992.00,0.02,0.02,0.09,...,3.44,3.67,3.86,4.15,4.57,11.08,12.69,14.36,13.39,14.59
4,ARM,Armenia,,3759000.00,3520000.00,3218000.00,3245781.00,,0.00,0.01,...,2.53,1.07,1.14,1.47,1.48,,3.09,3.86,6.02,6.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,USA,United States,249907000.00,262755000.00,278357000.00,295896000.00,312750000.00,0.53,0.47,0.41,...,20.28,20.45,21.34,20.80,18.37,28.34,30.06,34.24,38.07,28.56
83,UZB,Uzbekistan,,22690000.00,24318000.00,26167000.00,28160570.00,,0.00,0.00,...,5.07,4.50,4.88,4.41,3.62,,5.73,4.87,5.04,5.32
84,VEN,Venezuela,19325000.00,20712000.00,24170000.00,26577000.00,28834000.00,0.01,0.01,0.15,...,6.21,6.06,6.28,6.20,6.47,6.88,10.73,9.33,8.99,14.77
85,WSM,Samoa,164000.00,165000.00,180000.00,183000.00,188861.00,0.60,0.60,0.77,...,0.54,0.67,0.82,0.94,0.99,,,,,


In [18]:
columns = merged_df.columns

In [19]:
# Dimension Reduction for analyze

needed_columns = ["population",
                     "Human Development Index",
                     "Gross National Income Per Capita",
                     "Life Expectancy at Birth",
                     "Gender Inequality Index",
                     "Material footprint per capita (tonnes)",
                     "Expected Years of Schooling",
                     "noreligion_percent"]

all_needed_columns = ["code", "Country"]
for column in columns: 
    for needed_column in needed_columns:
        if needed_column in column: 
            # Check if the column does not contain 'male', 'female', or 'adjusted'
            if not any(word in column for word in ["male", "female", "adjusted"]):
                all_needed_columns.append(column)

all_needed_columns

['code',
 'Country',
 'population (1990)',
 'population (1995)',
 'population (2000)',
 'population (2005)',
 'population (2010)',
 'noreligion_percent (1990)',
 'noreligion_percent (1995)',
 'noreligion_percent (2000)',
 'noreligion_percent (2005)',
 'noreligion_percent (2010)',
 'Human Development Index (1990)',
 'Human Development Index (1995)',
 'Human Development Index (2000)',
 'Human Development Index (2005)',
 'Human Development Index (2010)',
 'Life Expectancy at Birth (1990)',
 'Life Expectancy at Birth (1995)',
 'Life Expectancy at Birth (2000)',
 'Life Expectancy at Birth (2005)',
 'Life Expectancy at Birth (2010)',
 'Expected Years of Schooling (1990)',
 'Expected Years of Schooling (1995)',
 'Expected Years of Schooling (2000)',
 'Expected Years of Schooling (2005)',
 'Expected Years of Schooling (2010)',
 'Gross National Income Per Capita (1990)',
 'Gross National Income Per Capita (1995)',
 'Gross National Income Per Capita (2000)',
 'Gross National Income Per Capita (2

In [20]:
df_analyze = merged_df[all_needed_columns]

In [21]:
def convert_df(df):
    # Melting the DataFrame without 'code' and 'Country' columns
    df_melted = df.melt(var_name="variable", value_name="value")

    # Extracting the year and measure from the 'variable' column
    df_melted[['measure', 'year']] = df_melted['variable'].str.rsplit(' ', n=1, expand=True)

    # Dropping the original 'variable' column as it's no longer needed
    df_melted.drop('variable', axis=1, inplace=True)

    # Pivoting the DataFrame to have years as rows
    df_pivoted = df_melted.pivot_table(index="year", columns="measure", values="value").reset_index()

    # Displaying the reshaped DataFrame
    return df_pivoted

In [22]:
afg_df = df_analyze[df_analyze["Country"] == "Afghanistan"]
converted_afg_df = convert_df(afg_df)
converted_afg_df

measure,year,Expected Years of Schooling,Gender Inequality Index,Gross National Income Per Capita,Human Development Index,Life Expectancy at Birth,Material footprint per capita (tonnes),noreligion_percent,population
0,(1990),2.5,,2684.55,0.27,45.97,2.33,0.01,15754000.0
1,(1995),4.02,,1339.09,0.31,52.54,1.81,0.01,19661000.0
2,(2000),5.53,,985.0,0.34,55.3,2.28,0.01,22720000.0
3,(2005),7.66,0.75,1435.19,0.4,58.36,2.02,0.01,26000000.0
4,(2010),9.0,0.75,1938.32,0.45,60.85,1.77,0.0,27000000.0


In [23]:
ind_df = df_analyze[df_analyze["Country"] == "India"]
converted_ind_df = convert_df(ind_df)
converted_ind_df

measure,year,Expected Years of Schooling,Gender Inequality Index,Gross National Income Per Capita,Human Development Index,Life Expectancy at Birth,Material footprint per capita (tonnes),noreligion_percent,population
0,(1990),8.0,0.71,1790.41,0.43,58.65,3.14,0.01,834697024.0
1,(1995),8.15,0.7,2082.0,0.46,60.6,3.07,0.01,915971008.0
2,(2000),8.31,0.66,2552.8,0.49,62.67,3.57,0.0,1028662016.0
3,(2005),9.65,0.62,3222.76,0.53,65.0,3.37,0.0,1115318016.0
4,(2010),10.74,0.58,4189.43,0.57,66.91,4.52,0.0,1195000000.0


In [24]:
sau_df = df_analyze[df_analyze["Country"] == "Saudi Arabia"]
converted_sau_df = convert_df(sau_df)
converted_sau_df

measure,year,Expected Years of Schooling,Gender Inequality Index,Gross National Income Per Capita,Human Development Index,Life Expectancy at Birth,Material footprint per capita (tonnes),noreligion_percent,population
0,(1990),9.59,,43005.69,0.68,68.95,20.64,0.0,14870000.0
1,(1995),10.54,,43542.21,0.71,71.01,18.73,0.0,18255000.0
2,(2000),11.58,,42183.12,0.74,72.47,18.91,0.01,20808000.0
3,(2005),12.73,0.67,44734.86,0.78,74.59,20.25,0.02,23119000.0
4,(2010),14.0,0.65,44723.27,0.82,75.76,28.55,0.02,26090784.0


In [25]:
aus_df = df_analyze[df_analyze["Country"] == "Australia"]
converted_aus_df = convert_df(aus_df)
converted_aus_df

measure,year,Expected Years of Schooling,Gender Inequality Index,Gross National Income Per Capita,Human Development Index,Life Expectancy at Birth,Material footprint per capita (tonnes),noreligion_percent,population
0,(1990),17.54,0.18,29069.15,0.86,77.04,34.68,0.09,7729000.0
1,(1995),18.93,0.18,32280.32,0.88,78.26,35.89,0.09,8047000.0
2,(2000),20.77,0.15,37296.64,0.9,79.63,41.81,0.12,8211000.0
3,(2005),21.54,0.14,41039.09,0.91,81.11,45.32,0.14,8233000.0
4,(2010),22.32,0.14,42840.46,0.92,82.06,45.4,0.15,8389771.0


In [26]:
nor_df = df_analyze[df_analyze["Country"] == "Norway"]
converted_nor_df = convert_df(nor_df)
converted_nor_df

measure,year,Expected Years of Schooling,Gender Inequality Index,Gross National Income Per Capita,Human Development Index,Life Expectancy at Birth,Material footprint per capita (tonnes),noreligion_percent,population
0,(1990),13.95,0.13,36062.79,0.84,76.59,29.64,0.08,4241000.0
1,(1995),15.57,0.11,41251.69,0.87,77.8,32.18,0.06,4359000.0
2,(2000),17.5,0.1,54793.9,0.91,78.69,41.07,0.07,4465000.0
3,(2005),17.49,0.07,63543.38,0.93,80.17,35.98,0.07,4623000.0
4,(2010),17.55,0.07,64988.88,0.94,81.04,40.43,0.11,4885082.0


In [27]:
from scipy.stats import pearsonr

# Example for one country
correlation, p_value_nor = pearsonr(converted_nor_df['noreligion_percent'], converted_nor_df['Human Development Index'])


In [28]:
data_frames = [converted_nor_df, converted_aus_df, converted_sau_df, converted_ind_df, converted_afg_df]
country_codes = ["nor", "aus", "sau", "ind", "afg"]
correlations = {}

for df, code in zip(data_frames, country_codes):
    correlation = pearsonr(df['noreligion_percent'], df['Human Development Index'])
    correlations[code] = correlation

correlations

{'nor': PearsonRResult(statistic=0.48349338601717295, pvalue=0.409302054412155),
 'aus': PearsonRResult(statistic=0.9668898289490313, pvalue=0.007196266451141036),
 'sau': PearsonRResult(statistic=0.9481837822241447, pvalue=0.014048430599804105),
 'ind': PearsonRResult(statistic=-0.8305134483596217, pvalue=0.08159724665143972),
 'afg': PearsonRResult(statistic=-0.8715473904602706, pvalue=0.05418739236913973)}

In [29]:
country_codes = ["nor", "aus", "sau", "ind", "afg"]
correlations = {}

for df, code in zip(data_frames, country_codes):
    correlation = pearsonr(df['Expected Years of Schooling'], df['Human Development Index'])
    correlations[code] = correlation

correlations

{'nor': PearsonRResult(statistic=0.9716021354913604, pvalue=0.005720101536399274),
 'aus': PearsonRResult(statistic=0.988708949000674, pvalue=0.0014378000579207118),
 'sau': PearsonRResult(statistic=0.9989546828798188, pvalue=4.056372636370809e-05),
 'ind': PearsonRResult(statistic=0.9620363677448542, pvalue=0.008828701276226644),
 'afg': PearsonRResult(statistic=0.9928507675171755, pvalue=0.0007248642958906687)}

In [37]:
import statsmodels.api as sm

# List of dataframes for each country
data_frames = [converted_nor_df, converted_aus_df, converted_sau_df, converted_ind_df, converted_afg_df]
country_codes = ["Norway", "Australia", "Saudi Arabia", "India", "Afghanistan"]
regression_results = {}

for df, country in zip(data_frames, country_codes):
    # Convert 'year' to a numerical value, removing parentheses if necessary
    #df['year'] = df['year'].str.extract('(\d+)').astype(float) # Run this line only once

    # Define the independent variables
    X = df[['noreligion_percent', 'Expected Years of Schooling', 'year']]
    y = df['Human Development Index']

    # Add a constant to the model (for the intercept)
    X = sm.add_constant(X)

    # Fit the model
    model = sm.OLS(y, X).fit()

    # Store the summary of the model in the results dictionary
    regression_results[country] = model.summary()

# To view the results for a specific country, you can print it out
# For example, to view the results for Norway:
print(regression_results)


{'Norway': <class 'statsmodels.iolib.summary.Summary'>
"""
                               OLS Regression Results                              
Dep. Variable:     Human Development Index   R-squared:                       0.997
Model:                                 OLS   Adj. R-squared:                  0.990
Method:                      Least Squares   F-statistic:                     130.6
Date:                     Tue, 02 Jan 2024   Prob (F-statistic):             0.0642
Time:                             13:21:45   Log-Likelihood:                 24.098
No. Observations:                        5   AIC:                            -40.20
Df Residuals:                            1   BIC:                            -41.76
Df Model:                                3                                         
Covariance Type:                 nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----

  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
