## **Section 1**
**Data Visualization and Analysis**

In [None]:
#libraries
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression

In [None]:
# Read the HDI workbook into a DataFrame and display it.
data = pd.read_excel(io = "HDI.xlsx")
data

Unnamed: 0,Country,ISO-3 Code,Year,Human development index (HDI)
0,Aruba,ABW,1990,
1,Aruba,ABW,1991,
2,Aruba,ABW,1992,
3,Aruba,ABW,1993,
4,Aruba,ABW,1994,
...,...,...,...,...
7135,Zimbabwe,ZWE,2015,0.553
7136,Zimbabwe,ZWE,2016,0.558
7137,Zimbabwe,ZWE,2017,0.563
7138,Zimbabwe,ZWE,2018,0.569


In [None]:
# Discard every row that has a missing value in any column
# (axis = 0 and how = "any" are dropna's defaults, written out for the reader).
data = data.dropna(axis = 0, how = "any")
data

Unnamed: 0,Country,ISO-3 Code,Year,Human development index (HDI)
30,Afghanistan,AFG,1990,0.302
31,Afghanistan,AFG,1991,0.307
32,Afghanistan,AFG,1992,0.316
33,Afghanistan,AFG,1993,0.312
34,Afghanistan,AFG,1994,0.307
...,...,...,...,...
7135,Zimbabwe,ZWE,2015,0.553
7136,Zimbabwe,ZWE,2016,0.558
7137,Zimbabwe,ZWE,2017,0.563
7138,Zimbabwe,ZWE,2018,0.569


In [None]:
# Animated choropleth: one frame per year, countries matched by ISO-3 code
# and coloured by their HDI value.
fig = px.choropleth(
    data,
    locations = "ISO-3 Code",
    color = "Human development index (HDI)",
    hover_name = "Country",
    animation_frame = "Year",
    animation_group = "Country",
    color_continuous_scale = "rdbu",
    title = "Human Development Index 1990 - 2019",
    width = 1250,
)

# Presentation settings: typeface, background colours, hover labels and the
# transition used between animation frames.
fig.update_layout(
    font = dict(family = "Times New Roman", size = 12),
    legend = dict(itemclick = "toggleothers"),
    paper_bgcolor = "#FAF9F6",
    hovermode = "closest",
    hoverlabel = dict(bgcolor = "#FAF9F6", bordercolor = "black"),
    transition = dict(duration = 5, easing = "exp-in-out"),
)
fig.show()

In [None]:
# Same animated choropleth, with the map scope limited to Africa.
fig = px.choropleth(
    data,
    locations = "ISO-3 Code",
    color = "Human development index (HDI)",
    hover_name = "Country",
    animation_frame = "Year",
    animation_group = "Country",
    color_continuous_scale = "rdbu",
    scope = "africa",
    title = "Human Development Index 1990 - 2019",
    width = 1250,
)

# Presentation settings: typeface, background colours, hover labels and the
# transition used between animation frames.
fig.update_layout(
    font = dict(family = "Times New Roman", size = 12),
    legend = dict(itemclick = "toggleothers"),
    paper_bgcolor = "#FAF9F6",
    hovermode = "closest",
    hoverlabel = dict(bgcolor = "#FAF9F6", bordercolor = "black"),
    transition = dict(duration = 5, easing = "exp-in-out"),
)
fig.show()

In [None]:
def get_HDI(country, start_year = 2010, end_year = 2019):
  """Return the percentage change in a country's HDI between two years.

  Reads the notebook-level ``data`` frame (columns "Country", "Year" and
  "Human development index (HDI)").

  Parameters:
    country: value to match against the "Country" column.
    start_year: baseline year; defaults to 2010.
    end_year: comparison year; defaults to 2019.

  Returns:
    (HDI[end_year] - HDI[start_year]) * 100 / HDI[start_year], rounded to
    4 decimal places. NaN when the country has no row for either year.
    If a year matches several rows, the first one is used.

  Raises:
    ZeroDivisionError: if the baseline HDI is exactly 0.
  """
  hdi_column = "Human development index (HDI)"
  country_rows = data[data["Country"] == country]

  def hdi_in(year):
    values = country_rows.loc[country_rows["Year"] == year, hdi_column]
    # float(Series) on a one-element Series is deprecated in pandas and
    # fails on an empty selection, so pick the scalar explicitly and fall
    # back to NaN when the year is missing.
    return float(values.iloc[0]) if len(values) else float("nan")

  hdi_start = hdi_in(start_year)
  hdi_end = hdi_in(end_year)
  percent_change = round(((hdi_end - hdi_start) * 100 / hdi_start), 4)
  return percent_change

In [None]:
# One row per distinct country present in `data`.
table = pd.DataFrame({"Country": data["Country"].unique()})

# Series.map calls get_HDI once per country, replacing the manual index loop,
# the chained table["Country"][index] lookup and the side list.
table["Percentage Change"] = table["Country"].map(get_HDI)
table

Unnamed: 0,Country,Percentage Change
0,Afghanistan,8.2627
1,Angola,12.3791
2,Albania,6.7114
3,Andorra,3.7037
4,United Arab Emirates,8.5366
...,...,...
183,Samoa,2.4355
184,Yemen,-7.1146
185,South Africa,6.7771
186,Zambia,10.8159


In [None]:
# Countries with a negative percentage change, most negative first.
declined = table["Percentage Change"].lt(0)
table.loc[declined].sort_values(by = "Percentage Change")

Unnamed: 0,Country,Percentage Change
162,Syrian Arab Republic,-15.625
99,Libya,-9.2732
184,Yemen,-7.1146
180,Venezuela (Bolivarian Republic of),-6.0766
168,Timor-Leste,-3.5032
97,Lebanon,-2.8721
86,Jordan,-1.0855


In [None]:
# The fifteen rows with the largest percentage change.
table.nlargest(n = 15, columns = "Percentage Change")

Unnamed: 0,Country,Percentage Change
160,Eswatini,19.8039
124,Niger,19.0332
187,Zimbabwe,18.4647
14,Burkina Faso,17.7083
45,Djibouti,15.4185
55,Ethiopia,15.2019
33,Côte d'Ivoire,14.9573
64,Guinea,14.6635
103,Lesotho,14.5652
26,Bhutan,13.9373


In [None]:
# Africa-scoped animated choropleth again, this time on the "pubu" colour scale.
fig = px.choropleth(
    data,
    locations = "ISO-3 Code",
    color = "Human development index (HDI)",
    hover_name = "Country",
    animation_frame = "Year",
    animation_group = "Country",
    color_continuous_scale = "pubu",
    scope = "africa",
    title = "Human Development Index 1990 - 2019",
    width = 1250,
)

# Presentation settings: typeface, background colours, hover labels and the
# transition used between animation frames.
fig.update_layout(
    font = dict(family = "Times New Roman", size = 12),
    legend = dict(itemclick = "toggleothers"),
    paper_bgcolor = "#FAF9F6",
    hovermode = "closest",
    hoverlabel = dict(bgcolor = "#FAF9F6", bordercolor = "black"),
    transition = dict(duration = 5, easing = "exp-in-out"),
)
fig.show()

## **Section 2**
### Linear Regression

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression

In [None]:
# Read the 2021 HDI component workbook into a DataFrame and display it.
hdi_2021_data = pd.read_excel(io = "HDI_2021.xlsx")
hdi_2021_data

Unnamed: 0,Country,Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita,Human Development Index (HDI)
0,Switzerland,83.9872,16.500299,13.859660,66933.004540,0.962
1,Norway,83.2339,18.185200,13.003630,64660.106220,0.961
2,Iceland,82.6782,19.163059,13.767170,55782.049810,0.959
3,"Hong Kong, China (SAR)",85.4734,17.278170,12.226210,62606.845400,0.952
4,Australia,84.5265,21.054590,12.726820,49238.433350,0.951
...,...,...,...,...,...,...
186,Burundi,61.6627,10.722722,3.129267,731.786709,0.426
187,Central African Republic,53.8947,8.040172,4.334000,966.058611,0.404
188,Niger,61.5763,6.957112,2.116717,1239.866936,0.400
189,Chad,52.5254,8.035914,2.573774,1364.169417,0.394


### Feature scaling

In [None]:
# Bounds used to rescale each raw indicator.
# NOTE(review): these look like the UNDP HDI goalposts -- confirm against the
# methodology the data was published with.
LIFE_EXPECTANCY_FLOOR = 20
LIFE_EXPECTANCY_SPAN = 65
EXPECTED_SCHOOLING_CEILING = 18
MEAN_SCHOOLING_CEILING = 15
GNI_FLOOR = 100
GNI_CEILING = 75000

life_expectancy = hdi_2021_data["Life expectancy at birth"]
expected_schooling = hdi_2021_data["Expected years of schooling"]
mean_schooling = hdi_2021_data["Mean years of schooling"]
gni_per_capita = hdi_2021_data["Gross national income (GNI) per capita"]

# Health: life expectancy shifted by the floor and divided by the span.
hdi_2021_data["Health Index"] = (life_expectancy - LIFE_EXPECTANCY_FLOOR) / LIFE_EXPECTANCY_SPAN

# Education: plain average of the two schooling ratios.
hdi_2021_data["Education Index"] = (
    (expected_schooling / EXPECTED_SCHOOLING_CEILING) + (mean_schooling / MEAN_SCHOOLING_CEILING)
) / 2

# Income: the same rescaling, applied to the natural log of GNI per capita.
log_gni_floor = np.log(GNI_FLOOR)
hdi_2021_data["Income Index"] = (np.log(gni_per_capita) - log_gni_floor) / (np.log(GNI_CEILING) - log_gni_floor)
hdi_2021_data

Unnamed: 0,Country,Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita,Human Development Index (HDI),Health Index,Education Index,Income Index
0,Switzerland,83.9872,16.500299,13.859660,66933.004540,0.962,0.984418,0.920330,0.982810
1,Norway,83.2339,18.185200,13.003630,64660.106220,0.961,0.972829,0.938599,0.977592
2,Iceland,82.6782,19.163059,13.767170,55782.049810,0.959,0.964280,0.991213,0.955282
3,"Hong Kong, China (SAR)",85.4734,17.278170,12.226210,62606.845400,0.952,1.007283,0.887489,0.972717
4,Australia,84.5265,21.054590,12.726820,49238.433350,0.951,0.992715,1.009077,0.936434
...,...,...,...,...,...,...,...,...,...
186,Burundi,61.6627,10.722722,3.129267,731.786709,0.426,0.640965,0.402162,0.300649
187,Central African Republic,53.8947,8.040172,4.334000,966.058611,0.404,0.521457,0.367805,0.342603
188,Niger,61.5763,6.957112,2.116717,1239.866936,0.400,0.639635,0.263810,0.380296
189,Chad,52.5254,8.035914,2.573774,1364.169417,0.394,0.500391,0.309012,0.394728


### Model Building

In [None]:
SEED = 42  # fixed seed so the train/test split is identical on every run

# Response variable, selected by column name so the cell does not depend on
# the column order of the workbook.
y = hdi_2021_data["Human Development Index (HDI)"]
# Independent variables: the three dimension indices computed above.
X = hdi_2021_data[["Health Index", "Education Index", "Income Index"]]

# Hold out 20% of the rows for testing. SEED was previously defined but never
# passed, so the split (and every score derived from it) changed on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = SEED)

In [None]:
# Baseline: ordinary least squares on the untransformed indices.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# score() returns R2 on the held-out rows.
ols_r2 = reg.score(X_test, y_test)
print("OLS R2:", ols_r2)

OLS R2: 0.998407348708885


In [None]:
#Finding Optimum i
# Every feature is raised to the same exponent i, for i in [-5, 5) in steps of
# 0.01. Each candidate is scored by 5-fold cross-validated R2 on the training
# rows only: choosing the exponent by its test-set score would leak the test
# set into model selection and inflate the R2 reported afterwards.
model_scores = dict()
for i in np.arange(-5, 5, 0.01):
  exponent = round(i, 2)  # np.arange accumulates float error; snap to 2 decimals once
  X_train_new = np.power(X_train, exponent)

  fold_scores = cross_val_score(LinearRegression(), X_train_new, y_train, cv = 5, scoring = "r2")
  model_scores[f'model_{exponent}'] = fold_scores.mean()


# (best score, key of the exponent that achieved it)
max(zip(model_scores.values(), model_scores.keys()))

(0.998829493490566, 'model_0.83')

In [None]:
#Linear Regression on the Optimum i
X_train_new = np.power(X_train, 0.83)
X_test_new = np.power(X_test, 0.83)

best_model = LinearRegression().fit(X_train_new, y_train)
improved_y_pred = best_model.predict(X_test_new)
print("Modified Regression R2:", best_model.score(X_test_new, y_test))
print("Model Intercept:", best_model.intercept_)
print("Model Coefficients:", best_model.coef_)

Modified Regression R2: 0.998829493490566
Model Intercept: -0.13992865693669743
Model Coefficients: [0.34646512 0.40562965 0.37859859]


In [None]:
# Actual vs predicted HDI on the held-out rows, with an OLS trendline drawn
# through the points.
scatter = px.scatter(
    x = y_test,
    y = improved_y_pred,
    labels = dict(x = "Actual Y", y = "Predicted Y"),
    title = "Actual vs Predicted Plot for Modified Linear Regression",
    trendline = "ols",
    trendline_color_override = "black",
)

# Presentation settings: typeface, background colours, hover labels and
# transition.
scatter.update_layout(
    font = dict(family = "Times New Roman", size = 12),
    legend = dict(itemclick = "toggleothers"),
    paper_bgcolor = "#FAF9F6",
    hovermode = "closest",
    hoverlabel = dict(bgcolor = "#FAF9F6", bordercolor = "black"),
    transition = dict(duration = 5, easing = "exp-in-out"),
)

# Set the marker colour on the figure's traces.
scatter.update_traces(marker_color = '#DA70D6')

scatter.show()
