In [None]:
# Here are imported the libraries/modules that are used below for the analysis.

#%matplotlib notebook
%matplotlib inline
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import scipy.stats  # loads the submodule so that "sp.stats" is guaranteed to resolve
import pandas as pd
import sklearn as sk
import sklearn.linear_model  # load the submodules that are reached below as "sk.<name>"
import sklearn.model_selection
from sklearn import neighbors
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [None]:
# Both .csv files are read from the "Downloads" folder of the current user.
# Readers who keep the files elsewhere only need to change DATA_DIR.
DATA_DIR = Path.home() / "Downloads"

# The column names of "gdp per capita.csv" are on the third row of the file (header=2). The delimiter is ",".
GDP = pd.read_csv(DATA_DIR / "gdp per capita.csv", delimiter=",", header=2)

# "better life index.csv" is read with the default header (first row).
LS = pd.read_csv(DATA_DIR / "better life index.csv")

In [None]:
# Preview of the raw GDP dataframe: head() shows its first five rows.

GDP.head() 

In [None]:
# Keep only the two columns needed for the analysis: the country name and the 2015 value.
# (The variable is spelled "GPD1"; later cells refer to it under that name.)

GPD1 = GDP[["Country Name", "2015"]]

In [None]:
# I display the content of the two-column dataframe (named "GPD1" in the code).

GPD1

In [None]:
# Build GDP2 from GPD1 in three steps:
#   1. use "Country Name" as the index,
#   2. rename the column "2015" to "GDP per capita 2015 (USD)",
#   3. sort the rows alphabetically by country.

GDP2 = (
    GPD1
    .set_index("Country Name")
    .rename(columns={"2015": "GDP per capita 2015 (USD)"})
    .sort_index()
)

In [None]:
# I show the first ten rows of the GDP2 dataframe as an example.

GDP2.head(10)

In [None]:
# I show the first five rows of the LS dataframe as an example.

LS.head()

In [None]:
# I show the shape (rows, columns) of the LS dataframe.
# For the file used by the author this was 2369 rows and 17 columns.

LS.shape 

In [None]:
# Keep only the rows of LS that satisfy both conditions:
#   - the "Indicator" column equals "Life satisfaction",
#   - the "INEQUALITY" column equals "TOT".
# "TOT" is taken here to denote the total value for men and women together in a given country.

is_life_satisfaction = LS["Indicator"] == "Life satisfaction"
is_total = LS["INEQUALITY"] == "TOT"
LS1 = LS[is_life_satisfaction & is_total]

In [None]:
# I show the first 10 entries of the LS1 dataframe as an example.

LS1.head(10)

In [None]:
# Build LS2 from LS1:
#   1. rename the columns "Country" and "Value" to "Country Name" and "Life Satisfaction Value",
#   2. use "Country Name" as the index,
#   3. keep only the "Life Satisfaction Value" column (as a one-column dataframe),
#   4. sort the rows alphabetically by country.

LS2 = (
    LS1
    .rename(columns={"Country": "Country Name", "Value": "Life Satisfaction Value"})
    .set_index("Country Name")
    .loc[:, ["Life Satisfaction Value"]]
    .sort_index()
)

In [None]:
# I show the first 10 entries of the LS2 dataframe as an example.

LS2.head(10)

In [None]:
# The aggregate row with index label "OECD - Total" is not a country, so it is left out of the analysis.

is_country_row = LS2.index != "OECD - Total"
LS3 = LS2.loc[is_country_row]

In [None]:
# I show the first 10 entries of the LS3 dataframe as an example.

LS3.head(10)

In [None]:
# Combine life satisfaction (LS3) with GDP per capita (GDP2) on the country-name index.
# This is a left join: every row of LS3 is kept, and a country whose name has no exact
# match in GDP2 gets NaN in the GDP column.

df = LS3.join(GDP2, how="left")

In [None]:
# I display the entries of the joined dataframe, df.

df

In [None]:
# Drop every row that contains at least one NaN; the result, "df1", is the dataframe used for the analysis.

df1 = df.dropna(axis=0, how="any")

In [None]:
# I display the cleaned dataframe, df1.

df1

In [None]:
# I show the shape (rows, columns) of the df1 dataframe.
# For the data used by the author this was 38 rows and 2 columns.

df1.shape

In [None]:
# Scatter plot of life satisfaction against GDP per capita for the rows of df1.

df1.plot.scatter(
    x="GDP per capita 2015 (USD)",
    y="Life Satisfaction Value",
    color="b",
    figsize=(10, 6),
)

In [None]:
# Pearson correlation coefficient r between the two columns of df1,
# displayed as a correlation matrix.

df1.corr(method="pearson")

In [None]:
# Extract the two columns of df1 as NumPy arrays. Selecting with a one-element list keeps
# each result two-dimensional, i.e. a single-column array: "a" holds GDP per capita and
# "b" holds the life satisfaction values.

a = df1[["GDP per capita 2015 (USD)"]].to_numpy()
b = df1[["Life Satisfaction Value"]].to_numpy()

In [None]:
# I flatten the single-column array "a" into the one-dimensional array "X".
# reshape(-1) lets NumPy infer the length, so the cell keeps working if the number
# of countries in df1 changes.

X = a.reshape(-1)

In [None]:
# I display the "X" array (GDP per capita values).

X

In [None]:
# I flatten the single-column array "b" into the one-dimensional array "y".
# reshape(-1) lets NumPy infer the length, so the cell keeps working if the number
# of countries in df1 changes.

y = b.reshape(-1)

In [None]:
# I display the "y" array (life satisfaction values).
y

In [None]:
# First model: simple linear regression of "y" on "X".
# scipy.stats.linregress returns the slope and intercept of the fitted line together with
# further fit statistics; everything is kept in "result".

result = sp.stats.linregress(x=X, y=y)

In [None]:
# I print the result object returned by linregress for the simple linear regression.

print(result)

In [None]:
# I create a figure with a single subplot showing the original data of the df1 dataframe
# and the linear regression line Y(X) fitted above with linregress ("result").

# The legend text is built from the fitted coefficients, so it always matches the fit.
# The slope is written as mantissa * 10^exponent; a zero slope is shown with exponent 0.
slope_exp = int(np.floor(np.log10(abs(result.slope)))) if result.slope != 0 else 0
slope_mant = result.slope / 10.0**slope_exp
line_label = (
    "Linear regression line: "
    rf"$Y(X)={result.intercept:.2f}{slope_mant:+.2f}\cdot 10^{{{slope_exp}}} X$"
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, color="b", label="Original data")
ax.plot(X, result.intercept + result.slope * X, color="m", label=line_label)
ax.set_xlabel("GDP per capita 2015 (USD)")
ax.set_ylabel("Life Satisfaction Value")
ax.legend()

In [None]:
# I show the original data and the linear regression line with seaborn's regplot.
# The shaded band is the 95% confidence interval (ci=95) that regplot draws around the
# regression line; seed=1 makes that band reproducible.

# The legend text is built from the coefficients stored in "result" (the linregress fit),
# so it always matches the fit. The slope is written as mantissa * 10^exponent.
slope_exp = int(np.floor(np.log10(abs(result.slope)))) if result.slope != 0 else 0
slope_mant = result.slope / 10.0**slope_exp
line_label = (
    "Linear regression line: "
    rf"$Y(X)={result.intercept:.2f}{slope_mant:+.2f}\cdot 10^{{{slope_exp}}} X$"
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    x="GDP per capita 2015 (USD)",
    y="Life Satisfaction Value",
    data=df1,
    ci=95,
    order=1,
    line_kws={"label": line_label, "color": "m"},
    seed=1,
    truncate=False,
    label="Original data",
    ax=ax,  # draw on the axes created above instead of relying on the "current" axes
)
ax.set_xticks([1000, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000])
ax.set_yticks(np.arange(3.0, 10.5, 0.5))
ax.legend(loc="upper left")

In [None]:
# I calculate the t-score needed to estimate the Confidence Intervals (CIs) of the linear
# regression coefficients "beta_0" and "beta_1" at a significance level of alpha=0.05,
# i.e. a Confidence Level (CL) of 95%.
# "n" is the number of data points, taken from the data itself; the t-distribution is
# evaluated with n-2 degrees of freedom.

n = len(X)
alpha = 0.05
t_score = sp.stats.t.ppf(1 - alpha/2, n - 2)
print("Model t_score: ", t_score)

In [None]:
# Second model: K-nearest-neighbours (KNN) regression with K=5 neighbours.

model = neighbors.KNeighborsRegressor(n_neighbors=5)

In [None]:
# Fit the KNN model. The one-dimensional arrays X and y are given a second axis so that
# both are passed to fit() as single-column 2D arrays.

model.fit(X[:, np.newaxis], y[:, np.newaxis])

In [None]:
# GDP per capita values used as new inputs for prediction. According to the author they
# belong to Albania, United Arab Emirates and Armenia (in this order), countries that
# are not present in the df1 dataframe.

X_new = [
    [3952.801215],
    [38663.383807],
    [3607.296697],
]

In [None]:
# I print the values of "Life Satisfaction Value" predicted by "model" for the inputs
# in X_new (Albania, United Arab Emirates and Armenia, respectively).

print(model.predict(X_new))

In [None]:
# R^2 score of the model held in "model", evaluated on the same data it was fitted on
# (so this is a training score, not a measure of predictive accuracy).

knn_r2 = model.score(X.reshape(-1, 1), y.reshape(-1, 1))
print(f"Model R^2 value:  {knn_r2}")

In [None]:
# Here, as an example, I show that the simple linear regression found above
# can also be done with the sklearn module.
# NOTE: this rebinds the name "model", which until now referred to the KNN regressor.
# Cells above that use "model" must not be re-run after this one without re-running the KNN cells first.

model = sk.linear_model.LinearRegression(fit_intercept=True)
model.fit(X.reshape(-1, 1),y.reshape(-1, 1))

In [None]:
# I plot the original data together with the regression line predicted by the sklearn
# linear model held in "model", for comparison with the line obtained with the stats module.

X_col = X.reshape(-1, 1)  # single-column 2D view of X, as expected by predict()
yfit = model.predict(X_col)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_col, y.reshape(-1, 1), label="Original data")
ax.plot(X_col, yfit, color="m", label="Linear regression line")
ax.set_xlabel("GDP per capita 2015 (USD)")
ax.set_ylabel("Life Satisfaction Value")
ax.legend()


In [None]:
# Slope and intercept of the linear model fitted with sklearn, printed for comparison
# with the values obtained with the stats module.

print(f"Model slope:  {model.coef_[0]}")
print(f"Model intercept: {model.intercept_}")

In [None]:
# Train/test analysis to assess how well the models predict unseen data.
# 20% of the data is held out as the test set; random_state=0 makes the split reproducible.

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    X.reshape(-1, 1),
    y.reshape(-1, 1),
    test_size=0.2,
    random_state=0,
)

In [None]:
# First model of the train-test analysis: a linear regression (with intercept)
# fitted on the training data only.

model1 = sk.linear_model.LinearRegression(fit_intercept=True)
model1.fit(X_train, y_train)

In [None]:
# Slope and intercept of the linear model fitted on the training data.

print(f"Model_1 slope:  {model1.coef_[0]}")
print(f"Model_1 intercept: {model1.intercept_}")

In [None]:
# R^2 score of the linear regression model on the training data and on the test data.

model1_train_r2 = model1.score(X_train, y_train)
model1_test_r2 = model1.score(X_test, y_test)
print(f"Model_1 train R^2 value:  {model1_train_r2}")
print(f"Model_1 test R^2 value:  {model1_test_r2}")

In [None]:
# Second model of the train-test analysis: KNN regression on standardized predictor data.
# (StandardScaler is imported in the import cell at the top of the notebook.)

scaler = StandardScaler()
# The scaler is fitted on the training predictors only and then reused, unchanged,
# on the test predictors, so no information from the test set enters the scaling.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# K=3 is the value that gives the best test-set accuracy in the scan over K further below.
model2 = sk.neighbors.KNeighborsRegressor(n_neighbors=3)
model2.fit(X_train_scaled, y_train)

In [None]:
# R^2 score of the KNN regression model (model2) on the scaled training and test data.

model2_train_r2 = model2.score(X_train_scaled, y_train)
model2_test_r2 = model2.score(X_test_scaled, y_test)
print(f"Model train R^2 value:  {model2_train_r2}")
print(f"Model test R^2 value:  {model2_test_r2}")

In [None]:
# I scan K = 1, ..., 22 and evaluate the test-set RMSE and R^2 of a KNN regressor for each K.
# The author found that K=3 gives the best model accuracy.
# NOTE(review): K is being selected on the test set here, so the test score reported for
# the chosen K is optimistic; cross-validation on the training set would avoid this.

rmse_val = []  # root-mean-square error on the test set, one entry per K
R_score = []   # R^2 score on the test set, one entry per K

for K in range(1, 23):
    model3 = sk.neighbors.KNeighborsRegressor(n_neighbors=K).fit(X_train_scaled, y_train)  # fit the model
    pred = model3.predict(X_test_scaled)  # predictions on the test set
    error = np.sqrt(mean_squared_error(y_test, pred))  # RMSE
    r2 = model3.score(X_test_scaled, y_test)  # computed once, then stored and printed
    rmse_val.append(error)
    R_score.append(r2)
    print('RMSE and R^2 values for k=' , K , 'are respectively:', error, r2)

In [None]:
# I plot the test MSE (the square of the stored RMSE) and the test R^2 against K.

# One K value per stored result (K starts at 1), so the axis always matches the scan above.
K = np.arange(1, len(rmse_val) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K, np.array(rmse_val)**2, label="test $MSE$ value", marker="o")
ax.plot(K, np.array(R_score), label="test $R^2$ value", marker="o")
ax.set_xlabel("$K$")
ax.set_xticks(K, minor=False)
ax.set_yticks(np.arange(0.0, 0.85, 0.05), minor=False)
ax.set_title("KNN regression model for the test data")
ax.legend()

In [None]:
# I print the "Life Satisfaction Value" predictions for the inputs in X_new
# (Albania, UAE and Armenia) using model2, the K=3 KNN model fitted on the training data.
# X_new is passed through the same fitted scaler that was applied to the training predictors.

print(model2.predict(scaler.transform(X_new)))