# we need to predict the happiness score considering all the other factors mentioned in the dataset

## LinearRegressionModel

In [None]:
#lets import necessary library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the happiness dataset from the working directory and preview its first rows
data = pd.read_csv(filepath_or_buffer='happiness_score_dataset.csv')
data.head(n=5)

In [None]:
# List the column labels to see which fields the dataset contains
data.columns

In [None]:
# Report the dataset size as (rows, columns)
data.shape

In [None]:
# Count the missing (null) values in each column
data.isnull().sum()

In [None]:
# Summary statistics of the dataset
data.describe()

In [None]:
# Documentation
#1) there are no null values
#2) after looking through all the data, I think it is fine as it is
#3) I will remove the Happiness Rank column because it is not useful as a feature: it only acts like a row numbering


In [None]:
# Separate the target (y) from the predictors (X).
# The identifier columns, the rank and the target itself are excluded from the predictors.
y = data['Happiness Score']
X = data.drop(columns=['Country', 'Region', 'Happiness Rank', 'Happiness Score'])

In [None]:
# Plot the distribution of each predictor on a 3x3 grid.
# sns.distplot is deprecated since seaborn 0.11 and is no longer available in newer releases;
# histplot with kde=True and stat='density' is the documented replacement and draws the
# same histogram + density curve.
plt.figure(figsize=(20,25) , facecolor='yellow')

# Only the first nine predictors fit on the grid; any further columns are not drawn.
for plotnumber, column in enumerate(X.columns[:9], start=1):
    plt.subplot(3, 3, plotnumber)
    sns.histplot(X[column], kde=True, stat='density', kde_kws={'cut': 3})
    plt.xlabel(column , fontsize=10)
plt.show()

In [None]:
# Skewness of each predictor, largest first
X.skew().sort_values(ascending=False)

In [None]:
# Apply a power transformation to reduce skewness; chosen because the dataset is small and the outliers should be kept
from sklearn.preprocessing import power_transform , PowerTransformer
pt=PowerTransformer()

In [None]:
# Fit the transformer on X and get the transformed values
# NOTE(review): the transformer is fitted on the full feature set before any train/test split,
# so rows that later end up in the test split influence the fitted parameters.
# Consider fitting on the training split only.
X_scaled=pt.fit_transform(X)
X_scaled

In [None]:
# Check the skewness again after the transformation
pd.DataFrame(X_scaled,columns=X.columns).skew().sort_values(ascending=False)

In [None]:
# Convert the transformed numpy array into a DataFrame that keeps the original column names
X_new=pd.DataFrame(X_scaled,columns=X.columns)

In [None]:
# Skewness of the transformed DataFrame (same check as above, now computed from X_new)
X_new.skew().sort_values(ascending=False)

In [None]:
#y=data['Happiness Score']

In [None]:
# Best Random State
# Try random_state 0..10 for the train/test split and remember the one with the highest test R2.
# NOTE(review): picking the split by its test score makes the reported test score optimistic.
# Start from -inf so a split is still selected (and reported correctly) when every test R2 is <= 0.
MaxAccu=float('-inf')
MaxRS=0

for i in range (0,11):
    X_train,X_test,y_train,y_test=train_test_split(X_new,y,test_size=0.25,random_state=i)
    regression=LinearRegression()
    regression.fit(X_train,y_train)

    # score() returns R2 and predicts internally, so no separate predict() call is needed here
    training=regression.score(X_train,y_train)
    print ('Training Score' , training , 'RandomState' ,i)

    testing=regression.score(X_test,y_test)
    print ('Testing Score' , testing , 'RandomState' ,i)
    print('\n')

    if testing>MaxAccu:
        MaxAccu=testing
        MaxRS=i
        print('MAXINING TESTING SCORE' , MaxAccu , 'ON RANDOM STATE OF' , i)
        

In [None]:
# Report the best test R2 found by the random-state search and the random_state that produced it
print('Best Accuracy is ' , MaxAccu , 'On Random State' , MaxRS)

In [None]:
# Split the data into train and test sets (25% test) with random_state 6
# NOTE(review): 6 is hard-coded; presumably it is the MaxRS value printed above — confirm it still matches if the data changes
X_train,X_test,y_train,y_test=train_test_split(X_new,y,test_size=0.25,random_state=6)

In [None]:
# Train a Linear Regression model on the training split
regression=LinearRegression()
regression.fit(X_train,y_train)

In [None]:
# R2 score on the training split
regression.score(X_train,y_train)  

In [None]:
# R2 score on the test split
regression.score(X_test,y_test)      

In [None]:
# Predictions for the test split
y_pred=regression.predict(X_test)
y_pred

In [None]:
# Scatter plot of the actual vs. the predicted happiness scores for the test split
plt.scatter(x=y_test, y=y_pred)
plt.title('Actual VS Model Prediction')
plt.ylabel('Predicted happiness score')
plt.xlabel('Actual happiness score')
plt.show()

In [None]:
#error in terms of numbers
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [None]:
# again predict 
y_pred = regression.predict(X_test)

In [None]:
#MAE
mean_absolute_error(y_test,y_pred)

In [None]:
#MSE
mean_squared_error(y_test,y_pred)

In [None]:
#RMSE
np.sqrt(mean_squared_error(y_test,y_pred))

In [None]:
#now we have successfully created the model and our score is great, so we check it for overfitting
#to check for overfitting we compare it against regularized models

# there are 3 types of regularization, but we use only 2 of them



# the 3 types are 1) LASSO (L1 penalty) 2) RIDGE (L2 penalty) 3) ELASTIC NET

In [None]:
#BASICALLY LASSOCV AND RIDGECV WILL FIND THE BEST REGULARIZATION STRENGTH (ALPHA - NOT A LEARNING RATE) AND WE THEN TRAIN THE MODEL WITH IT

In [None]:
#import library
from sklearn.linear_model import Ridge,Lasso,RidgeCV,LassoCV

# LASSO MODEL

In [None]:
##### LASSO MODEL######

lasscv = LassoCV(alphas = None , max_iter = 100, normalize = True)

lasscv.fit(X_train , y_train)

In [None]:
# best aplha parameter
alpha = lasscv.alpha_
alpha

In [None]:
# now we have best parametr noe train according to it
lasso_reg = Lasso(alpha)
lasso_reg.fit(X_train,y_train)

In [None]:
# now check r2 score
lasso_reg.score(X_test,y_test)

# RIDGE MODEL

In [None]:
############ RIDGE MODEL#########

ridgecv = RidgeCV(alphas = np.arange(0.001,0.1,0.01), normalize = True)
ridgecv.fit(X_train , y_train)


In [None]:
# best aplha parameter
alpha = ridgecv.alpha_
alpha

In [None]:
# now we have best parametr noe train according to it
ridge_reg = Ridge(alpha)
ridge_reg.fit (X_train,y_train)

In [None]:
# now check r2 score
ridge_reg.score(X_test,y_test)

In [None]:
# after seeing the LASSO and RIDGE scores we can conclude that our model is not overfitted
## for confirmation, let's look at the cross-validation of the model

# Cross Vaildation

In [None]:
# Cross-validation: compare the mean CV score for 2 to 9 folds
# with the train/test R2 of the fitted Linear Regression model
from sklearn.model_selection import cross_val_score

training = regression.score(X_train, y_train)
testing = regression.score(X_test, y_test)

for j in range(2, 10):
    cv_score = cross_val_score(estimator=regression, X=X_new, y=y, cv=j)
    cv_mean = cv_score.mean()
    print(f'At cross fold {j} the cv score is {cv_mean} and the R2 score for Training is {training} and R2 score for the Testing is{testing}')
    print('\n')

In [None]:
# as we can see, all folds give about the same result, so our model is the best model, without overfitting

# Training Score for Linear Regression = 0.9907160861218112
# Testing Score for Linear Regression =  0.9954846871242079

# RandomForestRegressor

In [None]:
# lets try another model 

# RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Hyper-parameter grid for RandomForestRegressor.
# - criterion: 'mse'/'mae' were renamed to 'squared_error'/'absolute_error' in scikit-learn 1.0
#   and the old names were removed in 1.2.
# - max_features: 'auto' was removed in 1.3; for a regressor it meant "all features", i.e. 1.0.
# - min_samples_split: an integer value must be at least 2; a value of 1 makes every fit
#   using it fail, so 2 is the smallest usable setting.
parameters={'criterion':['squared_error','absolute_error','poisson'],
           'max_features':[1.0,'sqrt','log2'],
           'min_samples_split':[2,11],
           'max_depth':[1,15],
           'min_samples_leaf':[1,7]}

In [None]:
rf=RandomForestRegressor()
clf=GridSearchCV(rf,parameters)
clf.fit(X_train,y_train)

In [None]:
#print best parameters
print(clf.best_params_)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Refit RandomForestRegressor with the best parameters and report train/test R2 (in percent).
# 'mse' was renamed to 'squared_error' (scikit-learn 1.0, old name removed in 1.2) and
# max_features='auto' was removed in 1.3; for a regressor 'auto' meant "use all features",
# which is what max_features=1.0 selects.
rf=RandomForestRegressor(criterion='squared_error', max_depth=15, max_features=1.0, min_samples_leaf=1, min_samples_split=11)
rf.fit(X_train,y_train)
print ('Training R2 Score: ' ,rf.score(X_train,y_train)*100)
pred_decision=rf.predict(X_test)

rfs = r2_score(y_test,pred_decision)
print('Testing R2 Score:' , rfs*100)



In [None]:
# Training Score for RandomForestRegressor = 95.58203290686926
# Testing Score for RandomForestRegressor =  89.56146461532899

# KNeighborsRegressor

In [None]:
# lets try another model 

#KNeighborsRegressor

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Hyper-parameter grid for KNeighborsRegressor (candidate order is kept as written)
parameterss = dict(
    algorithm=['kd_tree', 'brute'],
    leaf_size=[3, 5, 10, 15, 30],
    n_neighbors=[3, 15, 7, 50, 20],
)

In [None]:
knn=KNeighborsRegressor()
clf=clf=GridSearchCV(knn,parameterss)
clf.fit(X_train,y_train)

In [None]:
#print best parameters
print(clf.best_params_)

In [None]:
# Refit KNeighborsRegressor with the best parameters and report train/test R2 (in percent).
knn=KNeighborsRegressor(algorithm= 'kd_tree', leaf_size= 3, n_neighbors= 7)
knn.fit(X_train,y_train)
print ('Training R2 Score: ' ,knn.score(X_train,y_train)*100)
# Predict with the KNN model itself; predicting with `rf` here would report the
# RandomForest test score under the KNN heading.
pred_decision=knn.predict(X_test)

rfs = r2_score(y_test,pred_decision)
print('Testing R2 Score:' , rfs*100)



In [None]:
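# Training Score for KNeighborsRegressor = 91.1742654070661
# Testing Score for KNeighborsRegressor = 89.56146461532899
# NOTE: this testing value is identical to the RandomForestRegressor testing score because it was
# computed from rf.predict(X_test), not knn.predict(X_test); it must be re-measured for KNN.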

In [None]:
# thus we conclude that, of the 3 models tried, we select Linear Regression for this particular problem because its score is the best among the 3 models
# Training Score for Linear Regression = 0.9907160861218112
# Testing Score for Linear Regression =  0.9954846871242079