In [1]:
import numpy as np
import  matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import math
from sklearn.model_selection import GridSearchCV



In [2]:
# Load the life-expectancy CSV from a fixed absolute path.
# NOTE(review): '/content/...' looks like a Colab upload location - confirm
# the file exists there before running on another machine.
csv_path = '/content/Life-Expectancy-Data-Updated.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,...,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Turkiye,Middle East,2015,11.1,13.0,105.824,1.32,97,65,27.8,...,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5
1,Spain,European Union,2015,2.7,3.3,57.9025,10.35,97,94,26.0,...,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
2,India,Asia,2007,51.5,67.9,201.0765,1.57,60,35,21.2,...,64,0.13,1076,1183.21,27.1,28.0,5.0,0,1,65.4
3,Guyana,South America,2006,32.8,40.5,222.1965,5.68,93,74,25.3,...,93,0.79,4146,0.75,5.7,5.5,7.9,0,1,67.0
4,Israel,Middle East,2012,3.4,4.3,57.951,2.89,97,89,27.0,...,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7


In [4]:
# Show each column's dtype; object columns are the ones that will need
# encoding before they can be fed to a regressor.
column_dtypes = df.dtypes
print(column_dtypes)

Country                         object
Region                          object
Year                             int64
Infant_deaths                  float64
Under_five_deaths              float64
Adult_mortality                float64
Alcohol_consumption            float64
Hepatitis_B                      int64
Measles                          int64
BMI                            float64
Polio                            int64
Diphtheria                       int64
Incidents_HIV                  float64
GDP_per_capita                   int64
Population_mln                 float64
Thinness_ten_nineteen_years    float64
Thinness_five_nine_years       float64
Schooling                      float64
Economy_status_Developed         int64
Economy_status_Developing        int64
Life_expectancy                float64
dtype: object


In [5]:
# Column-wise split: every column except the last is a feature, and the
# last column (Life_expectancy in the dtype listing) is the target.
X, Y = df.iloc[:, :-1], df.iloc[:, -1]


In [6]:
 # One-hot encode the feature columns at positions 0 and 1 (Country and
 # Region, the two object-typed columns); remainder='passthrough' keeps
 # all other columns in the transformed output.
 categorical_positions = [0, 1]
 ct = ColumnTransformer(
     transformers=[('encoder', OneHotEncoder(), categorical_positions)],
     remainder='passthrough',
 )
 # NOTE: X is rebound to the encoded matrix, so this cell is not safe to
 # re-run without first re-running the cell that builds X from df.
 X = ct.fit_transform(X)

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Compare several regressors. Each one is tuned with 5-fold GridSearchCV on
# the training split, the winner is chosen by its cross-validated RMSE, and
# only that winner is then scored once on the held-out test split. Choosing
# the winner by test-set RMSE (as this cell used to) makes the reported test
# metrics optimistically biased.

# Candidate models mapped to (estimator, hyperparameter grid)
models = {
    'Linear Regression': (LinearRegression(), {'fit_intercept':[True, False]}),
    'Decision Tree': (DecisionTreeRegressor(random_state=2), {'max_depth': [None,20], 'criterion':['squared_error','friedman_mse']}),
    'Random Forest': (RandomForestRegressor(random_state=2), {'n_estimators': [50, 100,150,200], 'max_depth': [None,20,10,12,14]}),
    'Ridge Regression': (Ridge(), {'alpha': [0.2, 1, 4]}),
    'Lasso Regression': (Lasso(), {'alpha': [0.2, 1, 4]})
}

# Bookkeeping for the winner. best_cv_rmse drives the selection; the
# remaining best_* values are test-set metrics of the selected model.
best_cv_rmse = float('inf')
best_rmse = float('inf')
best_model_name = None
best_model_r2 = None
best_estimator = None
best_model_MAE = None

# Tune every candidate with GridSearchCV and keep the one with the lowest
# cross-validated RMSE.
for model_name, (model, param_grid) in models.items():
    gs = GridSearchCV(model, param_grid, scoring=['neg_root_mean_squared_error','r2'], refit='neg_root_mean_squared_error', cv=5)
    gs.fit(xtrain, ytrain)

    # best_score_ is the mean CV score of the refit metric, i.e. the
    # *negated* RMSE, so flip the sign to get an RMSE (lower is better).
    cv_rmse = -gs.best_score_

    if cv_rmse < best_cv_rmse:
        best_cv_rmse = cv_rmse
        best_model_name = model_name
        best_estimator = gs.best_estimator_

# Score the selected model once on the untouched test split.
ypred = best_estimator.predict(xtest)
# math.sqrt(MSE) instead of mean_squared_error(..., squared=False): the
# `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
best_rmse = math.sqrt(mean_squared_error(ytest, ypred))
best_model_r2 = r2_score(ytest, ypred)
best_model_MAE = mean_absolute_error(ytest, ypred)

# Print the best model (selected by cross-validated RMSE) with its test metrics
print(f"Best model based on RMSE is : {best_model_name} with RMSE {best_rmse}, R2 score of {best_model_r2} and MAE of {best_model_MAE}")
print(best_estimator.get_params())




Best model based on RMSE is : Random Forest with RMSE 0.5715238977412147, R2 score of 0.9961765655076228 and MAE of 0.3718406209063883
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 150, 'n_jobs': None, 'oob_score': False, 'random_state': 2, 'verbose': 0, 'warm_start': False}
