In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [3]:
# Load the life-expectancy dataset from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='lifeexpectancy.csv')

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Country,Year,Status,Adult_Mortality,BMI,Alcohol,HIV_AIDS,Total_expenditure,Income_composition_of_resources,Schooling,Life_expectancy
0,Canada,2024,Developed,91,23.019938,4.499131,1.080157,7.845313,0.542585,12.046433,90.0
1,Germany,2005,Developed,215,21.950533,8.941229,4.836356,6.70371,0.620611,16.947498,90.0
2,France,2025,Developed,131,22.777076,6.507659,3.575625,14.811973,0.676434,16.698824,90.0
3,Brazil,2011,Developing,340,31.659207,2.466083,4.785674,1.782969,0.4229,10.826091,89.37576
4,Canada,2022,Developed,207,,,2.135525,2.990539,0.88559,8.567364,90.0


In [5]:
# Print column dtypes and non-null counts to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          1200 non-null   object 
 1   Year                             1200 non-null   int64  
 2   Status                           1200 non-null   object 
 3   Adult_Mortality                  1200 non-null   int64  
 4   BMI                              1080 non-null   float64
 5   Alcohol                          1080 non-null   float64
 6   HIV_AIDS                         1080 non-null   float64
 7   Total_expenditure                1200 non-null   float64
 8   Income_composition_of_resources  1080 non-null   float64
 9   Schooling                        1080 non-null   float64
 10  Life_expectancy                  1200 non-null   float64
dtypes: float64(7), int64(2), object(2)
memory usage: 103.3+ KB


In [6]:
# (rows, columns) of the frame before any cleaning.
df.shape

(1200, 11)

#### Data Cleaning

In [7]:
# Drop the 'Year' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after 'Year' is already gone no longer raises
# a KeyError.
df = df.drop(columns=['Year'], errors='ignore')
df.head()

Unnamed: 0,Country,Status,Adult_Mortality,BMI,Alcohol,HIV_AIDS,Total_expenditure,Income_composition_of_resources,Schooling,Life_expectancy
0,Canada,Developed,91,23.019938,4.499131,1.080157,7.845313,0.542585,12.046433,90.0
1,Germany,Developed,215,21.950533,8.941229,4.836356,6.70371,0.620611,16.947498,90.0
2,France,Developed,131,22.777076,6.507659,3.575625,14.811973,0.676434,16.698824,90.0
3,Brazil,Developing,340,31.659207,2.466083,4.785674,1.782969,0.4229,10.826091,89.37576
4,Canada,Developed,207,,,2.135525,2.990539,0.88559,8.567364,90.0


In [8]:
# Columns that actually contain missing values (120 nulls each according to
# df.info() / df.isnull().sum()). 'Total_expenditure' is not listed because it
# has no nulls.
misscol = ['BMI','Alcohol','HIV_AIDS','Income_composition_of_resources','Schooling']

In [9]:
# Count missing values per column.
df.isna().sum()

Country                              0
Status                               0
Adult_Mortality                      0
BMI                                120
Alcohol                            120
HIV_AIDS                           120
Total_expenditure                    0
Income_composition_of_resources    120
Schooling                          120
Life_expectancy                      0
dtype: int64

#### Training the model 

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing  import OneHotEncoder,StandardScaler,OrdinalEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression       #not that much efficient for this model
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [11]:
# Separate the regression target from the predictor columns.
y = df['Life_expectancy']
x = df.drop(columns=['Life_expectancy'])

In [12]:
# Train/test split: hold out 20% of the rows, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Sanity-check the sizes of the two splits.
(x_train.shape, x_test.shape)

((960, 9), (240, 9))

#### Preprocessing


In [14]:
# Feature groups; each group is routed to its own preprocessing branch.
ordinal_col = ['Status']
categor_col = ['Country']
numeric_col = [
    'Adult_Mortality',
    'BMI',
    'Alcohol',
    'HIV_AIDS',
    'Total_expenditure',
    'Income_composition_of_resources',
    'Schooling',
]

In [15]:
# Numeric branch: fill missing values with the column mean, then standardise.
num_transf = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ("scaling", StandardScaler()),
    ]
)

In [16]:
# Status branch: encode with an explicit category order
# ('Developed' first, 'Developing' second).
status_levels = ['Developed', 'Developing']
ord_tranf = Pipeline(
    steps=[("ordinal", OrdinalEncoder(categories=[status_levels]))]
)
del status_levels  # keep the notebook namespace unchanged

In [17]:
# Country branch: one-hot encode, dropping the first category as the baseline.
# NOTE(review): per the scikit-learn docs, combining drop='first' with
# handle_unknown='ignore' encodes an unseen category as all zeros, i.e. the
# same row as the dropped baseline category -- confirm this is intended.
ohe_tranf = Pipeline(
    steps=[('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))]
)

In [18]:
# Route each feature group to its matching preprocessing branch.
preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_transf, numeric_col),
        ('ord', ord_tranf, ordinal_col),
        ('cat', ohe_tranf, categor_col),
    ]
)

In [19]:
# Full model: preprocessing followed by an ordinary linear regression.
mypipe = Pipeline(
    steps=[
        ("preprocessor", preprocessing),
        ("model", LinearRegression()),
    ]
)

In [20]:
# Fit preprocessing and regression together on the training split,
# then display the fitted estimator.
pipeline = mypipe.fit(X=x_train, y=y_train)
pipeline

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,"[['Developed', 'Developing']]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [21]:
# Predict life expectancy for the held-out rows.
ypred = pipeline.predict(X=x_test)

In [22]:
# Coefficient of determination on the test split.
r2 = r2_score(y_true=y_test, y_pred=ypred)
print(r2)

0.11561643434390934


In [23]:
# Error metrics on the test split: RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, ypred)
mse = mean_squared_error(y_test, ypred)
rmse = np.sqrt(mse)

# One print call, one metric per line.
print(
    f"R² Score: {r2_score(y_test, ypred):.3f}",
    f"RMSE: {rmse:.3f}",
    f"MAE: {mae:.3f}",
    sep="\n",
)

R² Score: 0.116
RMSE: 1.094
MAE: 0.353
