In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import pickle
df = pd.read_csv('Life Expectancy Data.csv')

In [32]:
print(df.head())
df.shape
df.info()

       Country  Year      Status  Life expectancy   Adult Mortality  \
0  Afghanistan  2015  Developing              65.0            263.0   
1  Afghanistan  2014  Developing              59.9            271.0   
2  Afghanistan  2013  Developing              59.9            268.0   
3  Afghanistan  2012  Developing              59.5            272.0   
4  Afghanistan  2011  Developing              59.2            275.0   

   infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   ...  \
0             62     0.01               71.279624         65.0      1154  ...   
1             64     0.01               73.523582         62.0       492  ...   
2             66     0.01               73.219243         64.0       430  ...   
3             69     0.01               78.184215         67.0      2787  ...   
4             71     0.01                7.097109         68.0      3013  ...   

   Polio  Total expenditure  Diphtheria    HIV/AIDS         GDP  Population  \
0    6.

In [33]:
#Check for missing values
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
print("Missing Values:\n", missing_values)

Missing Values:
 Life expectancy                     10
Adult Mortality                     10
Alcohol                            194
Hepatitis B                        553
 BMI                                34
Polio                               19
Total expenditure                  226
Diphtheria                          19
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64


In [34]:
#Verify column names
print("Column Names:\n", df.columns)

Column Names:
 Index(['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')


In [35]:
#Fix missing values by adding in median
df.columns = df.columns.str.strip()

num_cols_with_missing = [
    'Life expectancy', 'Adult Mortality', 'Alcohol', 'Hepatitis B', 'BMI',
    'Polio', 'Total expenditure', 'Diphtheria', 'GDP', 'Population',
    'thinness  1-19 years', 'thinness 5-9 years',
    'Income composition of resources', 'Schooling'
]

for col in num_cols_with_missing:
    median = df[col].median()
    df[col] = df[col].fillna(median)

print("Missing Values After Filling with Median:\n", df.isnull().sum())

Missing Values After Filling with Median:
 Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
BMI                                0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
HIV/AIDS                           0
GDP                                0
Population                         0
thinness  1-19 years               0
thinness 5-9 years                 0
Income composition of resources    0
Schooling                          0
dtype: int64


In [36]:
#Handle Duplicates
df = df.drop_duplicates()

print("Number of duplicate rows removed:", df.duplicated().sum())
print("Shape of dataset after removing duplicates:", df.shape)

Number of duplicate rows removed: 0
Shape of dataset after removing duplicates: (2938, 22)


In [37]:
# Identify categorical columns
categorical_cols = ['Country', 'Status']

for col in categorical_cols:
    unique_values = df[col].unique()
    print(f"Unique values in '{col}':\n", unique_values)

Unique values in 'Country':
 ['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Antigua and Barbuda'
 'Argentina' 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas'
 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
 'Bhutan' 'Bolivia (Plurinational State of)' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso'
 'Burundi' "Côte d'Ivoire" 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros'
 'Congo' 'Cook Islands' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czechia'
 "Democratic People's Republic of Korea"
 'Democratic Republic of the Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji' 'Finland' 'France' 'Gabon' 'Gambia'
 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea'
 'Guinea-Bissau' 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland' 'India'
 'Indonesia'

In [38]:
#Transform categorical variables into numeric
df.columns = df.columns.str.strip()

df = pd.get_dummies(df, columns=['Country', 'Status'], drop_first=True)

print("Transformed Dataset:\n", df.head())

Transformed Dataset:
    Year  Life expectancy  Adult Mortality  infant deaths  Alcohol  \
0  2015             65.0            263.0             62     0.01   
1  2014             59.9            271.0             64     0.01   
2  2013             59.9            268.0             66     0.01   
3  2012             59.5            272.0             69     0.01   
4  2011             59.2            275.0             71     0.01   

   percentage expenditure  Hepatitis B  Measles   BMI  under-five deaths  ...  \
0               71.279624         65.0     1154  19.1                 83  ...   
1               73.523582         62.0      492  18.6                 86  ...   
2               73.219243         64.0      430  18.1                 89  ...   
3               78.184215         67.0     2787  17.6                 93  ...   
4                7.097109         68.0     3013  17.2                 97  ...   

   Country_United States of America  Country_Uruguay  Country_Uzbekistan  \


In [39]:
#ANOVA test to find significant features
df.columns = df.columns.str.strip()

numeric_cols = [
    'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 
    'Hepatitis B', 'Measles', 'BMI', 'under-five deaths', 'Polio', 
    'Total expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 
    'thinness  1-19 years', 'thinness 5-9 years', 'Income composition of resources', 
    'Schooling'
]

anova_results = {}
for col in numeric_cols:
    f_val, p_val = stats.f_oneway(df['Life expectancy'], df[col])
    anova_results[col] = p_val

significant_features = [col for col in numeric_cols if anova_results[col] < 0.05]
print("Significant features based on ANOVA test:", significant_features)

Significant features based on ANOVA test: ['Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness  1-19 years', 'thinness 5-9 years', 'Income composition of resources', 'Schooling']


In [40]:
#Feature Scaling
features_to_scale = [
    'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 
    'Hepatitis B', 'Measles', 'BMI', 'under-five deaths', 'Polio', 
    'Total expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 
    'thinness  1-19 years', 'thinness 5-9 years', 
    'Income composition of resources', 'Schooling'
]

scaler = StandardScaler()

df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

print(df[features_to_scale].head())

   Adult Mortality  infant deaths   Alcohol  percentage expenditure  \
0         0.792119       0.268824 -1.156989               -0.335570   
1         0.856601       0.285786 -1.156989               -0.334441   
2         0.832421       0.302749 -1.156989               -0.334594   
3         0.864662       0.328193 -1.156989               -0.332096   
4         0.888843       0.345155 -1.156989               -0.367862   

   Hepatitis B   Measles       BMI  under-five deaths     Polio  \
0    -0.783807 -0.110384 -0.967349           0.255359 -3.279423   
1    -0.914281 -0.168124 -0.992434           0.274060 -1.053699   
2    -0.827298 -0.173531 -1.017519           0.292761 -0.882489   
3    -0.696824  0.032045 -1.042605           0.317696 -0.668478   
4    -0.653333  0.051757 -1.062673           0.342631 -0.625675   

   Total expenditure  Diphtheria  HIV/AIDS       GDP  Population  \
0           0.931485   -0.735391 -0.323445 -0.453371    0.435183   
1           0.939818   -0.862233 -

In [41]:
#Linear Regression
df = pd.read_csv('Life Expectancy Data.csv')

df.columns = df.columns.str.strip()

num_cols_with_missing = [
    'Life expectancy', 'Adult Mortality', 'Alcohol', 'Hepatitis B', 'BMI',
    'Polio', 'Total expenditure', 'Diphtheria', 'GDP', 'Population',
    'thinness  1-19 years', 'thinness 5-9 years',
    'Income composition of resources', 'Schooling'
]

df['Life expectancy'] = df['Life expectancy'].fillna(df['Life expectancy'].median())
for col in num_cols_with_missing:
    df[col] = df[col].fillna(df[col].median())

df = pd.get_dummies(df, columns=['Country', 'Status'], drop_first=True)

features = [
    'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure',
    'Hepatitis B', 'Measles', 'BMI', 'under-five deaths', 'Polio',
    'Total expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population',
    'thinness  1-19 years', 'thinness 5-9 years',
    'Income composition of resources', 'Schooling'
]

features = [feature.strip() for feature in features]
X = df[features]
y = df['Life expectancy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("Linear Regression R2:", r2)

pickle.dump(model, open('model.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))


Linear Regression R2: 0.8221111322545136


In [42]:
#Random Forest
df.columns = df.columns.str.strip()

num_cols_with_missing = [
    'Life expectancy', 'Adult Mortality', 'Alcohol', 'Hepatitis B', 'BMI',
    'Polio', 'Total expenditure', 'Diphtheria', 'GDP', 'Population',
    'thinness  1-19 years', 'thinness 5-9 years',
    'Income composition of resources', 'Schooling'
]

df['Life expectancy'] = df['Life expectancy'].fillna(df['Life expectancy'].median())
for col in num_cols_with_missing:
    df[col] = df[col].fillna(df[col].median())

features = [
    'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure',
    'Hepatitis B', 'Measles', 'BMI', 'under-five deaths', 'Polio',
    'Total expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population',
    'thinness  1-19 years', 'thinness 5-9 years',
    'Income composition of resources', 'Schooling'
]

features = [feature.strip() for feature in features]
X = df[features]
y = df['Life expectancy']

scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f"Random Forest R2 Score: {r2_score(y_test, y_pred):.2f}")


pickle.dump(model, open('model.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))

Random Forest R2 Score: 0.97


In [43]:
#Testing with real world examples.
Adult_Mortality = 263
infant_deaths = 62
Alcohol = 0.01
percentage_expenditure = 71.27962362
Hepatitis_B = 65
Measles = 1154
BMI = 19.1
under_five_deaths = 83
Polio = 6
Total_expenditure = 8.16
Diphtheria = 65
HIV_AIDS = 0.1
GDP = 584.25921
Population = 33736494
thinness_1_19_years = 17.2
thinness_5_9_years = 17.3
Income_composition_of_resources = 0.479
Schooling = 10.1

CLF_model = pickle.load(open('model.pkl', "rb"))
scaler = pickle.load(open('scaler.pkl', 'rb'))

feature_names = [
    'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure',
    'Hepatitis B', 'Measles', 'BMI', 'under-five deaths', 'Polio',
    'Total expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population',
    'thinness  1-19 years', 'thinness 5-9 years',
    'Income composition of resources', 'Schooling'
]

input_features = pd.DataFrame([[
    Adult_Mortality, infant_deaths, Alcohol, percentage_expenditure, Hepatitis_B,
    Measles, BMI, under_five_deaths, Polio, Total_expenditure, Diphtheria,
    HIV_AIDS, GDP, Population, thinness_1_19_years, thinness_5_9_years,
    Income_composition_of_resources, Schooling
]], columns=feature_names)

scaled_features = scaler.transform(input_features)

result = CLF_model.predict(scaled_features)
print(result)

[63.954]
