In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import pickle

In [2]:
# Absolute path of the input CSV; it points under /content, so adjust it when the
# file lives elsewhere.
file_path = '/content/Life Expectancy Data.csv'

# Read the whole CSV into a DataFrame that the following cells clean and model.
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Impute missing values with the per-column mean, working on numeric columns only.
# If the CSV holds any text columns, a bare `data.mean()` raises TypeError on
# pandas >= 2.0, and the z-score filter and min-max scaling in later cells need an
# all-numeric frame anyway. When every column is already numeric this selects
# all of them, so the result is unchanged.
numeric_data = data.select_dtypes(include='number')
data_filled = numeric_data.fillna(numeric_data.mean())

In [4]:
# A row is kept only if every one of its columns has an absolute z-score
# strictly below this cut-off.
threshold = 3

# Column-wise z-scores of the imputed data, taken in absolute value.
z_scores = np.abs(stats.zscore(data_filled))

# Boolean row mask: True where all columns of the row are under the cut-off.
rows_within_threshold = (z_scores < threshold).all(axis=1)
data_no_outliers = data_filled.loc[rows_within_threshold]

In [12]:
# Scaler that maps each feature column onto the [0, 1] interval (the library default,
# spelled out here so the target range is visible to the reader).
scaler = MinMaxScaler(feature_range=(0, 1))

In [22]:
# Show the column labels of the outlier-filtered frame; the recorded output shows
# that several labels carry leading/trailing spaces (e.g. ' BMI ', 'Measles ').
print(data_no_outliers.columns)

Index(['Year', 'Life expectancy ', 'Adult Mortality', 'infant deaths',
       'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
       'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ',
       ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years',
       ' thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')


In [23]:
# Trim leading/trailing whitespace from every column label so that columns can be
# addressed by their clean names (e.g. 'Life expectancy' rather than 'Life expectancy ').
# Whitespace inside a label is left as it is.
data_no_outliers = data_no_outliers.rename(columns=str.strip)

In [24]:
# Min-max scale every predictor column; the target column is excluded from the
# feature matrix.
# NOTE(review): the scaler is fitted on all rows before the train/test split that
# happens in a later cell, so the test rows influence the scaling ranges. Consider
# fitting on the training split only and applying `transform` to the test split.
if 'Life expectancy' not in data_no_outliers.columns:
    # Fail right here with a clear message: merely printing would leave
    # `scaled_data` undefined and the notebook would break later in a cell that
    # looks unrelated to the real cause.
    raise KeyError("'Life expectancy' column not found!")

scaled_data = scaler.fit_transform(data_no_outliers.drop(columns=['Life expectancy']))

In [25]:
# Regression target: the (unscaled) 'Life expectancy' column of the filtered frame.
y = data_no_outliers['Life expectancy']

In [26]:
# Feature matrix: the min-max scaled predictors produced by `scaler.fit_transform`
# (every column of the filtered frame except 'Life expectancy').
X = scaled_data

In [27]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Linear regression estimator with default settings; it is fitted on all
# predictor columns at once in the next cell.
model_multiple = LinearRegression()

In [30]:
# Fit the regression on the training split only; the test split stays unseen
# until evaluation.
model_multiple.fit(X=X_train, y=y_train)

In [31]:
# Predictions of the fitted model for the held-out test rows.
y_pred_multiple = model_multiple.predict(X=X_test)

In [34]:
# Score the test-set predictions: mean squared error and R² against the true targets.
mse_multiple, r2_multiple = (
    mean_squared_error(y_test, y_pred_multiple),
    r2_score(y_test, y_pred_multiple),
)

# Report the metrics (rounded to four decimals) followed by the fitted parameters.
print("\nMultiple Linear Regression")
print(f"  Mean Squared Error: {mse_multiple:.4f}")
print(f"  R² Score: {r2_multiple:.4f}")
print("Intercept:", model_multiple.intercept_)
print(f"  Coefficients: {model_multiple.coef_}")


Multiple Linear Regression
  Mean Squared Error: 11.4483
  R² Score: 0.8483
Intercept: 56.97078862716718
  Coefficients: [  1.06612211  -8.33471703  27.81201012   0.38618859   2.47817654
  -2.81604449   0.05806632  -0.37302702 -30.40114899   2.7187973
   2.6584802    5.15746618 -11.30306909  -0.99778646   1.40731999
   2.08159109  -4.0180467   22.54295921  -4.78988747]


In [36]:
# Persist the fitted regression model (`pickle` is already imported in the first cell).
with open('multiple_linear_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model_multiple, model_file)

# The model was trained on min-max scaled features, so any new input has to go
# through the same scaler before prediction. Store the scaler next to the model;
# without it the saved model cannot be applied to raw data.
with open('multiple_linear_regression_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [37]:
# Read the model back from disk to confirm that the saved artifact is usable.
# NOTE: unpickling executes code embedded in the file; this is acceptable here only
# because the file was written by this notebook. Never unpickle untrusted files.
with open('multiple_linear_regression_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [38]:
# Check the reloaded model on the held-out test rows. The predictions must come
# from `loaded_model` itself (not from the in-memory model) and from `X_test`
# (not from the first rows of the full matrix, which include training rows);
# otherwise the table says nothing about the saved artifact.
sample_predictions = loaded_model.predict(X_test)

# Side-by-side comparison of true and predicted targets with the signed error
# (actual minus predicted).
sample_data = pd.DataFrame({
    'Actual Life expectancy': y_test.values,
    'Predicted Life expectancy': sample_predictions,
    'Difference': y_test.values - sample_predictions
})

In [39]:
# Print the actual-vs-predicted comparison table; pandas abbreviates long frames,
# so only the first and last rows appear in the output.
print(sample_data)

     Actual Life expectancy  Predicted Life expectancy  Difference
0                      71.8                  73.367738   -1.567738
1                      79.1                  79.196547   -0.096547
2                      76.8                  73.026817    3.773183
3                      68.6                  68.810416   -0.210416
4                      78.2                  73.708370    4.491630
..                      ...                        ...         ...
432                    70.0                  73.546446   -3.546446
433                    81.8                  82.322673   -0.522673
434                    69.9                  71.645257   -1.745257
435                    75.0                  75.828193   -0.828193
436                    49.7                  59.417602   -9.717602

[437 rows x 3 columns]
