**Task 1: Load the dataset, do basic data pre-processing, and split the dataset.**

In [1]:
import pandas as pd

# Read the Life Expectancy CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/Life Expectancy.csv')

# Preview the top five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,Life expectancy,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,65.0,Developing,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,59.9,Developing,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,59.9,Developing,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,59.5,Developing,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,59.2,Developing,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [2]:
# Describing dataset characteristics: per-column statistics, schema report,
# dimensions and missing-value counts.
import io

data_description = data.describe(include='all')

# DataFrame.info() writes its report to a stream and returns None, so assigning
# its return value would leave `data_info` empty. Capture the report in a buffer
# to keep the text, then print it so the cell still displays the schema.
_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
data_info = _info_buffer.getvalue()
print(data_info, end='')

data_shape = data.shape
missing_values = data.isnull().sum()

data_description, data_shape, missing_values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2928 entries, 0 to 2927
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Life expectancy                  2928 non-null   float64
 1   Status                           2928 non-null   object 
 2   Adult Mortality                  2928 non-null   float64
 3   infant deaths                    2928 non-null   int64  
 4   Alcohol                          2928 non-null   float64
 5   percentage expenditure           2928 non-null   float64
 6   Hepatitis B                      2909 non-null   float64
 7   Measles                          2928 non-null   int64  
 8   BMI                              2928 non-null   float64
 9   under-five deaths                2928 non-null   int64  
 10  Polio                            2909 non-null   float64
 11  Total expenditure                2928 non-null   float64
 12  Diphtheria          

(        Life expectancy      Status  Adult Mortality  infant deaths  \
 count       2928.000000        2928      2928.000000    2928.000000   
 unique              NaN           2              NaN            NaN   
 top                 NaN  Developing              NaN            NaN   
 freq                NaN        2416              NaN            NaN   
 mean          69.224932         NaN       164.796448      30.407445   
 std            9.523867         NaN       124.292079     118.114450   
 min           36.300000         NaN         1.000000       0.000000   
 25%           63.100000         NaN        74.000000       0.000000   
 50%           72.100000         NaN       144.000000       3.000000   
 75%           75.700000         NaN       228.000000      22.000000   
 max           89.000000         NaN       723.000000    1800.000000   
 
             Alcohol  percentage expenditure  Hepatitis B        Measles  \
 count   2928.000000             2928.000000  2909.000000 

In [3]:
# Handling missing values by replacing them with the median of the respective columns.
# The filled columns are assigned back to `data` rather than calling
# fillna(inplace=True) on a column selection: the chained in-place form acts on
# an intermediate object, raises a FutureWarning in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled.
# NOTE(review): the medians are computed over the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# imputed values.
columns_to_impute = ['Hepatitis B', 'Polio', 'Diphtheria']
data[columns_to_impute] = data[columns_to_impute].fillna(data[columns_to_impute].median())

# Handling categorical variables using one-hot encoding.
# drop_first=True drops the first category's indicator column so the remaining
# dummies are not redundant. The guard makes the cell safe to re-run: after the
# first run `Status` has already been replaced by its dummy column.
if 'Status' in data.columns:
    data = pd.get_dummies(data, columns=['Status'], drop_first=True)

# Confirming changes
data.head()

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Status_Developing
0,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,1
1,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,1
2,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,1
3,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,1
4,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,1


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Separate the regression target from the predictor columns.
y = data['Life expectancy']
X = data.drop(columns=['Life expectancy'])

# Hold out 10% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

# Min-max scale the features: the scaler learns its ranges from the training
# rows only, and the same transformation is then applied to the test rows.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


**Task 2: Train and evaluate the two regression models**

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# Create both regressors with their default settings: a linear regression
# model and a support vector regressor.
linear_reg, svm_reg = LinearRegression(), SVR()

linear_reg, svm_reg

(LinearRegression(), SVR())

In [6]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validation of each model on the scaled training data.
# No `scoring` argument is passed, so each estimator's default scorer is used.
linear_reg_scores = cross_val_score(estimator=linear_reg, X=X_train_scaled, y=y_train, cv=5)
svm_reg_scores = cross_val_score(estimator=svm_reg, X=X_train_scaled, y=y_train, cv=5)

# Average the per-fold scores of each model.
linear_reg_avg_score = linear_reg_scores.mean()
svm_reg_avg_score = svm_reg_scores.mean()

linear_reg_avg_score, svm_reg_avg_score


(0.8299680729241388, 0.8504614494034953)

In [7]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored for the support vector regressor.
svm_params = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Exhaustive search over the grid, evaluating every combination with
# 5-fold cross-validation on the scaled training data.
svm_grid_search = GridSearchCV(estimator=svm_reg, param_grid=svm_params, cv=5).fit(
    X_train_scaled, y_train
)

# Winning parameter combination and its cross-validated score.
svm_best_params, svm_best_score = (
    svm_grid_search.best_params_,
    svm_grid_search.best_score_,
)

svm_best_params, svm_best_score


({'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}, 0.9258131853759576)

In [8]:
# Training and evaluating Linear Regression on the test set:
# fit on the scaled training data, then score on the held-out scaled test data.
linear_reg.fit(X_train_scaled, y_train)
linear_reg_test_score = linear_reg.score(X_test_scaled, y_test)

# Training and evaluating SVM Regression with the best parameters on the test set.
# The estimator is built from `svm_best_params` (the grid-search result) rather
# than from hand-copied literals, so it cannot drift out of sync with the search
# if the data, the parameter grid or the split ever change.
optimized_svm_reg = SVR(**svm_best_params)
optimized_svm_reg.fit(X_train_scaled, y_train)
svm_reg_test_score = optimized_svm_reg.score(X_test_scaled, y_test)

linear_reg_test_score, svm_reg_test_score


(0.8617610564538393, 0.9535974600051537)