# Assignment 2 - Linear Regression

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [27]:
# Load the data
data = pd.read_csv('life_expectancy.csv')

# Define the numerical features
numeric_features = ['Life expectancy ', 'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']

# Apply log transformation to reduce skewness (only to positive data)
data_log_transformed = data.copy()
for column in numeric_features:
    data_log_transformed[column] = np.log1p(data[column])  # log1p is log(1 + x) to handle zero values

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the log-transformed features
data_scaled = data_log_transformed.copy()
data_scaled[numeric_features] = scaler.fit_transform(data_log_transformed[numeric_features])

# Define different sets of features for each model
features_model_1 = ['Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', ' BMI ']
features_model_2 = ['under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS']
features_model_3 = ['GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']

target = 'Life expectancy '

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_scaled, data_scaled[target], test_size=0.2, random_state=42)

# Function to train and evaluate a model
def train_and_evaluate(features):
    X_train_subset = X_train[features]
    X_test_subset = X_test[features]
    
    # Train the linear regression model
    model = LinearRegression()
    model.fit(X_train_subset, y_train)
    
    # Predict the values
    y_train_pred = model.predict(X_train_subset)
    y_test_pred = model.predict(X_test_subset)
    
    # Calculate performance metrics
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    return train_mse, test_mse, train_r2, test_r2

# Train and evaluate each model
train_mse_1, test_mse_1, train_r2_1, test_r2_1 = train_and_evaluate(features_model_1)
train_mse_2, test_mse_2, train_r2_2, test_r2_2 = train_and_evaluate(features_model_2)
train_mse_3, test_mse_3, train_r2_3, test_r2_3 = train_and_evaluate(features_model_3)

# Print the performance metrics for each model
print("Model 1 Performance:")
print(f"Training MSE: {train_mse_1:.4f}")
print(f"Testing MSE: {test_mse_1:.4f}")
print(f"Training R²: {train_r2_1:.4f}")
print(f"Testing R²: {test_r2_1:.4f}\n")

print("Model 2 Performance:")
print(f"Training MSE: {train_mse_2:.4f}")
print(f"Testing MSE: {test_mse_2:.4f}")
print(f"Training R²: {train_r2_2:.4f}")
print(f"Testing R²: {test_r2_2:.4f}\n")

print("Model 3 Performance:")
print(f"Training MSE: {train_mse_3:.4f}")
print(f"Testing MSE: {test_mse_3:.4f}")
print(f"Training R²: {train_r2_3:.4f}")
print(f"Testing R²: {test_r2_3:.4f}\n")

Model 1 Performance:
Training MSE: 0.5267
Testing MSE: 0.4647
Training R²: 0.4847
Testing R²: 0.4865

Model 2 Performance:
Training MSE: 0.2835
Testing MSE: 0.2656
Training R²: 0.7227
Testing R²: 0.7065

Model 3 Performance:
Training MSE: 0.4383
Testing MSE: 0.3751
Training R²: 0.5712
Testing R²: 0.5855



##### Outliers

I chose not to remove outliers from the data. When comparing the summary statistics such as mean, median, and mode, I found that removing outliers did not significantly impact the data. There were very small changes in the skewness of the data, but not enough to consider the outliers significant. 

Some summary statistics before and after cleaning:

1.  Column:  Adult Mortality
    Mean:  168.2152819890843
    Mean:  168.5523576240049
    Median:  148.0
    Median:  149.0
    Standard Deviation:  125.31041693156168
    Standard Deviation:  125.36163808096406
    Variance:  15702.70059156182
    Variance:  15715.540302342617
    Skewness:  1.2764291138227022
    Skewness:  1.2798923849971515

2.  Column:  infant deaths
    Mean:  32.55306246209824
    Mean:  32.76607470912431
    Median:  3.0
    Median:  3.0
    Standard Deviation:  120.8471904963888
    Standard Deviation:  121.40486532202739
    Variance:  14604.043450870486
    Variance:  14739.141323859609
    Skewness:  8.477369411660424
    Skewness:  8.438009114572576
    
    As you can see, none of the summary statistic greatly changed from cleaning out outliers.

##### Cetering and Scaling

I chose to center and scale the data in this csv. Scaling data is very important for training a linear regression model. Some of the columns such as "Measles" and "Population" have very large values compared to some of the other columns in the CSV. These columns must be scaled down before traning the model.

##### Transformation

I decided to log transform the data to help combat one of the linear regression statistical assumption violations. This data is no linear and log transformations can help to correct this. 

##### Testing the Model

My model scored pretty well in its tests. Its R^2 score was good and its MSE was also good. This means that my model fits to the data fairly well and is not overfitting. 

##### Comparing models
Model 2 performed the best with the lowest MSE value and highest R^2 value. 
