# Imports

In [None]:
import numpy as np  # For array handling
import matplotlib.pyplot as plt  # For fig plotting
import pandas as pd  # For data frames handling
pd.set_option('display.max_columns', None)  # For showing all the columns
from sklearn.linear_model import LinearRegression  # For the regression model
from sklearn.metrics import mean_squared_error, r2_score  # For metrics calculations

# Data Loading

In [None]:
Training = pd.read_csv('Dataset/train_clean.csv')  # Training data
Testing = pd.read_csv('Dataset/test_clean.csv')  # Testing data

In [None]:
Training.head()  # Show training data header

# Prediction Function

In [None]:
# Prediction function (linear regression model)
def predict(x, w, b):
    """Evaluate the linear model ``x . w + b``.

    Args:
        x: Input value(s); passed to ``np.dot`` as the first operand.
        w: Weight(s); passed to ``np.dot`` as the second operand.
        b: Bias added to the dot product.

    Returns:
        The result of ``np.dot(x, w)`` with ``b`` added.
    """
    weighted = np.dot(x, w)
    return weighted + b

# Cost Function

In [None]:
# Calculate MSE (without division by 2, to match the sklearn function)
def cost_function(x, w, b, y):
    """Return the mean squared error of the linear model on ``(x, y)``.

    Args:
        x: Training inputs; ``len(x)`` is used as the number of examples.
        w: Model weight(s), forwarded to ``predict``.
        b: Model bias, forwarded to ``predict``.
        y: Target values, subtracted element-wise from the predictions.

    Returns:
        ``(1 / m) * sum((y_hat - y) ** 2)`` where ``m = len(x)``.
    """
    m = len(x)  # Number of training examples
    y_hat = predict(x, w, b)  # Model prediction
    # `** 2` is used instead of `np.pow`, which only exists in NumPy >= 2.0
    # and raises AttributeError on older NumPy versions.
    return (1 / m) * np.sum((y_hat - y) ** 2)  # MSE formula

# Gradient Derivative

In [None]:
# Compute the parameter derivatives used to update the model during training
def gradient_derivatives(x, w, b, y):
    """Return the derivatives ``(dw, db)`` for the current parameters.

    Both values are reduced with ``np.sum`` over every element, so each is a
    single scalar (one weight, one bias).

    Note: the expressions carry no factor of 2, so they are half the exact
    gradient of the MSE computed by ``cost_function`` (which does not divide
    by 2). The direction is unchanged; only the step size per unit of
    learning rate differs.

    Args:
        x: Training inputs; ``len(x)`` is used as the number of examples.
        w: Current weight, forwarded to ``predict``.
        b: Current bias, forwarded to ``predict``.
        y: Target values.

    Returns:
        Tuple ``(dw, db)``: derivative w.r.t. the weight and w.r.t. the bias.
    """
    n_samples = len(x)
    residual = predict(x, w, b) - y  # prediction error per example
    grad_w = (1 / n_samples) * np.sum(residual * x)  # derivative wrt w (regression coeff.)
    grad_b = (1 / n_samples) * np.sum(residual)  # derivative wrt b (bias)
    return grad_w, grad_b

# Gradient Descent

In [None]:
# Update model parameters during training to reach convergence
def gradient_descent(x, w, b, y, lr, num_iters):
    """Run ``num_iters`` gradient-descent steps and return the fitted parameters.

    Each iteration computes the derivatives with ``gradient_derivatives``,
    steps the parameters against them scaled by ``lr``, then records and
    prints the cost returned by ``cost_function``.

    Args:
        x: Training inputs.
        w: Initial weight.
        b: Initial bias.
        y: Target values.
        lr: Learning rate (step-size multiplier).
        num_iters: Number of update iterations to perform.

    Returns:
        Tuple ``(w, b, cost_history)`` where ``cost_history`` is a list with
        one cost value per iteration (empty when ``num_iters`` is 0).
    """
    cost_history = []  # store cost history for plotting
    for i in range(num_iters):
        dw, db = gradient_derivatives(x, w, b, y)
        # Rebind rather than use `-=`: an in-place update would silently
        # modify a NumPy array passed in by the caller (and would fail for
        # integer arrays). For plain floats the result is identical.
        w = w - lr * dw  # parameter update
        b = b - lr * db  # parameter update

        cost = cost_function(x, w, b, y)  # Compute the cost
        cost_history.append(cost)  # Add the cost to the history
        print(f"Cost at iteration {i} = {cost}")

    return w, b, cost_history


# Train the Model (Regression from scratch)

In [None]:
# ==========================================
# 1. HELPER FUNCTIONS
# ==========================================
def compute_mse(y_true, y_pred):
    """Return the mean of the squared element-wise differences ``y_true - y_pred``."""
    error = y_true - y_pred
    return np.mean(error * error)

def compute_r2_score(y_true, y_pred):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as ``1 - ss_res / ss_tot`` where ``ss_res`` is the sum of squared
    residuals and ``ss_tot`` is the sum of squared deviations of ``y_true``
    from its mean.

    When ``y_true`` has zero variance the ratio is undefined; in that case
    1.0 is returned for perfect predictions and 0.0 otherwise, which is the
    same convention sklearn's ``r2_score`` uses by default.

    Args:
        y_true: Array of observed target values.
        y_pred: Array of predicted values, same shape as ``y_true``.

    Returns:
        The R^2 score as a float.
    """
    ss_res = np.sum((y_true - y_pred) ** 2)  # Sum of squared residuals (errors)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)  # Total sum of squares (variance from the mean)

    # Avoid division by zero if the target variance is 0
    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0

    return 1 - (ss_res / ss_tot)

# ==========================================
# 2. DATA SETUP
# ==========================================
target_col = "Life_expectancy"

# Every column other than the target is evaluated as its own one-feature model
feature_columns = [col for col in Training.columns if col != target_col]

# Targets as NumPy arrays (extracted from the pandas data frames)
y_train = Training[target_col].values
y_test = Testing[target_col].values

# Hyperparameters
lr = 0.01  # learning rate
num_iters = 2000  # number of gradient-descent iterations

results_list = []  # one metrics row (dict) per feature

print("Starting evaluation (regression from scratch)")

# ==========================================
# 3. EVALUATION LOOP
# ==========================================
for feature_name in feature_columns:

    # --- A. PREPARE DATA ---
    X_train_feat = Training[feature_name].values
    X_test_feat = Testing[feature_name].values

    # --- B. TRAIN (Gradient Descent) ---
    # Start every feature from w = b = 0 so the comparison is unbiased
    w_init = 0.0
    b_init = 0.0
    w_final, b_final, cost_history = gradient_descent(
        X_train_feat, w_init, b_init, y_train, lr, num_iters
    )

    # --- C. TEST (Prediction & Metrics) ---
    y_pred_test = predict(X_test_feat, w_final, b_final)
    test_mse = compute_mse(y_test, y_pred_test)
    test_r2 = compute_r2_score(y_test, y_pred_test)

    # --- D. SAVE RESULTS ---
    results_list.append(dict(
        Feature=feature_name,
        Weight_w=w_final,                 # The weight (slope)
        Bias_b=b_final,                   # The bias (intercept)
        Train_MSE_Cost=cost_history[-1],  # Cost after the last training iteration
        Test_MSE_Cost=test_mse,           # Error on the testing data
        Test_R2_Score=test_r2,            # R^2 on the testing data
    ))

# ==========================================
# 4. SAVE TO CSV
# ==========================================
filename = "feature_performance_metrics.csv"

# Best-performing features (highest test R^2) come first
results_df = pd.DataFrame(results_list).sort_values(by="Test_R2_Score", ascending=False)
results_df.to_csv(filename, index=False)

print(f"Success! Analysis saved to '{filename}'.")

# Train the Model (Regression using sklearn)

In [None]:
# ==========================================
# 1. DATA SETUP
# ==========================================
target_col = "Life_expectancy"

# Targets as NumPy arrays
y_train = Training[target_col].values
y_test = Testing[target_col].values

# Every column other than the target is evaluated as its own one-feature model
feature_columns = [col for col in Training.columns if col != target_col]

results_list = []  # one metrics row (dict) per feature

print("Starting evaluation (regression using built in functions)")

# ==========================================
# 2. EVALUATION LOOP
# ==========================================
for feature_name in feature_columns:

    # --- A. PREPARE DATA ---
    # sklearn expects a 2D (n_samples, n_features) array, so select the
    # feature as a one-column frame instead of a 1D series.
    X_train_feat = Training[[feature_name]].values
    X_test_feat = Testing[[feature_name]].values

    # --- B. TRAIN (Linear Regression) ---
    # Create the model and fit it on the training data
    model = LinearRegression().fit(X_train_feat, y_train)

    w_final = model.coef_[0]    # The weight (slope)
    b_final = model.intercept_  # The bias (intercept)

    # --- C. TEST (Prediction & Metrics) ---
    y_pred_train = model.predict(X_train_feat)  # Training predictions, for the train MSE
    y_pred_test = model.predict(X_test_feat)    # Testing predictions, for the test metrics

    # Built-in sklearn metrics
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_r2 = r2_score(y_test, y_pred_test)

    # --- D. SAVE RESULTS ---
    results_list.append(dict(
        Feature=feature_name,
        Weight_w=w_final,
        Bias_b=b_final,
        Train_MSE_Cost=train_mse,
        Test_MSE_Cost=test_mse,
        Test_R2_Score=test_r2,
    ))

# ==========================================
# 3. SAVE TO CSV
# ==========================================
filename = "feature_performance_sklearn.csv"

# Same ordering as the from-scratch results: highest test R^2 first
results_df = pd.DataFrame(results_list).sort_values(by="Test_R2_Score", ascending=False)
results_df.to_csv(filename, index=False)

print(f"Success! Analysis saved to '{filename}'.")

# Generate figures to compare the results

In [None]:
# 1. Load the two result files, indexed by feature name for .loc lookups
manual_df = pd.read_csv("feature_performance_metrics.csv").set_index("Feature")
sklearn_df = pd.read_csv("feature_performance_sklearn.csv").set_index("Feature")

# The target values are the same for every plot, so extract them once
y = Training["Life_expectancy"].values

# 2. Loop through every feature
print("Generating plots...")

for feature_name in manual_df.index:

    # --- A. PREPARE DATA ---
    X = Training[feature_name].values

    # --- B. GET WEIGHTS & BIAS ---
    # Manual model (R² values come from the "Test_R2_Score" column, i.e. they
    # are test-set scores even though the scatter below shows training data)
    w_man = manual_df.loc[feature_name, "Weight_w"]
    b_man = manual_df.loc[feature_name, "Bias_b"]
    r2_man = manual_df.loc[feature_name, "Test_R2_Score"]

    # Sklearn model
    w_sk = sklearn_df.loc[feature_name, "Weight_w"]
    b_sk = sklearn_df.loc[feature_name, "Bias_b"]
    r2_sk = sklearn_df.loc[feature_name, "Test_R2_Score"]

    # --- C. CALCULATE LINES ---
    # 100 evenly spaced X values spanning the observed feature range
    x_line = np.linspace(X.min(), X.max(), 100)

    y_line_man = w_man * x_line + b_man
    y_line_sk = w_sk * x_line + b_sk

    # --- D. PLOT ---
    fig = plt.figure(figsize=(10, 6))

    # 1. Scatter plot of the actual data
    plt.scatter(X, y, color='lightgray', label='Training Data', alpha=0.6)

    # 2. Manual regression line
    plt.plot(x_line, y_line_man, color='blue', linewidth=3, alpha=0.7, label=f'Manual Model (R²={r2_man:.2f})')

    # 3. Sklearn regression line (dashed so it stays visible when the lines overlap)
    plt.plot(x_line, y_line_sk, color='red', linestyle='--', linewidth=2, label=f'Sklearn Model (R²={r2_sk:.2f})')

    # Formatting
    plt.title(f"Regression Comparison: {feature_name}", fontsize=14)
    plt.xlabel(feature_name, fontsize=12)
    plt.ylabel("Life Expectancy", fontsize=12)
    plt.legend()
    plt.grid(True, alpha=0.3)

    # --- E. SAVE ---
    # Clean filename: keep letters, digits, '-', '_' and '.'; replace anything
    # else (spaces, path separators, ...) with '_' so the feature name cannot
    # produce an invalid or nested path. Already-clean names are unchanged.
    safe_name = "".join(
        ch if ch.isalnum() or ch in "-_." else "_" for ch in str(feature_name)
    )
    plt.savefig(f"plot_{safe_name}.png")
    plt.close(fig)  # Release the figure so many iterations do not exhaust memory

print("Success! Plots have been saved for all features.")