# Imports

In [None]:

import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns

# ML tools 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

# General display options
pd.set_option("display.max_columns", None)    # show every column; by default pandas cuts off columns if there are too many
pd.set_option("display.precision", 3)         # converts 0.12345678 → 0.123 to avoid table congestion


# Plot style
sns.set(style="whitegrid")                    # controls plot visual style ("whitegrid" theme)





# Dataset Loading

In [None]:
# Absolute path of the CSV file (a Kaggle input directory path).
DATA_PATH = "/kaggle/input/student-performance-multiple-linear-regression/Student_Performance.csv"

# Read the whole file into a DataFrame; later cells use `df`.
df = pd.read_csv(DATA_PATH)

print("Dataset path:", DATA_PATH)
print("Dataset shape:", df.shape)

# Last expression of the cell: the first five rows.
df.head()

# Missing values

In [None]:
# Quick structural overview of the loaded frame: names, dtypes, gaps, summary stats.
print("Column names:\n", df.columns.tolist(), "\n")

print("Basic info:\n")
df.info()

# Count of NaN cells in each column.
print("\nMissing values per column:\n")
print(df.isna().sum())

print("\nDescriptive statistics (numeric columns):\n")
# NOTE(review): `display` is not imported anywhere in this file; presumably the
# IPython/Jupyter built-in — confirm before running as a plain script.
display(df.describe())

# Target and features

In [None]:
# Name of the column the models predict.
target_column = "Performance Index"

# X: every column except the target; y: the target column alone.
X = df.drop(columns=[target_column])
y = df[target_column]


# Persist the raw (unencoded, unscaled) split of features and target.
X.to_csv("features_raw.csv", index=False)
y.to_csv("target_raw.csv", index=False)

# Last expression of the cell: shapes of X and y.
X.shape, y.shape

# EDA and correlation

In [None]:
# Pairwise correlations of the numeric columns only (numeric_only=True skips the rest).
corr = df.corr(numeric_only=True)
corr.to_csv("correlation_matrix.csv")

# Heatmap
plt.figure(figsize=(6, 4))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation Heatmap")

# Save BEFORE show(): on notebook / non-interactive backends show() releases the
# current figure, so a later savefig() would write an empty image.
plt.savefig("correlation_heatmap.png", bbox_inches="tight")
plt.show()
plt.close()

# Train test split and scaling

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

# Work on a copy so the raw feature frame X stays untouched.
X_encoded = X.copy()

# Integer-encode every object-dtype column and persist one encoder per column.
# NOTE(review): the encoders are fitted on the full feature frame, i.e. before
# the train/test split below — confirm this is acceptable for these columns.
for col in X_encoded.columns:
    if X_encoded[col].dtype == 'object':
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col])
        joblib.dump(le, f"{col}_label_encoder.pkl")

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, 
    y, 
    test_size=0.2,
    random_state=42
)


# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The scaled arrays are wrapped back into DataFrames only to keep column names
# in the saved CSV files.
pd.DataFrame(X_train_scaled, columns=X_train.columns).to_csv("X_train_scaled.csv", index=False)
pd.DataFrame(X_test_scaled, columns=X_train.columns).to_csv("X_test_scaled.csv", index=False)

y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

# Persist the fitted scaler alongside the encoders.
joblib.dump(scaler, "scaler.pkl")

# Last expression of the cell: shapes of the scaled train/test arrays.
X_train_scaled.shape, X_test_scaled.shape


# PCA

In [None]:
from sklearn.decomposition import PCA

# First pass: keep every component, only to inspect how variance accumulates.
pca = PCA()
X_train_pca_full = pca.fit_transform(X_train_scaled)

variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(variance_ratio)

# 1-based component counts, shared by the CSV and by the plot's x-axis.
component_counts = np.arange(1, len(variance_ratio) + 1)

pd.DataFrame({
    "Component": component_counts,
    "Explained Variance": variance_ratio,
    "Cumulative Variance": cumulative_variance
}).to_csv("pca_variance.csv", index=False)

plt.figure()
# Plot against 1..n so the x-axis really is the number of components
# (plotting the array alone would label the first component as 0).
plt.plot(component_counts, cumulative_variance, marker='o')
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA Explained Variance")

plt.savefig("pca_explained_variance.png", bbox_inches="tight")
plt.close()

# Smallest number of components whose cumulative variance reaches 95%.
# argmax returns the first True position (0-based), hence the +1; int() turns
# the NumPy scalar into a plain Python int.
n_components = int(np.argmax(cumulative_variance >= 0.95)) + 1


# Second pass: refit with the chosen size on the training data only, then
# project the test data with the same fitted transform.
pca_final = PCA(n_components=n_components)
X_train_pca = pca_final.fit_transform(X_train_scaled)
X_test_pca = pca_final.transform(X_test_scaled)

pd.DataFrame(X_train_pca).to_csv("X_train_pca.csv", index=False)
pd.DataFrame(X_test_pca).to_csv("X_test_pca.csv", index=False)

# Last expression of the cell: the selected number of components.
n_components

# Linear Regression

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Using scikit-learn: fit on the PCA-projected training data, predict the test data.
lr = LinearRegression()
lr.fit(X_train_pca, y_train)
y_pred_lr = lr.predict(X_test_pca)

# Test-set metrics for the scikit-learn model.
sk_mse = mean_squared_error(y_test, y_pred_lr)
sk_r2 = r2_score(y_test, y_pred_lr)


In [None]:
# From scratch (closed-form least squares, implemented as plain functions)
def linear_regression_scratch(X, y):
    """Fit ordinary least squares with an intercept term.

    Args:
        X: 2-D array of features, one row per sample.
        y: 1-D array of targets, one value per row of ``X``.

    Returns:
        1-D array ``theta`` of length ``n_features + 1``; ``theta[0]`` is the
        intercept and ``theta[1:]`` are the feature coefficients.
    """
    # Prepend a column of ones so the intercept is learned as theta[0].
    X = np.c_[np.ones(X.shape[0]), X]
    # Solve the least-squares problem directly instead of inverting X.T @ X:
    # lstsq is numerically more stable and still returns a (minimum-norm)
    # solution when columns are collinear, where the explicit inverse fails.
    theta, *_ = np.linalg.lstsq(X, y, rcond=None)
    return theta

def predict_scratch(X, theta):
    """Return predictions of a linear model whose first coefficient is the intercept.

    Args:
        X: feature array with one row per sample (no intercept column).
        theta: coefficient vector laid out as ``[intercept, w_1, ..., w_d]``.

    Returns:
        The product of the intercept-augmented design matrix and ``theta``.
    """
    n_rows = X.shape[0]
    bias_column = np.ones(n_rows)
    # Same augmentation as at fit time: a leading column of ones.
    design = np.c_[bias_column, X]
    return design @ theta

# Fit the from-scratch model on the PCA features; `.values` hands over a plain
# NumPy array instead of the pandas Series.
theta = linear_regression_scratch(X_train_pca, y_train.values)

y_pred_scratch = predict_scratch(X_test_pca, theta)

# Test-set metrics for the from-scratch model.
scratch_mse = mean_squared_error(y_test, y_pred_scratch)
scratch_r2 = r2_score(y_test, y_pred_scratch)



In [None]:
# Save predictions: true test targets next to both models' predictions.
pd.DataFrame({
    "Actual": y_test.values,
    "Scratch_Prediction": y_pred_scratch,
    "Sklearn_Prediction": y_pred_lr
}).to_csv("linear_regression_predictions.csv", index=False)

# Save metrics (this file is read back by the final comparison cell).
pd.DataFrame({
    "Model": ["Linear Regression Scratch", "Linear Regression Sklearn"],
    "MSE": [scratch_mse, sk_mse],
    "R2": [scratch_r2, sk_r2]
}).to_csv("linear_regression_results.csv", index=False)

# Last expression of the cell: the from-scratch metrics.
scratch_mse, scratch_r2

In [None]:
# Side-by-side test metrics of the two linear regression implementations.
print("\nLinear Regression Results:")
print(" ")
print(f"Scratch Model → MSE: {scratch_mse:.4f} | R²: {scratch_r2:.4f}")
print(f"Sklearn Model → MSE: {sk_mse:.4f} | R²: {sk_r2:.4f}")


# Ridge and Lasso regression

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV

# Candidate regularization strengths tried by the grid search.
ridge_params = {"alpha": [0.01, 0.1, 1, 10, 100]}
lasso_params = {"alpha": [0.01, 0.05, 0.1, 1]}

# 5-fold cross-validated search on the training data, scored by R².
ridge = GridSearchCV(Ridge(), ridge_params, cv=5, scoring="r2")
ridge.fit(X_train_pca, y_train)
ridge_best = ridge.best_estimator_


lasso = GridSearchCV(Lasso(), lasso_params, cv=5, scoring="r2")
lasso.fit(X_train_pca, y_train)
lasso_best = lasso.best_estimator_


# Evaluate the selected estimators on the held-out test set.
ridge_pred = ridge_best.predict(X_test_pca)
lasso_pred = lasso_best.predict(X_test_pca)
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_r2 = r2_score(y_test, ridge_pred)

lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test, lasso_pred)

# Saving results (this file is read back by the final comparison cell).
pd.DataFrame({
    "Model": ["Ridge", "Lasso"],
    "Best Alpha": [ridge.best_params_["alpha"], lasso.best_params_["alpha"]],
    "MSE": [ridge_mse, lasso_mse],
    "R2": [ridge_r2, lasso_r2]
}).to_csv("ridge_lasso_results.csv", index=False)

# Last expression of the cell: all four test metrics.
ridge_mse, ridge_r2, lasso_mse, lasso_r2


In [None]:
# Selected alpha and test metrics for each regularized model.
print("\nRegularization Results:")
print("  ")
print(f"Ridge → Best Alpha: {ridge.best_params_['alpha']} | MSE: {ridge_mse:.4f} | R²: {ridge_r2:.4f}")
print(f"Lasso → Best Alpha: {lasso.best_params_['alpha']} | MSE: {lasso_mse:.4f} | R²: {lasso_r2:.4f}")

# KNN

In [None]:
# KNN built from scratch 

def knn_predict(X_train, y_train, X_test, k=5):
    """Predict targets with brute-force k-nearest-neighbours regression.

    Each test point receives the unweighted mean target of its ``k`` closest
    training points under Euclidean distance.

    Args:
        X_train: training features, one row per sample (array-like or DataFrame).
        y_train: training targets aligned positionally with ``X_train``
            (pandas Series, NumPy array or list).
        X_test: features of the points to predict (array-like or DataFrame).
        k: number of neighbours to average; if it exceeds the number of
            training rows, all rows are averaged.

    Returns:
        1-D NumPy array with one prediction per row of ``X_test``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Convert once to plain arrays: targets are then indexed by position whatever
    # container was passed in, and DataFrame inputs are iterated row by row
    # instead of by column label.
    train_points = np.asarray(X_train, dtype=float)
    test_points = np.asarray(X_test, dtype=float)
    targets = np.asarray(y_train, dtype=float)

    predictions = []
    for test_point in test_points:
        # Euclidean distance from this test point to every training point.
        distances = np.sqrt(((train_points - test_point) ** 2).sum(axis=1))
        k_indices = np.argsort(distances)[:k]
        predictions.append(targets[k_indices].mean())
    return np.array(predictions)

# Number of neighbours; the scikit-learn kNN cell uses the same `k`.
k = 5

# predictions
y_pred_knn_scratch = knn_predict(X_train_pca, y_train, X_test_pca, k=k)

# NOTE: scratch_mse / scratch_r2 are the same names the linear-regression
# section assigned, so those earlier values are overwritten here.
scratch_mse = mean_squared_error(y_test, y_pred_knn_scratch)
scratch_r2 = r2_score(y_test, y_pred_knn_scratch)

In [None]:
# KNN using scikit-learn's KNeighborsRegressor class

from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train_pca, y_train)
y_pred_knn_sk = knn.predict(X_test_pca)

# NOTE: sk_mse / sk_r2 are the same names the linear-regression section
# assigned, so those earlier values are overwritten here.
sk_mse = mean_squared_error(y_test, y_pred_knn_sk)
sk_r2 = r2_score(y_test, y_pred_knn_sk)

In [None]:
# Saving results

# True test targets next to both kNN implementations' predictions.
pd.DataFrame({
    "Actual": y_test.values,
    "Scratch_kNN": y_pred_knn_scratch,
    "Sklearn_kNN": y_pred_knn_sk
}).to_csv("knn_predictions.csv", index=False)

# Metrics file (read back by the final comparison cell).
pd.DataFrame({
    "Model": ["kNN Scratch", "kNN Sklearn"],
    "MSE": [scratch_mse, sk_mse],
    "R2": [scratch_r2, sk_r2]
}).to_csv("knn_results.csv", index=False)

In [None]:
# Side-by-side test metrics of the two kNN implementations.
print("\nkNN Regression Results")
print(" ")
print(f"Scratch kNN → MSE: {scratch_mse:.4f} | R²: {scratch_r2:.4f}")
print(f"Sklearn kNN → MSE: {sk_mse:.4f} | R²: {sk_r2:.4f}")

# Decision tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Training: depth-limited tree with a fixed seed, fitted on the PCA features.
tree = DecisionTreeRegressor(max_depth=5, random_state=42)
tree.fit(X_train_pca, y_train)

# Prediction
y_pred_tree = tree.predict(X_test_pca)

# Metrics (test set)
tree_mse = mean_squared_error(y_test, y_pred_tree)
tree_r2 = r2_score(y_test, y_pred_tree)

In [None]:
# Save results: predictions first, then the metrics row.
pd.DataFrame({
    "Actual": y_test.values,
    "Decision_Tree_Prediction": y_pred_tree
}).to_csv("decision_tree_predictions.csv", index=False)

# The recorded max_depth repeats the literal passed to DecisionTreeRegressor in
# the training cell; keep the two in sync when changing either.
pd.DataFrame({
    "Model": ["Decision Tree"],
    "max_depth": [5],
    "MSE": [tree_mse],
    "R2": [tree_r2]
}).to_csv("decision_tree_results.csv", index=False)

In [None]:
# Print results
print("\nDecision Tree Results")
# NOTE(review): this prints a line of 40 spaces; other cells print "-" * N as a
# visible separator — confirm which was intended.
print(" " * 40)
print(f"Decision Tree → MSE: {tree_mse:.4f} | R²: {tree_r2:.4f}")


In [None]:
# Running GridSearchCV
# Exhaustive search over every combination of the values below.
param_grid = {
    "max_depth": [3, 5, 8, 10, 15, None],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": ["sqrt", "log2", None]
}

# 5-fold CV scored by R²; n_jobs=-1 requests all available cores.
grid = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1
)

grid.fit(X_train_pca, y_train)

best_tree = grid.best_estimator_

# Evaluate the selected tree on the held-out test set.
y_pred_best_tree = best_tree.predict(X_test_pca)

best_mse = mean_squared_error(y_test, y_pred_best_tree)
best_r2 = r2_score(y_test, y_pred_best_tree)

# Save best result
# NOTE: written to its own file; the final comparison cell reads
# "decision_tree_results.csv" (the untuned tree), not this one.
pd.DataFrame({
    "Best Params": [str(grid.best_params_)],
    "MSE": [best_mse],
    "R2": [best_r2]
}).to_csv("decision_tree_best_results.csv", index=False)

# Print clean output
# NOTE(review): the "Improved" labels are printed unconditionally; nothing here
# checks that the tuned metrics actually beat tree_mse / tree_r2.
print("\nDecision Tree After Hyperparameter Tuning")
print("-" * 50)
print("Best Parameters:", grid.best_params_)
print(f"Improved MSE: {best_mse:.4f}")
print(f"Improved R²: {best_r2:.4f}")


# Support vector Machine (Regression)

In [None]:
# using linear SVR without kernels here
def svr_scratch(X, y, lr=0.001, epochs=1000, C=1.0, epsilon=0.1):
    """Train a linear, kernel-free epsilon-insensitive regressor with per-sample updates.

    The samples are visited in order, ``epochs`` times. A sample whose absolute
    residual is at most ``epsilon`` changes nothing; otherwise the weights and
    bias are stepped using the sign of the residual.

    Args:
        X: 2-D array of features, indexed by row position.
        y: 1-D array of targets, indexed by position.
        lr: step size of each update.
        epochs: number of full passes over the data.
        C: factor applied to the residual-sign term of each update.
        epsilon: half-width of the zone in which residuals are ignored.

    Returns:
        Tuple ``(w, b)``: the weight vector (one entry per feature) and the bias.
    """
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0

    for _epoch in range(epochs):
        for idx in range(n_samples):
            residual = y[idx] - (np.dot(X[idx], w) + b)

            # Residuals that do not exceed epsilon trigger no update at all.
            if not abs(residual) > epsilon:
                continue

            direction = np.sign(residual)
            # Weight step: shrinkage term (2 * w) minus the C-scaled data term.
            w -= lr * (2 * w - C * X[idx] * direction)
            # Bias step: only the C-scaled sign of the residual.
            b -= lr * (-C * direction)

    return w, b

def svr_predict(X, w, b):
    """Evaluate the linear model ``X · w + b``.

    Args:
        X: feature array (one row per sample) or a single feature vector.
        w: weight vector.
        b: bias added to every prediction.

    Returns:
        The dot product of ``X`` and ``w`` shifted by ``b``.
    """
    weighted_sum = np.dot(X, w)
    return weighted_sum + b

# Train scratch SVR with its default hyper-parameters; `.values` passes a plain
# NumPy array, which the function indexes by position.
w, b = svr_scratch(X_train_pca, y_train.values)

y_pred_svm_scratch = svr_predict(X_test_pca, w, b)

# NOTE: scratch_mse / scratch_r2 are reused from earlier sections and are
# overwritten here.
scratch_mse = mean_squared_error(y_test, y_pred_svm_scratch)
scratch_r2 = r2_score(y_test, y_pred_svm_scratch)

In [None]:
from sklearn.svm import SVR
# NOTE(review): this model uses an RBF kernel with C=10, whereas the scratch
# model is linear and uses C=1.0, so the two "SVR" rows saved below are not a
# like-for-like comparison.
svr = SVR(kernel="rbf", C=10, epsilon=0.1)
svr.fit(X_train_pca, y_train)

y_pred_svm_sk = svr.predict(X_test_pca)

# NOTE: sk_mse / sk_r2 are reused from earlier sections and are overwritten here.
sk_mse = mean_squared_error(y_test, y_pred_svm_sk)
sk_r2 = r2_score(y_test, y_pred_svm_sk)

In [None]:
# Save predictions: true test targets next to both SVR models' predictions.
pd.DataFrame({
    "Actual": y_test.values,
    "Scratch_SVR": y_pred_svm_scratch,
    "Sklearn_SVR": y_pred_svm_sk
}).to_csv("svr_predictions.csv", index=False)

# Save metrics (this file is read back by the final comparison cell).
pd.DataFrame({
    "Model": ["SVR Scratch", "SVR Sklearn"],
    "MSE": [scratch_mse, sk_mse],
    "R2": [scratch_r2, sk_r2]
}).to_csv("svr_results.csv", index=False)

In [None]:
# Side-by-side test metrics of the scratch and scikit-learn SVR models.
print("\nSupport Vector Regression Results")
print("  ")
print(f"Scratch SVR → MSE: {scratch_mse:.4f} | R²: {scratch_r2:.4f}")
print(f"Sklearn SVR → MSE: {sk_mse:.4f} | R²: {sk_r2:.4f}")

## Kernels - RBF and Linear, using CV

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

# -------- Linear SVR --------
# Fixed hyper-parameters; no search is run for the linear kernel.
svr_linear = SVR(kernel="linear", C=1.0, epsilon=0.1)
svr_linear.fit(X_train_pca, y_train)

y_pred_linear = svr_linear.predict(X_test_pca)
linear_mse = mean_squared_error(y_test, y_pred_linear)
linear_r2 = r2_score(y_test, y_pred_linear)

# -------- RBF SVR with light Random Search --------
# C and gamma are sampled from log-uniform ranges; epsilon from a short list.
param_dist = {
    "C": loguniform(0.1, 100),
    "gamma": loguniform(0.001, 1),
    "epsilon": [0.01, 0.1, 0.2]
}

svr_rbf = SVR(kernel="rbf")

random_search = RandomizedSearchCV(
    svr_rbf,
    param_distributions=param_dist,
    n_iter=10,      # kept small for speed
    cv=3,           # enough for demo
    scoring="r2",
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_pca, y_train)
best_rbf = random_search.best_estimator_

# Evaluate the selected RBF model on the held-out test set.
y_pred_rbf = best_rbf.predict(X_test_pca)
rbf_mse = mean_squared_error(y_test, y_pred_rbf)
rbf_r2 = r2_score(y_test, y_pred_rbf)

# Save kernel comparison
pd.DataFrame({
    "Model": ["Linear SVR", "RBF SVR"],
    "MSE": [linear_mse, rbf_mse],
    "R2": [linear_r2, rbf_r2]
}).to_csv("svr_kernel_comparison.csv", index=False)

# Human-readable print
print("\nSVR Kernel Comparison")
print(" " * 40)
print(f"Linear SVR → MSE: {linear_mse:.4f} | R²: {linear_r2:.4f}")
print(f"RBF SVR    → MSE: {rbf_mse:.4f} | R²: {rbf_r2:.4f}")
print("\nBest RBF Parameters:", random_search.best_params_)


#### Why a linear kernel and not an RBF-based kernel
After scaling and PCA, Linear SVR (R² = 0.9890) and tuned RBF SVR (R² = 0.9889) performed almost identically.
This shows that the relationship between features and performance index is mostly linear, and the added complexity of a non-linear kernel does not provide meaningful improvement here.
Therefore, Linear SVR is preferred for this dataset due to its simplicity and interpretability.

# AdaBoost

In [None]:
# Train AdaBoost model (fixed seed; fitted on the PCA features)
ada = AdaBoostRegressor(
    n_estimators=200,
    learning_rate=0.1,
    random_state=42
)

ada.fit(X_train_pca, y_train)

# Predict
y_pred_ada = ada.predict(X_test_pca)

# Metrics (test set)
ada_mse = mean_squared_error(y_test, y_pred_ada)
ada_r2 = r2_score(y_test, y_pred_ada)

# Predictions Saving
pd.DataFrame({
    "Actual": y_test.values,
    "AdaBoost_Prediction": y_pred_ada
}).to_csv("adaboost_predictions.csv", index=False)



In [None]:
# Save results
# The recorded Estimators / Learning_Rate repeat the literals passed to
# AdaBoostRegressor in the training cell; keep them in sync.
pd.DataFrame({
    "Model": ["AdaBoost"],
    "Estimators": [200],
    "Learning_Rate": [0.1],
    "MSE": [ada_mse],
    "R2": [ada_r2]
}).to_csv("adaboost_results.csv", index=False)

# Print results
print("\nAdaBoost Results")
print(" ")
print(f"MSE: {ada_mse:.4f} | R²: {ada_r2:.4f}")


# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Train Gradient Boosting model (fixed seed; fitted on the PCA features)
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

gbr.fit(X_train_pca, y_train)

# Predict
y_pred_gbr = gbr.predict(X_test_pca)

# Metrics (test set)
gbr_mse = mean_squared_error(y_test, y_pred_gbr)
gbr_r2 = r2_score(y_test, y_pred_gbr)

# Save predictions
pd.DataFrame({
    "Actual": y_test.values,
    "GradientBoost_Prediction": y_pred_gbr
}).to_csv("gradient_boosting_predictions.csv", index=False)

# Save results
# The recorded hyper-parameters repeat the literals passed to the constructor
# above; keep them in sync.
pd.DataFrame({
    "Model": ["Gradient Boosting"],
    "Estimators": [300],
    "Learning_Rate": [0.05],
    "Max_Depth": [3],
    "MSE": [gbr_mse],
    "R2": [gbr_r2]
}).to_csv("gradient_boosting_results.csv", index=False)

# Print results
print("\nGradient Boosting Results")
print(" ")
print(f"MSE: {gbr_mse:.4f} | R²: {gbr_r2:.4f}")


# Manual Stacking

In [None]:
from sklearn.ensemble import StackingRegressor

# Base models used
# The same (name, estimator) list is reused by the manual stacking cell.
# NOTE(review): the decision tree here has no random_state, unlike the
# standalone tree trained earlier — confirm whether that is intended.
base_models = [
    ("lr", LinearRegression()),
    ("svr", SVR(C=10, epsilon=0.1)),
    ("dt", DecisionTreeRegressor(max_depth=5)),
    ("knn", KNeighborsRegressor(n_neighbors=5))
]

# Stacking using sklearn.ensemble
# A linear regression combines the base models' predictions.
meta_model = LinearRegression()

stack = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model
)

stack.fit(X_train_pca, y_train)

y_pred_stack = stack.predict(X_test_pca)

# Test-set metrics for the scikit-learn stacking ensemble.
stack_mse = mean_squared_error(y_test, y_pred_stack)
stack_r2 = r2_score(y_test, y_pred_stack)

In [None]:
# Generate base predictions manually (manual stacking)
from sklearn.model_selection import cross_val_predict

# Meta-features for training must be OUT-OF-FOLD predictions: every training row
# is predicted by a model that never saw it. Predicting the rows a model was
# fitted on leaks the targets (trees and kNN nearly memorize them) and teaches
# the meta-learner to over-trust those models. cross_val_predict fits clones,
# so the estimators in base_models are left untouched by this step.
base_preds_train = np.column_stack([
    cross_val_predict(model, X_train_pca, y_train, cv=5)
    for _, model in base_models
])

# For the test meta-features each base model is fitted once on the full
# training set and then applied to the test set.
base_preds_test = np.column_stack([
    model.fit(X_train_pca, y_train).predict(X_test_pca)
    for _, model in base_models
])

# Train meta-learner manually on the out-of-fold predictions
meta = LinearRegression()
meta.fit(base_preds_train, y_train)

y_pred_manual_stack = meta.predict(base_preds_test)

# Test-set metrics for the manual stacking ensemble.
manual_mse = mean_squared_error(y_test, y_pred_manual_stack)
manual_r2 = r2_score(y_test, y_pred_manual_stack)

In [None]:
# Save predictions (this file is read back by the plotting cells).
pd.DataFrame({
    "Actual": y_test.values,
    "Sklearn_Stacking": y_pred_stack,
    "Manual_Stacking": y_pred_manual_stack
}).to_csv("stacking_predictions.csv", index=False)

# Save results (this file is read back by the final comparison cell).
pd.DataFrame({
    "Model": ["Sklearn Stacking", "Manual Stacking"],
    "MSE": [stack_mse, manual_mse],
    "R2": [stack_r2, manual_r2]
}).to_csv("stacking_results.csv", index=False)

In [None]:
# Print results: side-by-side test metrics of the two stacking variants.
print("\nStacking Ensemble Results")
print(" ")
print(f"Sklearn Stacking → MSE: {stack_mse:.4f} | R²: {stack_r2:.4f}")
print(f"Manual Stacking → MSE: {manual_mse:.4f} | R²: {manual_r2:.4f}")


# Final results comparison

In [None]:
# Load all result files written by the earlier sections.
# NOTE: "decision_tree_best_results.csv" and "svr_kernel_comparison.csv" are not
# loaded, so the tuned tree and the kernel comparison are absent from the ranking.
linear_results = pd.read_csv("linear_regression_results.csv")
ridge_lasso_results = pd.read_csv("ridge_lasso_results.csv")
knn_results = pd.read_csv("knn_results.csv")
tree_results = pd.read_csv("decision_tree_results.csv")
svr_results = pd.read_csv("svr_results.csv")
ada_results = pd.read_csv("adaboost_results.csv")
gbr_results = pd.read_csv("gradient_boosting_results.csv")
stack_results = pd.read_csv("stacking_results.csv")

# Normalize column naming for merging
# NOTE(review): the renamed "Info" column is not selected in the concat below,
# so these renames currently have no effect on the final table.
ridge_lasso_results.rename(columns={"Best Alpha": "Info"}, inplace=True)
ada_results.rename(columns={"Estimators": "Info"}, inplace=True)
gbr_results.rename(columns={"Estimators": "Info"}, inplace=True)

# Select key columns only and stack all models into one table.
final_table = pd.concat([
    linear_results[["Model", "MSE", "R2"]],
    ridge_lasso_results[["Model", "MSE", "R2"]],
    knn_results[["Model", "MSE", "R2"]],
    tree_results[["Model", "MSE", "R2"]],
    svr_results[["Model", "MSE", "R2"]],
    ada_results[["Model", "MSE", "R2"]],
    gbr_results[["Model", "MSE", "R2"]],
    stack_results[["Model", "MSE", "R2"]]
], ignore_index=True)

# Sort by R² (best model first)
final_table = final_table.sort_values(by="R2", ascending=False)

# Save final table (read back by the plotting and summary cells).
final_table.to_csv("final_model_comparison.csv", index=False)

# Print final ranking
print("\nFinal Model Ranking (by R²)")
print("-" * 40)
print(final_table)

# Plot comparison: horizontal bars, one per model.
plt.figure(figsize=(10, 6))
plt.barh(final_table["Model"], final_table["R2"])
plt.xlabel("R² Score")
plt.title("Final Model Comparison")
# Flip the y-axis so the first (best) row is drawn at the top.
plt.gca().invert_yaxis()

plt.savefig("final_model_comparison.png", bbox_inches="tight")
plt.close()


In [None]:
import glob
import os  # needed for os.path.basename below; it was never imported in this notebook
import zipfile

# Name of the archive that bundles every generated artifact.
zip_name = "student_performance_project_outputs.zip"

# File types you want to include
file_types = ["*.csv", "*.png", "*.pkl"]

files_to_zip = []

# Collect all result files from the current working directory
for file_type in file_types:
    files_to_zip.extend(glob.glob(file_type))

# Create zip; the context manager closes the archive even if a write fails.
with zipfile.ZipFile(zip_name, "w", zipfile.ZIP_DEFLATED) as zipf:
    for file in files_to_zip:
        # Store each file under its bare name, without any directory part.
        zipf.write(file, arcname=os.path.basename(file))

print("✅ ZIP file created:", zip_name)
print("✅ Files included:", len(files_to_zip))


# Graphs and analysis

In [None]:
df_best = pd.read_csv("stacking_predictions.csv")  # or best one from your result

plt.figure(figsize=(6,6))
plt.scatter(df_best["Actual"], df_best["Sklearn_Stacking"], alpha=0.6)
# Dashed diagonal y = x: points on this line are perfect predictions.
plt.plot([df_best["Actual"].min(), df_best["Actual"].max()],
         [df_best["Actual"].min(), df_best["Actual"].max()],
         linestyle="--")

plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted (Best Model)")
# Save BEFORE show(): on notebook / non-interactive backends show() releases the
# current figure, so a later savefig() would write an empty image.
plt.savefig("actual_vs_predicted_best_model.png", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Final Model Comparison Plot

final_results = pd.read_csv("final_model_comparison.csv")

plt.figure(figsize=(10,5))
plt.bar(final_results["Model"], final_results["R2"])
# Rotate the model names so they do not overlap.
plt.xticks(rotation=45, ha="right")
plt.ylabel("R² Score")
plt.title("Model Performance Comparison")
# Save BEFORE show(): on notebook / non-interactive backends show() releases the
# current figure, so a later savefig() would write an empty image.
plt.savefig("model_comparison_chart.png", bbox_inches="tight")
plt.show()
plt.close()


In [None]:
# Error distribution for best model

# Signed error: positive values mean the model under-predicted.
df_best["Error"] = df_best["Actual"] - df_best["Sklearn_Stacking"]

plt.figure()
plt.hist(df_best["Error"], bins=30, edgecolor="black")
plt.xlabel("Prediction Error")
plt.ylabel("Frequency")
plt.title("Prediction Error Distribution")

# Save BEFORE show(): on notebook / non-interactive backends show() releases the
# current figure, so a later savefig() would write an empty image.
plt.savefig("error_distribution_best_model.png", bbox_inches="tight")
plt.show()
plt.close()


In [None]:
# Model Summary Table

final_table = pd.read_csv("final_model_comparison.csv")

# Rank 1 = highest R². method="min" gives tied models the same whole-number rank
# (competition ranking) instead of an averaged rank such as 2.5 that the int
# cast would silently truncate.
final_table["Rank"] = final_table["R2"].rank(ascending=False, method="min").astype(int)
final_table = final_table.sort_values("Rank")

final_table.to_csv("final_model_summary_with_rank.csv", index=False)

# Last expression of the cell: the ranked table.
final_table
