In [130]:
# Make project-local packages importable: the directory two levels above the
# current working directory is added to the module search path (once only).
import sys
from pathlib import Path

project_path = str(Path.cwd().parent.parent.resolve())
if project_path not in sys.path:
    sys.path.append(project_path)

# imports
from common.utils import get_data, get_preprocessor
from common.custom_linear_regression import *

import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, precision_score, recall_score, accuracy_score

In [131]:
# Classification task: every column except "Target" is a feature.
data = get_data()
y = data["Target"]
X = data.drop("Target", axis=1)

# Regression task: numeric columns only, with "Admission grade" as the target.
data_numeric = data.select_dtypes(include="number")
y_numeric = data_numeric["Admission grade"]
X_numeric = data_numeric.drop(columns=["Admission grade"])

# Feature names of X grouped by dtype (used to configure the preprocessor).
numerical_column_names = list(X.select_dtypes(include="number").columns)
categorical_column_names = list(X.select_dtypes(include="object").columns)

# One shuffled 3-fold splitter with a fixed seed, shared by all experiments.
kfold = KFold(n_splits=3, random_state=6, shuffle=True)

In [132]:
def calcualte_using_kfold(pipeline, kfold, X, y, is_target_numeric=False):
    """Fit and score ``pipeline`` once per fold and collect the metrics.

    Parameters
    ----------
    pipeline : object exposing ``fit(X, y)`` and ``predict(X)``
        ``fit`` is called on this very object for every fold (no copy is made).
    kfold : object exposing ``split(X, y)``
        Must yield ``(train_indices, test_indices)`` pairs of positional indices.
    X, y : objects supporting ``.iloc`` positional indexing
        Features and target.
    is_target_numeric : bool, default False
        Selects the metric set: regression metrics when true, classification
        metrics otherwise.

    Returns
    -------
    list of list
        One three-element row per fold, in fold order:
        ``[MSE, MAE, R2]`` when ``is_target_numeric`` is true, otherwise
        ``[accuracy, weighted precision, weighted recall]``.
    """

    def _score(y_true, y_hat):
        # Regression targets get error/fit metrics ...
        if is_target_numeric:
            return [
                mean_squared_error(y_true, y_hat),
                mean_absolute_error(y_true, y_hat),
                r2_score(y_true, y_hat),
            ]
        # ... everything else is scored as classification, class-weighted.
        return [
            accuracy_score(y_true, y_hat),
            precision_score(y_true, y_hat, average="weighted"),
            recall_score(y_true, y_hat, average="weighted"),
        ]

    fold_scores = []
    for fit_idx, holdout_idx in kfold.split(X, y):
        pipeline.fit(X.iloc[fit_idx], y.iloc[fit_idx])
        holdout_pred = pipeline.predict(X.iloc[holdout_idx])
        fold_scores.append(_score(y.iloc[holdout_idx], holdout_pred))

    return fold_scores

In [133]:
# Classification baseline: a random forest on the preprocessed features,
# scored per fold with accuracy and class-weighted precision/recall.
model = RandomForestClassifier(random_state=6, n_estimators=300)

pipeline = Pipeline([
    ("preprocessing", get_preprocessor(numerical_column_names, categorical_column_names)),
    ("classifier", model)
])

# Named after the model they describe; these results were previously stored
# under the closed-form regression names, which the next experiment reuses.
random_forest_rows = calcualte_using_kfold(pipeline, kfold, X, y)
df_random_forest = pd.DataFrame(random_forest_rows, columns=["Accuracy", "Precision", "Recall"])
df_random_forest

Unnamed: 0,Accuracy,Precision,Recall
0,0.776271,0.765799,0.776271
1,0.757966,0.734557,0.757966
2,0.783582,0.77352,0.783582


In [134]:
# Regression on the numeric-only data: standardize the features, then fit the
# project's LinearRegressionClosedForm estimator; one metrics row per fold.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("classifier", LinearRegressionClosedForm()),
])

close_form_rows = calcualte_using_kfold(
    pipeline,
    kfold,
    X_numeric,
    y_numeric,
    is_target_numeric=True,
)
df_close_form = pd.DataFrame(data=close_form_rows, columns=["MSE", "MAE", "R2"])
df_close_form

Unnamed: 0,MSE,MAE,R2
0,136.073072,8.250195,0.334383
1,142.770634,8.433683,0.354082
2,127.497759,7.973096,0.372298


In [135]:
# Same numeric-only regression setup, this time with the project's
# LinearRegressionGradientDescent estimator behind the scaler.
gradient_descent_steps = [
    ("scaler", StandardScaler()),
    ("classifier", LinearRegressionGradientDescent()),
]
pipeline = Pipeline(gradient_descent_steps)

gradient_descent_rows = calcualte_using_kfold(
    pipeline, kfold, X_numeric, y_numeric, is_target_numeric=True
)
metric_names = ["MSE", "MAE", "R2"]
df_gradeint_descent = pd.DataFrame(gradient_descent_rows, columns=metric_names)
df_gradeint_descent

Unnamed: 0,MSE,MAE,R2
0,135.900112,8.242535,0.335229
1,142.840403,8.435018,0.353766
2,127.325512,7.966719,0.373146
