In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import pandas as pd

In [6]:
# Load the merged dataset, drop the 'absences_range' and 'finalResult'
# columns, then remove outliers by keeping only rows with fewer than
# 20 absences.
df = (
    pd.read_csv('merged_df.csv', index_col=None)
    .drop(columns=['absences_range', 'finalResult'])
    .loc[lambda frame: frame['absences'] < 20]
)

In [None]:
# Cast the listed columns to object dtype.
# NOTE(review): presumably so that dtype-based column selection (as done in
# evaluate_models) one-hot encodes them instead of scaling them -- confirm.
_to_object_cols = [
    'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
    'freetime', 'goout', 'Dalc', 'Walc', 'health',
]
df[_to_object_cols] = df[_to_object_cols].astype('object')

# Sub-frame of the columns regarded as categorical.
# NOTE(review): 'G1', 'G2' and 'G3' are selected both here and in
# continue_cols below -- confirm which grouping is intended.
_categorical_names = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'G1', 'G2', 'G3',
    'academicGrade',
]
columns_cat = df[_categorical_names]

# Sub-frame of the columns regarded as continuous.
_continuous_names = ['age', 'absences', 'G1', 'G2', 'G3']
continue_cols = df[_continuous_names]

In [4]:
def evaluate_models(X_train, X_test, y_train, y_test, random_state=42):
    """Grid-search a k-NN and a decision-tree regressor and report test metrics.

    Each regressor is wrapped in a pipeline that min-max scales the numeric
    columns and one-hot encodes the object-dtype columns. Hyperparameters are
    tuned with 5-fold cross-validated grid search on the training split; the
    fitted search object is then used to predict the test split, and the best
    parameters, mean squared error and R-squared are printed for each model.
    The model names and their metrics are also accumulated in the local lists
    ``pipeline_names``, ``mse_values`` and ``r2_values``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables. Columns are routed to a transformer by dtype, so the
        inputs must be DataFrames and categorical features must have object
        dtype.
    y_train, y_test : array-like
        Regression targets for the training and test splits.
    random_state : int or None, default=42
        Seed passed to ``DecisionTreeRegressor`` so that the tree search
        (which includes ``splitter='random'``) is reproducible between runs.
    """
    # Create preprocessing pipeline
    preprocessing_pipeline = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), make_column_selector(dtype_include=np.number)),
            # handle_unknown='ignore': a category that is absent from a CV
            # training fold (or from the training split) would otherwise make
            # transform() raise when it shows up in the validation/test rows.
            ('cat', OneHotEncoder(handle_unknown='ignore'),
             make_column_selector(dtype_include=object))
        ],
        # Always emit a dense array: on sparse input KNeighborsRegressor
        # cannot use ball_tree/kd_tree and falls back to brute force with a
        # warning, which would make part of the k-NN grid meaningless.
        sparse_threshold=0)

    # Create pipelines for each model. Sharing the (unfitted) preprocessor is
    # safe because GridSearchCV clones the whole pipeline before fitting.
    knn_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing_pipeline),
        ('regressor', KNeighborsRegressor())
    ])

    dt_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing_pipeline),
        ('regressor', DecisionTreeRegressor(random_state=random_state))
    ])

    # Define hyperparameter search spaces for each pipeline
    # NOTE(review): 'algorithm' and 'leaf_size' look like they only change how
    # neighbours are searched, not which ones are found -- consider dropping
    # them from the grid to cut the number of fits.
    knn_params = {'regressor__n_neighbors': [3, 5, 7, 9, 11],
                  'regressor__weights': ['uniform', 'distance'],
                  'regressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
                  'regressor__leaf_size': [10, 20, 30, 40, 50],
                  'regressor__p': [1, 2]}

    dt_params = {'regressor__criterion': ['friedman_mse', 'squared_error', 'poisson', 'absolute_error'],
                 'regressor__splitter': ['best', 'random'],
                 'regressor__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                 'regressor__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                 'regressor__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

    # Create a dictionary containing all the pipelines and their corresponding hyperparameter search spaces
    pipelines = {
        'knn': (knn_pipeline, knn_params),
        'dt': (dt_pipeline, dt_params)
    }

    # Create lists to store the pipeline names, MSE values, and R-squared values
    pipeline_names = []
    mse_values = []
    r2_values = []

    # Perform grid search for each pipeline
    for pipeline_name, (pipeline, param_grid) in pipelines.items():
        print(f"Performing grid search for pipeline: {pipeline_name}")
        grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        # Predictions come from the best estimator refit on the full training split.
        y_pred = grid_search.predict(X_test)

        # Compute metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Add pipeline name, MSE value, and R-squared value to the lists
        pipeline_names.append(pipeline_name)
        mse_values.append(mse)
        r2_values.append(r2)

        print(f"Best parameters for {pipeline_name}: {grid_search.best_params_}")
        print(f"Mean squared error: {mse:.3f}")
        print(f"R-squared: {r2:.3f}")
