In [None]:
# Don't forget to create (and activate) a virtual environment first, to avoid
# conflicts with other packages.
# The %pip magic installs into the environment of the running kernel.
%pip install pandas seaborn numpy matplotlib scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


In [None]:
from sklearn.metrics import mean_absolute_error


def plot_results(y_true, y_pred, color=None):
    """Scatter true values against predictions with an identity reference line.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    color : array-like, optional
        One value per point used to colour the markers. When omitted, the
        absolute error ``|y_true - y_pred|`` is used.

    Returns
    -------
    None
        A new matplotlib figure is created as a side effect.
    """
    # Work on plain arrays so the element-wise difference is positional
    # (like the scatter itself) instead of being aligned on a pandas index.
    true_values = np.asarray(y_true, dtype=float).ravel()
    pred_values = np.asarray(y_pred, dtype=float).ravel()

    if color is None:
        color = np.abs(true_values - pred_values)

    fig, ax = plt.subplots()
    sc = ax.scatter(true_values, pred_values, c=color, cmap="viridis")
    cb = fig.colorbar(sc)
    cb.set_label("Error")

    # Identity line (perfect prediction). It always covers [0, 1] as before
    # and is extended when the data fall outside of that interval, so the
    # reference stays meaningful for targets on any scale.
    finite = np.concatenate([true_values, pred_values])
    finite = finite[np.isfinite(finite)]
    lower = min(0.0, float(finite.min())) if finite.size else 0.0
    upper = max(1.0, float(finite.max())) if finite.size else 1.0
    ax.plot([lower, upper], [lower, upper], "g--")

    ax.set_xlabel("true")
    ax.set_ylabel("pred")
    fig.tight_layout()


def check_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split, report MAE and plot test predictions.

    The estimator is fitted in place. The mean absolute error on the training
    and on the test split is printed, then the test predictions are handed to
    ``plot_results``; the points are coloured by the ``"ERROR"`` column of
    ``X_test`` when such a column exists.
    """
    model.fit(X_train, y_train)
    test_pred = model.predict(X_test)
    train_pred = model.predict(X_train)

    # MAE is symmetric in its two arguments; the documented
    # (y_true, y_pred) order is used here.
    train_mae = mean_absolute_error(y_train, train_pred)
    print(f"MAE (TRAIN): {train_mae}")
    test_mae = mean_absolute_error(y_test, test_pred)
    print(f"MAE (TEST): {test_mae}")

    if "ERROR" in X_test:
        point_color = X_test["ERROR"]
    else:
        point_color = None
    plot_results(y_test, test_pred, color=point_color)

In [None]:
# Load both splits (first CSV column is the index) and derive the regression
# target CODE_QUALITY as the row-wise mean of the two reviewer scores.
df_train = pd.read_csv('train.csv', index_col=0).assign(
    CODE_QUALITY=lambda frame: frame[['REVIEWER_1', 'REVIEWER_2']].mean(axis=1)
)
df_test = pd.read_csv('test.csv', index_col=0).assign(
    CODE_QUALITY=lambda frame: frame[['REVIEWER_1', 'REVIEWER_2']].mean(axis=1)
)



In [None]:
from sklearn.model_selection import train_test_split

# CODE_QUALITY is computed as the mean of REVIEWER_1 and REVIEWER_2, so those
# two columns must not be used as features: together they reveal the target
# exactly (target leakage) and would make every score meaningless.
X_train = df_train.drop(columns=['CODE_QUALITY', 'REVIEWER_1', 'REVIEWER_2'])
X_test = df_test.drop(columns=['CODE_QUALITY', 'REVIEWER_1', 'REVIEWER_2'])

y_train = df_train['CODE_QUALITY']
y_test = df_test['CODE_QUALITY']



In [None]:
from sklearn.linear_model import LinearRegression

local_X_train, local_X_test = X_train.copy(), X_test.copy()
local_y_train, local_y_test = y_train.copy(), y_test.copy()

# Normalize
# Keep the numeric features only and standardize BOTH splits with the mean and
# standard deviation estimated on the training split. Scaling the test split
# with its own statistics would put the two splits on different scales and
# leak test-set information into the evaluation.
local_X_train = local_X_train.select_dtypes(include=np.number)
local_X_test = local_X_test[local_X_train.columns]
feature_mean = local_X_train.mean()
# A constant column has a standard deviation of 0; divide by 1 instead so it
# becomes all zeros rather than NaN.
feature_std = local_X_train.std().replace(0, 1)
local_X_train = (local_X_train - feature_mean) / feature_std
local_X_test = (local_X_test - feature_mean) / feature_std

# One hot encoding
# NOTE(review): only numeric columns are left at this point, so get_dummies
# with default arguments has nothing to encode; categorical features are
# effectively dropped above - confirm this is intended.
local_X_train = pd.get_dummies(local_X_train)
# Guarantee that the test frame has exactly the training columns, in order.
local_X_test = pd.get_dummies(local_X_test).reindex(
    columns=local_X_train.columns, fill_value=0
)

model = LinearRegression()
check_model(model, local_X_train, local_X_test, local_y_train, local_y_test)

# Persist the fitted estimator; the trailing ';' hides the cell output.
joblib.dump(model, 'linear_regression.joblib');


In [None]:
from sklearn.neighbors import KNeighborsRegressor

local_X_train, local_X_test = X_train.copy(), X_test.copy()
local_y_train, local_y_test = y_train.copy(), y_test.copy()

# Normalize
# Keep the numeric features only and standardize BOTH splits with the mean and
# standard deviation estimated on the training split. Scaling the test split
# with its own statistics would distort the distances between test and
# training points and leak test-set information into the evaluation.
local_X_train = local_X_train.select_dtypes(include=np.number)
local_X_test = local_X_test[local_X_train.columns]
feature_mean = local_X_train.mean()
# A constant column has a standard deviation of 0; divide by 1 instead so it
# becomes all zeros rather than NaN.
feature_std = local_X_train.std().replace(0, 1)
local_X_train = (local_X_train - feature_mean) / feature_std
local_X_test = (local_X_test - feature_mean) / feature_std

# One hot encoding
# NOTE(review): only numeric columns are left at this point, so get_dummies
# with default arguments has nothing to encode; categorical features are
# effectively dropped above - confirm this is intended.
local_X_train = pd.get_dummies(local_X_train)
# Guarantee that the test frame has exactly the training columns, in order.
local_X_test = pd.get_dummies(local_X_test).reindex(
    columns=local_X_train.columns, fill_value=0
)

model = KNeighborsRegressor()
check_model(model, local_X_train, local_X_test, local_y_train, local_y_test)

# Persist the fitted estimator; the trailing ';' hides the cell output.
joblib.dump(model, 'knn_regression.joblib');

In [None]:
from sklearn.tree import DecisionTreeRegressor

local_X_train, local_X_test = X_train.copy(), X_test.copy()
local_y_train, local_y_test = y_train.copy(), y_test.copy()

# Normalize
# Keep the numeric features only and standardize BOTH splits with the mean and
# standard deviation estimated on the training split. Scaling the test split
# with its own statistics would shift the test features relative to the split
# thresholds learned on the training data.
local_X_train = local_X_train.select_dtypes(include=np.number)
local_X_test = local_X_test[local_X_train.columns]
feature_mean = local_X_train.mean()
# A constant column has a standard deviation of 0; divide by 1 instead so it
# becomes all zeros rather than NaN.
feature_std = local_X_train.std().replace(0, 1)
local_X_train = (local_X_train - feature_mean) / feature_std
local_X_test = (local_X_test - feature_mean) / feature_std

# One hot encoding
# NOTE(review): only numeric columns are left at this point, so get_dummies
# with default arguments has nothing to encode; categorical features are
# effectively dropped above - confirm this is intended.
local_X_train = pd.get_dummies(local_X_train)
# Guarantee that the test frame has exactly the training columns, in order.
local_X_test = pd.get_dummies(local_X_test).reindex(
    columns=local_X_train.columns, fill_value=0
)

model = DecisionTreeRegressor()
check_model(model, local_X_train, local_X_test, local_y_train, local_y_test)

# Persist the fitted estimator; the trailing ';' hides the cell output.
joblib.dump(model, 'decision_tree_regression.joblib');