In [17]:
import numpy as np
import pandas as pd

from sklearn.model_selection import RepeatedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor

# Load the modelling table and separate the prediction target from the features.
# NOTE: the CSV path is absolute and machine-specific.
TARGET = "stars"
df = pd.read_csv("/Users/emmali/PyCharmMiscProject/final_real.csv")
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Numeric feature columns are detected by dtype; every remaining column is
# routed to the categorical branch of the preprocessors.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

# Preprocessing used by the linear and KNN pipelines.
# Numeric branch: fill gaps with the column median, then standardize.
_lr_knn_numeric = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
_lr_knn_categorical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preproc_lr_knn = ColumnTransformer(
    transformers=[
        ("num", _lr_knn_numeric, num_cols),
        ("cat", _lr_knn_categorical, cat_cols),
    ],
    remainder="drop",
)

# Preprocessing used by the RandomForest and XGBoost pipelines: same
# imputation and one-hot encoding as above, but numeric columns are only
# median-imputed (no scaling step).
_tree_categorical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preproc_tree = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", _tree_categorical, cat_cols),
    ],
    remainder="drop",
)

# Candidate regressors keyed by display name. Each entry is a full pipeline
# (preprocessing + estimator). Insertion order is the order of evaluation.
models = {}

models["Linear"] = Pipeline(steps=[
    ("prep", preproc_lr_knn),
    ("model", LinearRegression()),
])

models["KNN(10)"] = Pipeline(steps=[
    ("prep", preproc_lr_knn),
    ("model", KNeighborsRegressor(n_neighbors=10)),
])

models["RandomForest"] = Pipeline(steps=[
    ("prep", preproc_tree),
    ("model", RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)),
])

models["XGBoost"] = Pipeline(steps=[
    ("prep", preproc_tree),
    ("model", XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        tree_method="hist",
        eval_metric="rmse",
    )),
])

# Shared splitter for all models: 5 folds repeated 3 times, with a fixed
# seed so every model is evaluated on the same sequence of splits.
cv = RepeatedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=42,
)

def acc_score(y_true, y_pred, tol=0.25):
    """Return the fraction of predictions within ``tol`` of the true value.

    Both inputs are converted to NumPy arrays before comparing, so the
    comparison is strictly positional: plain Python sequences are accepted,
    and two pandas objects are never aligned on their index labels.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predicted values, in the same order as
            ``y_true``.
        tol: Maximum absolute error (inclusive) for a prediction to count
            as a hit.

    Returns:
        The share of positions where ``|y_true - y_pred| <= tol``, as a
        float between 0 and 1.
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(true_values - predicted) <= tol)

def cv_report(name, pipe, X, y, cv):
    """Cross-validate ``pipe`` and summarize its pooled out-of-fold errors.

    For every split produced by ``cv``, an unfitted clone of ``pipe`` is
    trained on the training rows and used to predict the held-out rows.
    The held-out targets and predictions from all splits are pooled, and
    each metric is computed once over the pooled arrays (not averaged per
    fold). If the splitter revisits rows (e.g. a repeated K-fold), a row
    contributes once per time it is held out.

    Args:
        name: Label used as the ``name`` of the returned Series.
        pipe: scikit-learn compatible estimator or pipeline. It is cloned
            for each fold, so the object passed in is never fitted and no
            fitted state is shared with other pipelines that reuse the
            same transformer instance.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target values; rows are selected positionally with ``.iloc``.
        cv: Splitter exposing ``split(X, y)`` that yields
            ``(train_idx, test_idx)`` positional index arrays.

    Returns:
        pandas.Series named ``name`` with the entries ``"RMSE"``, ``"R2"``
        and ``"ACC(±0.5)"`` (share of predictions within 0.5 of the target).
    """
    # Imported locally so the function is self-contained with respect to
    # the module's existing import block.
    from sklearn.base import clone

    held_out_targets, held_out_preds = [], []

    for train_idx, test_idx in cv.split(X, y):
        # A fresh clone per fold keeps folds independent and leaves the
        # caller's pipeline untouched.
        fitted = clone(pipe).fit(X.iloc[train_idx], y.iloc[train_idx])
        held_out_targets.append(y.iloc[test_idx].to_numpy())
        held_out_preds.append(np.asarray(fitted.predict(X.iloc[test_idx])))

    y_true_all = np.concatenate(held_out_targets)
    y_pred_all = np.concatenate(held_out_preds)

    rmse = np.sqrt(mean_squared_error(y_true_all, y_pred_all))
    r2 = r2_score(y_true_all, y_pred_all)
    acc50 = acc_score(y_true_all, y_pred_all, tol=0.5)

    return pd.Series({
        "RMSE": rmse,
        "R2": r2,
        "ACC(±0.5)": acc50
    }, name=name)

# Evaluate every candidate pipeline and assemble one row per model,
# ordered from lowest to highest RMSE.
_reports = [cv_report(label, pipeline, X, y, cv) for label, pipeline in models.items()]
summary = pd.concat(_reports, axis=1).transpose()
summary = summary.sort_values("RMSE")
print(summary)


                  RMSE        R2  ACC(±0.5)
XGBoost       0.478320  0.641141   0.760432
RandomForest  0.480201  0.638313   0.757110
Linear        0.516503  0.581560   0.720296
KNN(10)       0.562266  0.504127   0.666251
