In [23]:
import pandas as pd
from xgboost import XGBRegressor, DMatrix, cv
from pathlib import Path
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error
import numpy as np
import warnings
import time
import sys
import joblib
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")

interim_path = Path("../data/interim")
models_path = Path("../models")

### Load 0.2/0.8 data. Train model.

In [2]:
X_train, y_train = pd.read_parquet(interim_path / "train_x1.parquet"), pd.read_parquet(interim_path / "train_y1.parquet")
X_test, y_test = pd.read_parquet(interim_path / "test_x1.parquet"), pd.read_parquet(interim_path / "test_y1.parquet")

y_train, y_test = y_train.rating, y_test.rating

data_dmatrix = DMatrix(data=X_train, label=y_train)

In [3]:
grid = GridSearchCV(
            estimator=XGBRegressor(device="gpu"),
            param_grid={"learning_rate": (0.1, 0.2, 0.15),
                        "max_depth": [ 3, 5, 12],
                        "n_estimators":[200, 500, 800]},
            cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=1)

In [4]:
# progress bar implementation from https://datascience.stackexchange.com/questions/114060/progress-bar-for-gridsearchcv
def fit(model, *args, **kwargs):
    class BarStdout:
        def write(self, text):
            if "totalling" in text and "fits" in text:
                self.bar_size = int(text.split("totalling")[1].split("fits")[0][1:-1])
                self.bar = tqdm(range(self.bar_size))
                self.count = 0
                return
            if "CV" in text and hasattr(self,"bar"):
                self.count += 1
                self.bar.update(n=self.count-self.bar.n)
                if self.count%(self.bar_size//10)==0:
                    time.sleep(0.1)
        def flush(self, text=None):
            pass
    default_stdout= sys.stdout
    sys.stdout = BarStdout()
    model.verbose = 10
    model.fit(*args, **kwargs)
    sys.stdout = default_stdout
    return model

In [5]:
grid_result = fit(grid, X_train, y_train)

  0%|          | 0/81 [00:00<?, ?it/s]

In [6]:
np.sqrt(mean_squared_error(grid_result.predict(X_test), y_test))

0.18986108188594247

In [7]:
grid_result.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500}

### Load disjoint data. Train model.

In [8]:
X_train2, y_train2 = pd.read_parquet(interim_path / "train_x2.parquet"), pd.read_parquet(interim_path / "train_y2.parquet")
X_test2, y_test2 = pd.read_parquet(interim_path / "test_x2.parquet"), pd.read_parquet(interim_path / "test_y2.parquet")

y_train2, y_test2 = y_train2.rating, y_test2.rating

data_dmatrix2 = DMatrix(data=X_train2, label=y_train2)

In [9]:
grid2 = GridSearchCV(
            estimator=XGBRegressor(device="gpu"),
            param_grid={"learning_rate": (0.1, 0.2, 0.15),
                        "max_depth": [ 3, 10, 20],
                        "n_estimators":[200, 500, 1000]},
            cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=1)

In [10]:
grid_result2 = fit(grid2, X_train2, y_train2)

  0%|          | 0/81 [00:00<?, ?it/s]

In [11]:
np.sqrt(mean_squared_error(grid_result2.predict(X_test2), y_test2))

0.2152094502537741

In [13]:
grid_result2.best_params_

{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 1000}

### Save both models

In [16]:
joblib.dump(grid_result, models_path / 'model_2080.pkl')
joblib.dump(grid_result2, models_path / 'model_disjoint.pkl')

['D:\\Productivity\\Studying\\PMLDL_A2\\models\\model_disjoint.pkl']