In [None]:
!nvidia-smi

In [None]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [2]:
# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

# Import RAPIDS/CUDF
import cudf, cuml, cupy
from cudf.core.dataframe import DataFrame as cu_df
from cudf.core.series import Series as cu_series
print('RAPIDS',cudf.__version__)

RAPIDS 24.04.01


In [3]:
# Import important libs
import numpy as np
import pandas as pd
import os
from functools import partial
import scipy as sp

# Import CV and Metric
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

# Import Model
import xgboost as xgb

# Import random to seed and made reproducible
import random

In [5]:
SEED = 42
NUM_FOLDS = 10
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
SAMPLE_SUBMISSION = 'sample_submission.csv'

In [6]:
# 設定隨機數生成的種子，以確保在不同的執行環境中，隨機數的生成結果是一致的
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(seed=SEED)

In [7]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)

In [None]:
train.describe()

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0
mean,1027.5,8.365175,0.527601,0.265058,2.398881,0.081856,16.955982,49.236868,0.996748,3.310569,0.641308,10.414972,5.720817
std,593.660397,1.70539,0.173164,0.188267,0.858824,0.023729,10.00971,32.961141,0.001827,0.142321,0.137942,1.028825,0.853146
min,0.0,5.0,0.18,0.0,1.2,0.012,1.0,7.0,0.99007,2.74,0.39,8.7,3.0
25%,513.75,7.2,0.39,0.09,1.9,0.071,8.0,22.0,0.9956,3.2,0.55,9.5,5.0
50%,1027.5,7.95,0.52,0.25,2.2,0.079,16.0,44.0,0.9967,3.31,0.61,10.1,6.0
75%,1541.25,9.2,0.64,0.42,2.6,0.09,24.0,65.0,0.9978,3.39,0.72,11.0,6.0
max,2055.0,15.9,1.58,0.76,14.0,0.414,68.0,289.0,1.00369,3.78,1.95,14.0,8.0


In [8]:
X = train.drop(columns=['quality', 'Id']) # train特徵
features = X.columns
X_test = test.drop(columns=['Id'])  # test特徵
y = train.quality # train目標變量

In [9]:
X.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0
mean,8.365175,0.527601,0.265058,2.398881,0.081856,16.955982,49.236868,0.996748,3.310569,0.641308,10.414972
std,1.70539,0.173164,0.188267,0.858824,0.023729,10.00971,32.961141,0.001827,0.142321,0.137942,1.028825
min,5.0,0.18,0.0,1.2,0.012,1.0,7.0,0.99007,2.74,0.39,8.7
25%,7.2,0.39,0.09,1.9,0.071,8.0,22.0,0.9956,3.2,0.55,9.5
50%,7.95,0.52,0.25,2.2,0.079,16.0,44.0,0.9967,3.31,0.61,10.1
75%,9.2,0.64,0.42,2.6,0.09,24.0,65.0,0.9978,3.39,0.72,11.0
max,15.9,1.58,0.76,14.0,0.414,68.0,289.0,1.00369,3.78,1.95,14.0


In [10]:
X_test.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,1372.0,1372.0,1372.0,1372.0,1372.0,1372.0,1372.0,1372.0,1372.0,1372.0,1372.0
mean,8.369606,0.524311,0.270569,2.415926,0.081307,17.162536,49.089286,0.99667,3.312223,0.647267,10.473081
std,1.664694,0.176942,0.185731,0.921155,0.026566,9.974082,32.803721,0.001822,0.141405,0.134792,1.02717
min,4.9,0.12,0.0,1.3,0.012,1.0,8.0,0.99007,2.74,0.39,8.8
25%,7.1,0.39,0.11,2.0,0.07,8.0,23.0,0.99538,3.21,0.55,9.6
50%,7.9,0.5,0.26,2.2,0.078,16.0,43.0,0.9966,3.31,0.62,10.2
75%,9.1,0.635,0.42,2.5,0.087,24.0,66.0,0.9978,3.39,0.73,11.2
max,15.6,1.33,0.76,8.3,0.467,68.0,165.0,1.0032,4.01,1.62,14.0


In [11]:
#  I posted some references on the discussion of where I got this snippet of code, but right now I dont remember exactly the notebook (btw thanks for the implementation!)
# 優化回歸模型的預測結果
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8

        ll = cohen_kappa_score(y, X_p, weights='quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        return X_p.astype('int')

    def coefficients(self):
        return self.coef_['x']

In [12]:
# 用來訓練XGBoost 模型，並使用交叉驗證來評估其性能。
def cross_valid(model, train, target, test, num_folds=10, random_state=42):

    train_oof = np.zeros((len(train)))  # 儲存訓練集預測結果
    test_preds = 0  # 儲存測試集預測結果

    kf = StratifiedKFold(n_splits=num_folds, random_state=SEED, shuffle=True)
    scores = []

    params = {
        'random_state': SEED,
        'objective': 'reg:squarederror',
        'nthread': -1,
        'tree_method': 'gpu_hist',

        'lambda': 5.619263832415134e-06,
        'alpha': 0.004114579891733313,
        'max_depth': 5,
        'eta': 0.04855597008155139,
        'gamma': 4.4150816096917164e-07,
        'min_child_weight': 20,
        'subsample': 0.4923174680251535,
        'colsample_bytree': 0.7120320651725672,
        'max_delta_step': 44.336096164825314,
        'num_boost_round': 1000
     }

    num_rounds = params['num_boost_round']

    xgb_train_preds = np.zeros(len(train.index), )

    coef = []

    for f, (train_ind, val_ind) in enumerate(kf.split(train, target)):

        train_df, val_df = train.iloc[train_ind][columns], train.iloc[val_ind][columns]

        train_target, val_target = target[train_ind], target[val_ind]

        xgb_x_train = pd.DataFrame(train_df)
        xgb_x_valid = pd.DataFrame(val_df)

        xgb_x_train_cudf = cu_df(xgb_x_train)
        y_train_cudf = cu_series(train_target)
        xgb_x_valid_cudf = cu_df(xgb_x_valid)
        y_valid_cudf = cu_series(val_target)

        trn_data = xgb.DMatrix(xgb_x_train_cudf, label=y_train_cudf)
        val_data = xgb.DMatrix(xgb_x_valid_cudf, label=y_valid_cudf)

        model = xgb.train(params,
                          trn_data,
                          num_rounds,
                          evals=[(val_data, "val_data")],
                          verbose_eval=False,
                          early_stopping_rounds=50
                          )

        xgb_valid_preds = model.predict(xgb.DMatrix(xgb_x_valid_cudf), iteration_range=(0, model.best_iteration))

        optR = OptimizedRounder()
        optR.fit(xgb_valid_preds, val_target)

        temp_oof = optR.predict(xgb_valid_preds, optR.coefficients())

        train_oof[val_ind] = temp_oof

        test_oof_preds = model.predict(xgb.DMatrix(test[columns]), iteration_range=(0, model.best_iteration))

        test_oof_preds = optR.predict(test_oof_preds, optR.coefficients())

        coef.append(optR.coefficients())

        print(optR.coefficients())

        test_preds += test_oof_preds / num_folds

        scores.append(cohen_kappa_score(val_target, temp_oof, weights='quadratic'))

        print("Fold ", f, " ", cohen_kappa_score(val_target, temp_oof, weights="quadratic"))

    print("Mean Kappa Score: ", np.mean(scores))
    print("Kappa Score OOF: ", cohen_kappa_score(y, train_oof, weights='quadratic'))

    return train_oof, test_preds, np.mean(scores), coef

In [13]:
# Here I can select which features I want to use
columns = features


# If you want to do ensemble is good to save the oof train / preds
train_oof_1, test_preds_1, score_oof_1, coef = cross_valid(None, X, y, X_test, num_folds=NUM_FOLDS, random_state=SEED)

[3.62694799 4.65836151 5.44878298 6.35276907 7.31117419]
Fold  0   0.5381165919282511
[3.52016539 4.46902873 5.82311603 6.07996234 7.60972437]
Fold  1   0.6404772482071459
[3.518144 4.572828 5.656332 6.19736  7.54938 ]
Fold  2   0.5543395265970985
[3.52249831 4.70788099 5.64122659 6.1237662  7.48674675]
Fold  3   0.6337527352297592
[3.59823124 4.52637845 5.6111593  6.10099922 7.67793807]
Fold  4   0.5207920077902406
[3.62875136 4.40749099 5.74333864 5.96876567 7.4739354 ]
Fold  5   0.6112033821531029
[3.58463435 4.74535246 5.47706843 6.25457604 7.52790132]
Fold  6   0.5216597341990464
[3.60600315 4.53348386 5.55757233 6.39421287 7.51761915]
Fold  7   0.6074695859041062
[3.56600236 4.58010848 5.60628505 6.29767433 7.53695623]
Fold  8   0.6316716575656156
[3.66414877 4.44857177 5.69529068 6.03880868 7.44358562]
Fold  9   0.5884482474905381
Mean Kappa Score:  0.5847930717064903
Kappa Score OOF:  0.5874918530053163


In [14]:
np.save('train_oof_xgb.npy', train_oof_1)
np.save('test_preds_xgb.npy', test_preds_1)

In [15]:
sample_submission['quality'] = test_preds_1.round().astype(int)
sample_submission.to_csv('submission.csv', index=False)
sample_submission

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,5
3,2059,6
4,2060,6
...,...,...
1367,3423,5
1368,3424,7
1369,3425,5
1370,3426,5
