In [None]:
import os

import matplotlib as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Modeling imports
import optuna
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from optuna.samplers import TPESampler
from xgboost  import XGBClassifier

# Directory holding the CSV inputs. The default is the original local folder;
# set the KAGGLE_S3E5_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
path = os.environ.get(
    "KAGGLE_S3E5_DATA_DIR",
    "/Users/tcaron/Documents/Python Scripts/KaggleS3E5/data/",
)

# os.path.join works whether or not `path` ends with a separator.
train = pd.read_csv(os.path.join(path, "train.csv"))
test = pd.read_csv(os.path.join(path, "test.csv"))
origin = pd.read_csv(os.path.join(path, "WineQT.csv"))

In [None]:
def SelfSplitTrain(df, test_size=0.2, random_state=42):
    """Split a labelled frame into train/validation features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Id" and "quality"; both are removed from
        the feature matrix and "quality" becomes the target.
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded unchanged to ``train_test_split``. The default reproduces
        the previously hard-coded seed.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``. The targets are NumPy arrays
        with a single column (shape ``(n, 1)``) because the target is
        selected with double brackets.
    """
    features = df.drop(columns=["Id", "quality"])
    # Double brackets keep the historical column-vector shape of the target.
    target = df[["quality"]].values
    X_train, X_val, y_train, y_val = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return (X_train, X_val, y_train, y_val)

In [None]:
def pca(train,test,columns=["pH","fixed acidity"]):
    """Encode the target and collapse `columns` into one principal component.

    Works on deep copies, so the frames passed in are left untouched.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain "quality" and every name in `columns`.
    test : pandas.DataFrame
        Must contain every name in `columns`.
    columns : list of str
        Columns replaced by the single feature "pca_1".

    Returns
    -------
    tuple
        ``(df_trn, df_tst)``: the copies with "pca_1" added and `columns`
        removed. In ``df_trn`` the "quality" values 3..8 are re-coded to
        0..5; any other value maps to NaN.
    """
    target = "quality"
    cols = list(columns)  # private copy: the shared default list is never mutated
    df_trn = train.copy(deep=True)
    df_tst = test.copy(deep=True)

    df_trn[target] = df_trn[target].map({3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5})

    pca_ = PCA(n_components=1, whiten=False)
    # Fit the projection on the training rows only ...
    df_trn["pca_1"] = np.ravel(pca_.fit_transform(df_trn[cols]))
    # ... and re-use that same projection for the test rows. Re-fitting on
    # the test set would place train and test "pca_1" on different axes.
    df_tst["pca_1"] = np.ravel(pca_.transform(df_tst[cols]))

    df_trn = df_trn.drop(columns=cols)
    df_tst = df_tst.drop(columns=cols)
    return (df_trn, df_tst)

def sortie_prep(y_pred):
    """Shift encoded predictions by +3.

    This undoes the 3..8 -> 0..5 re-coding that `pca` applies to the
    "quality" column. Works on anything that supports ``+ 3`` (scalars,
    NumPy arrays, pandas objects).
    """
    offset = 3
    shifted = y_pred + offset
    return shifted

In [None]:
# NOTE: this cell rebinds `train` and `test`, so it must be run exactly once
# after the data-loading cell (a second run would re-encode the target and
# look for columns that `pca` has already removed).

# Independent copy of the ids: a prediction column is attached to this frame
# later, and assigning into a slice of `test` triggers SettingWithCopyWarning.
Id = test[["Id"]].copy()
# Stack the rows of `origin` under the competition training rows.
train = pd.concat([train, origin], ignore_index=True)
train, test = pca(train, test)
X_test = test.drop(columns="Id")
X_train, X_val, y_train, y_val = SelfSplitTrain(train)

def objective(trial):
    """Optuna objective for XGBoost on the fixed hold-out split.

    Samples one hyper-parameter set from `trial`, fits an ``XGBClassifier``
    on the module-level ``X_train`` / ``y_train`` with early stopping on
    ``X_val`` / ``y_val``, and scores the validation predictions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the sampled hyper-parameters.

    Returns
    -------
    float
        Quadratic-weighted Cohen kappa on the validation split (to maximise).
    """
    # Number of distinct target classes. Counting unique values of
    # value_counts() would count distinct class *frequencies* instead and
    # under-report whenever two classes happen to be equally frequent.
    n_classes = train["quality"].nunique()
    params_optuna = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 1.0),
        # Objective name without the stray leading blank.
        'objective': "multi:softmax",
        'num_class': n_classes,
    }

    # No 'n_cv' parameter is sampled: the model is scored on a single
    # hold-out split, so a fold count would be a no-op dimension that only
    # adds noise to the search and leaks into study.best_params.
    model = XGBClassifier(**params_optuna)
    # NOTE(review): recent xgboost releases expect early_stopping_rounds on
    # the constructor rather than in fit() - confirm against the installed
    # version before upgrading.
    model.fit(X_train,
              y_train,
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50,
              verbose=500)

    pred_val = model.predict(X_val)
    return float(cohen_kappa_score(y_val, pred_val, weights='quadratic'))


# Seeded sampler (same seed as the threshold search later in the notebook)
# so the hyper-parameter search does not start from a different random state
# on every kernel restart.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(func=objective, n_trials=500)
print(study.best_params)

In [None]:
# Hyper-parameters forwarded verbatim to XGBClassifier(**params_xgb).
# The search by-product 'n_cv' is intentionally not listed: it is not an
# XGBoost parameter and would be passed to the model constructor.
params_xgb = {'max_depth': 9,
              'learning_rate': 0.10170219296989287,
              'n_estimators': 939,
              'min_child_weight': 4,
              'gamma': 0.2873681880046375,
              'subsample': 0.6582602969527163,
              'colsample_bytree': 0.7214980206445512,
              'reg_alpha': 0.33596370591990027,
              'reg_lambda': 0.5061368887005409}

def objective2(trial):
    """Optuna objective for LightGBM on the fixed hold-out split.

    Samples one hyper-parameter set from `trial`, fits an ``LGBMClassifier``
    on the module-level ``X_train`` / ``y_train`` with early stopping on
    ``X_val`` / ``y_val``, and scores the validation predictions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the sampled hyper-parameters.

    Returns
    -------
    float
        Quadratic-weighted Cohen kappa on the validation split (to maximise).
    """
    params_optuna = {
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 1),
        'lambda_l2': trial.suggest_float('lambda_l2', 1, 10),
        'num_leaves': trial.suggest_int('num_leaves', 40, 60),
        # LightGBM requires both fractions to be strictly positive, so the
        # search range no longer starts at 0.
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        # 'min_child_samples' is an alias of 'min_data_in_leaf'; tuning both
        # is redundant, so only the primary name is sampled.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'num_iterations': trial.suggest_int('num_iterations', 100, 10000),
        'objective': "multiclass",
        'metric': 'multi_logloss',
    }

    # No 'n_cv' parameter is sampled: the model is scored on a single
    # hold-out split, so a fold count would be a no-op search dimension.
    model = LGBMClassifier(**params_optuna)
    # Early stopping and periodic logging go through the callback API that
    # this notebook already imports from lightgbm.
    model.fit(X_train,
              y_train,
              eval_set=[(X_val, y_val)],
              callbacks=[early_stopping(stopping_rounds=50),
                         log_evaluation(period=500)])

    pred_val = model.predict(X_val)
    return float(cohen_kappa_score(y_val, pred_val, weights='quadratic'))


# Seeded sampler (same seed as the threshold search later in the notebook)
# so the hyper-parameter search does not start from a different random state
# on every kernel restart.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(func=objective2, n_trials=500)
print(study.best_params)

In [None]:
# Hyper-parameters forwarded verbatim to LGBMClassifier(**params_lgbm).
# Two keys from the raw search output are intentionally not listed:
#   * 'n_cv'              - not a LightGBM parameter;
#   * 'min_child_samples' - an alias of 'min_data_in_leaf', which is already
#                           set below, so the alias value had no effect.
params_lgbm = {'lambda_l1': 0.06032958694472297,
               'lambda_l2': 9.468684954887555,
               'num_leaves': 49,
               'feature_fraction': 0.42024961425671364,
               'bagging_fraction': 0.6392661414098254,
               'bagging_freq': 9,
               'min_data_in_leaf': 14,
               'max_depth': 8,
               'num_iterations': 3723}

In [None]:
# Rebinding drop with errors='ignore': re-running this cell after "Id" is
# already gone is a no-op instead of a KeyError.
train = train.drop(columns='Id', errors='ignore')

In [None]:
# Rebinding drop with errors='ignore': re-running this cell after "Id" is
# already gone is a no-op instead of a KeyError.
test = test.drop(columns='Id', errors='ignore')

In [None]:
TARGET = 'quality'
# Feature columns come from `train` itself rather than from a leftover
# X_train, so the cell does not depend on which cell last assigned X_train.
features = [col for col in train.columns if col not in ('Id', TARGET)]
k = 10
# Minimum validation kappa a fold model must exceed for its test predictions
# to be kept in the ensemble.
MIN_FOLD_KAPPA = 0.36

# 'n_cv' may still be present in the tuned dictionaries as a by-product of
# the Optuna search; it is not a model parameter, so it is filtered out.
xgb_kwargs = {name: value for name, value in params_xgb.items() if name != 'n_cv'}
lgbm_kwargs = {name: value for name, value in params_lgbm.items() if name != 'n_cv'}

cv = StratifiedKFold(k, shuffle=True, random_state=42)
fold_scores = []
test_preds = []
oof_preds = []
oof_true = []


def _score_fold_model(model, X_fold_val, y_fold_val):
    """Score a fitted model on its validation fold.

    Returns ``(validation predictions, quadratic-weighted kappa)``. When the
    kappa exceeds MIN_FOLD_KAPPA the model's test-set predictions and its
    score are appended to ``test_preds`` / ``fold_scores``.
    """
    pred_val = model.predict(X_fold_val)
    score = cohen_kappa_score(y_fold_val, pred_val, weights='quadratic')
    # discard the predictions of poor performing models
    if score > MIN_FOLD_KAPPA:
        test_preds.append(model.predict(test[features]))
        fold_scores.append(score)
    return pred_val, score


for i, (train_idx, val_idx) in enumerate(cv.split(train[features],
                                                 train[TARGET])):
    # cv.split yields positional indices, hence iloc. Fold-local names are
    # used so the hold-out split X_train/X_val/y_train/y_val used by the
    # Optuna objectives is not overwritten.
    fold_train = train.iloc[train_idx]
    fold_val = train.iloc[val_idx]
    X_fold_train, y_fold_train = fold_train[features], fold_train[TARGET]
    X_fold_val, y_fold_val = fold_val[features], fold_val[TARGET]

    #**************XGB***************
    model1 = XGBClassifier(**xgb_kwargs)
    # NOTE(review): recent xgboost releases expect early_stopping_rounds on
    # the constructor rather than in fit() - confirm against the installed
    # version before upgrading.
    model1.fit(X_fold_train,
               y_fold_train,
               eval_set=[(X_fold_val, y_fold_val)],
               early_stopping_rounds=200,
               verbose=200)
    pred_val1, score1 = _score_fold_model(model1, X_fold_val, y_fold_val)

    #************** Light GBM **************
    model2 = LGBMClassifier(**lgbm_kwargs)
    # Early stopping / logging through the callbacks imported from lightgbm.
    model2.fit(X_fold_train,
               y_fold_train,
               eval_set=[(X_fold_val, y_fold_val)],
               callbacks=[early_stopping(stopping_rounds=200),
                          log_evaluation(period=200)])
    pred_val2, score2 = _score_fold_model(model2, X_fold_val, y_fold_val)

    # Out-of-fold prediction = mean of the two models' predicted labels.
    oof_preds.extend(np.mean([pred_val1, pred_val2], axis=0))
    oof_true.extend(y_fold_val)
    print(f'=== Fold {i} Cohen Kappa Score {np.mean([score1,score2])} ===')

if fold_scores:
    print(f'=== Average Cohen Kappa Score {np.mean(fold_scores)} ===')
else:
    # Taking the mean of an empty list would only yield NaN plus a warning.
    print(f'=== No model exceeded the kappa threshold {MIN_FOLD_KAPPA}; test_preds is empty ===')

In [None]:
class OptunaRounder:
    """Optuna objective that searches cut points turning scores into labels.

    An instance is callable with an Optuna trial: it samples
    ``len(labels) - 1`` non-decreasing thresholds, bins ``y_pred`` with them
    and returns the quadratic-weighted Cohen kappa against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Reference labels; their sorted unique values become ``self.labels``.
    y_pred : array-like
        Continuous predictions to be binned.
    """

    def __init__(self, y_true, y_pred):
        self.y_true = y_true
        self.y_pred = y_pred
        self.labels = np.unique(y_true)

    def __call__(self, trial):
        """Sample thresholds from `trial` and return the resulting kappa.

        Threshold ``t{i}`` is drawn between the largest threshold sampled so
        far (or the smallest label for the first one) and the largest label.
        Returns 0 when the sampled thresholds cannot be used as bin edges.
        """
        lowest, highest = min(self.labels), max(self.labels)
        thresholds = []
        for i in range(len(self.labels) - 1):
            low = max(thresholds) if thresholds else lowest
            thresholds.append(trial.suggest_float(f't{i}', low, highest))
        try:
            opt_y_pred = self.adjust(self.y_pred, thresholds)
        except ValueError:
            # pd.cut raises ValueError for unusable bin edges (for example
            # two identical thresholds). Such a trial simply scores 0; any
            # other exception is a real bug and is allowed to propagate.
            return 0
        return cohen_kappa_score(self.y_true, opt_y_pred, weights='quadratic')

    def adjust(self, y_pred, thresholds):
        """Bin `y_pred` into ``self.labels`` using `thresholds` as cut points.

        `thresholds` may be any sequence; it is framed by -inf and +inf, so
        every finite value falls in a bin. Intervals are closed on the right
        (pandas default). Returns whatever ``pd.cut`` returns for `y_pred`.
        """
        edges = [-np.inf] + list(thresholds) + [np.inf]
        return pd.cut(y_pred, edges, labels=self.labels)

In [None]:
# Only warnings and errors from Optuna are shown during the threshold search.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Callable objective over the out-of-fold labels and averaged predictions.
# (This rebinds the name `objective`, previously the XGBoost objective.)
objective = OptunaRounder(y_true=oof_true, y_pred=oof_preds)
study = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction="maximize",
)
# The search is bounded by wall-clock time (100 seconds), not by trial count.
study.optimize(objective, timeout=100)

In [None]:
best_thresholds = sorted(study.best_params.values())
print(f'Optimized thresholds: {best_thresholds}')

if len(test_preds) == 0:
    raise ValueError("test_preds is empty: no fold model passed the kappa filter")

# Average the predictions of the retained fold models. The result gets its
# own name so `test_preds` is not overwritten and the cell can be re-run.
mean_test_preds = np.array(test_preds).mean(axis=0)
# Bin with the optimised thresholds, then map the encoded labels back with
# sortie_prep (+3).
opt_test_preds = sortie_prep(
    objective.adjust(mean_test_preds, best_thresholds).astype(int)
)
# assign() builds a new frame instead of writing a column into `Id` in place.
Id = Id.assign(quality=opt_test_preds)
print(Id.head(3))

In [None]:
# Distribution of the predicted quality labels in the submission frame.
Id.loc[:, 'quality'].value_counts()

In [None]:
# Write the submission (Id + quality) to the working directory, without the index.
Id.to_csv(path_or_buf="sample_submission.csv", index=False)