In [1]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.cluster import KMeans
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score, cohen_kappa_score, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
# from lightgbm import LGBMClassifier, LGBMRegressor
# from xgboost import XGBClassifier, XGBRegressor
# from catboost import CatBoostClassifier

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/TS-S3-Ep5/train.csv'
file_key_2 = 'Tabular-Playground-Series/TS-S3-Ep5/test.csv'
file_key_3 = 'Tabular-Playground-Series/TS-S3-Ep5/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

## Enginering features
train['alcohol_density'] = train['alcohol'] * train['density']
train['sulphate/density'] = train['sulphates']  / train['density']
train['alcohol_sulphate'] = train['alcohol'] * train['sulphates']

test['alcohol_density'] = test['alcohol']  * test['density']
test['sulphate/density'] = test['sulphates']  / test['density']
test['alcohol_sulphate'] = test['alcohol'] * test['sulphates']

test_md = test.copy()

X = train[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]
Y = train['quality'] 

test_md = test_md[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]

# Optimal Rounder

In [3]:
from functools import partial
import numpy as np
import scipy as sp

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8

        ll = cohen_kappa_score(y, X_p, weights = 'quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        return X_p

    def coefficients(self):
        return self.coef_['x']

# Optuna Optimization

In [6]:
def objective(trial):
    
    ## Parameters to be evaluated
    param = dict(gamma = trial.suggest_float('gamma', 0.001, 100, log = True),
                 C = trial.suggest_float('C', 0.001, 100, log = True)
                 )

    
    scores = []

#     skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 42)
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        
        print(fold, end = ' ')
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train , y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.fit_transform(X_valid)
        
        model = SVR(**param)
        model.fit(X_train, y_train)

        preds_valid = model.predict(X_valid)
        
        optR = OptimizedRounder()
        optR.fit(preds_valid, y_valid)
        coef = optR.coefficients()
        preds_valid = optR.predict(preds_valid, coef).astype(int)
        
        score = cohen_kappa_score(y_valid,  preds_valid, weights = "quadratic")
        scores.append(score)
        
    return np.mean(scores)

In [7]:
study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 50, timeout = 3600)

[32m[I 2023-02-07 05:23:48,789][0m A new study created in memory with name: no-name-f47ef7a9-5ee6-423d-8a87-d8ce8d2a77fa[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:23:50,968][0m Trial 0 finished with value: 0.5323880393356217 and parameters: {'gamma': 0.005251583707631983, 'C': 44.366136196986396}. Best is trial 0 with value: 0.5323880393356217.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:23:53,100][0m Trial 1 finished with value: 0.5615095702383444 and parameters: {'gamma': 0.42024562480270605, 'C': 2.2711369345105425}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:23:55,078][0m Trial 2 finished with value: 0.3877953574569566 and parameters: {'gamma': 13.265553560082235, 'C': 0.006924694000439944}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:23:57,178][0m Trial 3 finished with value: 0.5387767064914922 and parameters: {'gamma': 0.019889009323938573, 'C': 4.8613890475466395}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:23:59,396][0m Trial 4 finished with value: 0.5087268395106057 and parameters: {'gamma': 0.4452231421625899, 'C': 0.015145620450498339}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:01,174][0m Trial 5 finished with value: 0.0 and parameters: {'gamma': 0.0026068971097006027, 'C': 0.001674515567833916}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:03,413][0m Trial 6 finished with value: 0.5353132736267053 and parameters: {'gamma': 0.002043907531065452, 'C': 2.770554501858864}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:05,651][0m Trial 7 finished with value: 0.5377031704138174 and parameters: {'gamma': 0.006829400395642031, 'C': 0.5964715220573397}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:07,647][0m Trial 8 finished with value: 0.37512514438301087 and parameters: {'gamma': 71.51491345252035, 'C': 0.44249416136359754}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:09,608][0m Trial 9 finished with value: 0.3789110002285571 and parameters: {'gamma': 37.31459408931413, 'C': 0.4365020273733838}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:11,730][0m Trial 10 finished with value: 0.552730235085923 and parameters: {'gamma': 0.35624492539446756, 'C': 0.0556489088942576}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:13,863][0m Trial 11 finished with value: 0.5525435404145778 and parameters: {'gamma': 0.46163636754804593, 'C': 0.06588990977901586}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:16,028][0m Trial 12 finished with value: 0.5411576645685316 and parameters: {'gamma': 0.10530777921899345, 'C': 0.06790582353706563}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:18,035][0m Trial 13 finished with value: 0.5069786218635336 and parameters: {'gamma': 2.4545920686420413, 'C': 0.07527970575905206}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:20,166][0m Trial 14 finished with value: 0.5510396249084565 and parameters: {'gamma': 0.08982631103210932, 'C': 7.194104542579919}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:22,710][0m Trial 15 finished with value: 0.4681795573627664 and parameters: {'gamma': 2.695354321330111, 'C': 65.94794557608202}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:24,848][0m Trial 16 finished with value: 0.5317997424097511 and parameters: {'gamma': 1.5533436317690894, 'C': 1.1228859470219785}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:27,050][0m Trial 17 finished with value: 0.5396273910007123 and parameters: {'gamma': 0.07972792186735836, 'C': 0.1430644412372276}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:29,146][0m Trial 18 finished with value: 0.5486643972405059 and parameters: {'gamma': 0.023241923117579, 'C': 1.5402925552910667}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:31,330][0m Trial 19 finished with value: 0.544665285107971 and parameters: {'gamma': 0.7804961264364222, 'C': 9.848159405993338}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:33,422][0m Trial 20 finished with value: 0.5567243183442983 and parameters: {'gamma': 0.22057180340917223, 'C': 0.19119271954759443}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:35,469][0m Trial 21 finished with value: 0.5446031345106611 and parameters: {'gamma': 0.18654228124057673, 'C': 0.18637876047809493}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:37,674][0m Trial 22 finished with value: 0.5366715019172215 and parameters: {'gamma': 0.2791168802730611, 'C': 0.024905307839819497}. Best is trial 1 with value: 0.5615095702383444.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:39,776][0m Trial 23 finished with value: 0.5626052989188715 and parameters: {'gamma': 0.8520490797131716, 'C': 0.1837129101657571}. Best is trial 23 with value: 0.5626052989188715.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:41,843][0m Trial 24 finished with value: 0.5456416240162024 and parameters: {'gamma': 1.0869951619987182, 'C': 0.9536491299896149}. Best is trial 23 with value: 0.5626052989188715.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:43,904][0m Trial 25 finished with value: 0.4835141094078835 and parameters: {'gamma': 5.152274028468974, 'C': 0.2304583681711223}. Best is trial 23 with value: 0.5626052989188715.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:46,004][0m Trial 26 finished with value: 0.5660708973431483 and parameters: {'gamma': 1.0542080231873707, 'C': 0.2771400815744156}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:48,107][0m Trial 27 finished with value: 0.45322431507757377 and parameters: {'gamma': 6.361032914771076, 'C': 2.193163535859739}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:50,217][0m Trial 28 finished with value: 0.5557720448766797 and parameters: {'gamma': 1.0276470883619806, 'C': 0.531535636355389}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:52,488][0m Trial 29 finished with value: 0.5408045349907533 and parameters: {'gamma': 0.654703407122767, 'C': 15.863854819018554}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:54,854][0m Trial 30 finished with value: 0.4556710036978318 and parameters: {'gamma': 1.8646869734553646, 'C': 38.59930857917014}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:56,956][0m Trial 31 finished with value: 0.5551471766394226 and parameters: {'gamma': 0.2814525930682202, 'C': 0.26958243240710816}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:24:59,119][0m Trial 32 finished with value: 0.5621645821526264 and parameters: {'gamma': 0.7464340943649804, 'C': 0.7923226909000245}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:01,244][0m Trial 33 finished with value: 0.5429263071585507 and parameters: {'gamma': 0.9372258491472305, 'C': 3.164217673058026}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:03,345][0m Trial 34 finished with value: 0.46248021874777134 and parameters: {'gamma': 5.086310993344702, 'C': 1.248903531622651}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:05,549][0m Trial 35 finished with value: 0.5569881387868312 and parameters: {'gamma': 0.504738268388753, 'C': 0.8489714904434914}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:07,675][0m Trial 36 finished with value: 0.4596645781310363 and parameters: {'gamma': 2.6717214857411165, 'C': 2.980670546160694}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:09,665][0m Trial 37 finished with value: 0.43393213650896734 and parameters: {'gamma': 10.528592892957082, 'C': 0.36362728745024964}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:11,790][0m Trial 38 finished with value: 0.5335780338204698 and parameters: {'gamma': 1.447269225259647, 'C': 0.6282737785558768}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:13,948][0m Trial 39 finished with value: 0.5417060385733811 and parameters: {'gamma': 0.1533450509918624, 'C': 1.673898630070168}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:16,054][0m Trial 40 finished with value: 0.5507717459580455 and parameters: {'gamma': 0.5141650714604057, 'C': 4.344362636240653}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:18,216][0m Trial 41 finished with value: 0.5599469493175194 and parameters: {'gamma': 0.5659240824896566, 'C': 0.9047383968748925}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:20,279][0m Trial 42 finished with value: 0.5607804991899986 and parameters: {'gamma': 0.7668223949730968, 'C': 0.7634292296300069}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:22,496][0m Trial 43 finished with value: 0.5645772100492813 and parameters: {'gamma': 0.992648090306339, 'C': 0.4264150028821251}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:24,587][0m Trial 44 finished with value: 0.5520625987642875 and parameters: {'gamma': 0.31599924118207884, 'C': 0.3650731185416263}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:26,676][0m Trial 45 finished with value: 0.5512869964331861 and parameters: {'gamma': 1.6041419682218034, 'C': 0.1277433425659228}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:28,785][0m Trial 46 finished with value: 0.49397093311533347 and parameters: {'gamma': 3.8442467409876486, 'C': 0.31512133654248103}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:30,827][0m Trial 47 finished with value: 0.5111080659546994 and parameters: {'gamma': 2.703943350746687, 'C': 0.43770296647500184}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:32,889][0m Trial 48 finished with value: 0.5559021101583037 and parameters: {'gamma': 1.2011031019690146, 'C': 0.10833093440576959}. Best is trial 26 with value: 0.5660708973431483.[0m


0 1 2 3 4 

[32m[I 2023-02-07 05:25:34,956][0m Trial 49 finished with value: 0.558608542602665 and parameters: {'gamma': 0.40872137142354886, 'C': 1.8303947620833592}. Best is trial 26 with value: 0.5660708973431483.[0m


# SVM Modeling

In [11]:
test_md.head()

Unnamed: 0,sulphate/density,alcohol_density,alcohol,sulphates
0,0.591491,9.775304,9.8,0.59
1,0.682827,9.9586,10.0,0.68
2,0.651472,9.47853,9.5,0.65
3,0.653398,11.63916,11.7,0.65
4,0.483111,12.717568,12.8,0.48


In [12]:
svm_cv_scores = list()
preds = list()

skf = KFold(n_splits = 5, shuffle = True, random_state = 42)
    
for train_ix, test_ix in skf.split(X, Y):
        
    ## Splitting the data 
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
    
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    test_md = scaler.fit_transform(test_md)
    
    ## Building the model
    svm_md = SVR(gamma = 1.0542080231873707, C = 0.2771400815744156).fit(X_train, Y_train)
    
    ## Predicting on X_test and test
    svm_pred_1 = svm_md.predict(X_test)
    svm_pred_2 = svm_md.predict(test_md)
        
    ## Applying Optimal Rounder (using abhishek approach)
    optR = OptimizedRounder()
    optR.fit(svm_pred_1, Y_test)
    coef = optR.coefficients()
    svm_pred_1 = optR.predict(svm_pred_1, coef).astype(int)
    svm_pred_2 = optR.predict(svm_pred_2, coef).astype(int)
        
    ## Computing roc-auc score
    svm_cv_scores.append(cohen_kappa_score(Y_test, svm_pred_1, weights = 'quadratic'))
    preds.append(svm_pred_2)

svm_cv_score = np.mean(svm_cv_scores)    
print('The average weighted quadratic kappa score over 5-folds is:', svm_cv_score)

The average weighted quadratic kappa score over 5-folds is: 0.5660708973431483


In [13]:
svm_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ]

submission['quality'] = svm_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,5
2,2058,5
3,2059,6
4,2060,6


In [14]:
submission['quality'].value_counts()

5    748
6    418
7    206
Name: quality, dtype: int64

In [15]:
submission.to_csv('SVM_Reg_FE_1.csv', index = False)