In [2]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.cluster import KMeans
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score, cohen_kappa_score, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
# from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

# import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/TS-S3-Ep5/train.csv'
file_key_2 = 'Tabular-Playground-Series/TS-S3-Ep5/test.csv'
file_key_3 = 'Tabular-Playground-Series/TS-S3-Ep5/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

## Enginering features
train['alcohol_density'] = train['alcohol'] * train['density']
train['sulphate/density'] = train['sulphates']  / train['density']
train['alcohol_sulphate'] = train['alcohol'] * train['sulphates']

test['alcohol_density'] = test['alcohol']  * test['density']
test['sulphate/density'] = test['sulphates']  / test['density']
test['alcohol_sulphate'] = test['alcohol'] * test['sulphates']

test_md = test.copy()

X = train[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]
Y = train['quality'] - 3

test_md = test_md[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]

# XGBoost Modeling like Crazy

In [14]:
XGB_cv_scores, XGB_imp = list(), list()
preds = list()

## Running 5 times CV
for i in range(100):
    
    print(i)
    ## Building RF model
    XGB_md = XGBClassifier(tree_method = 'hist',
                           colsample_bytree = 0.7, 
                           gamma = 5.5, 
                           learning_rate = 0.031, 
                           max_depth = 5, 
                           min_child_weight = 68, 
                           n_estimators = 8800, 
                           subsample = 0.41, 
                           random_state = i).fit(X, Y)
    XGB_imp.append(XGB_md.feature_importances_)
        
    ## Predicting on X_test and test
    XGB_pred_1 = XGB_md.predict(X)
    XGB_pred_2 = XGB_md.predict(test_md)
        
    ## Computing roc-auc score
    XGB_cv_scores.append(cohen_kappa_score(Y, XGB_pred_1, weights = 'quadratic'))
    preds.append(XGB_pred_2)

XGB_cv_score = np.mean(XGB_cv_scores)    
print('The average roc-auc score over 5-folds (run 5 times) is:', XGB_cv_score)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
The average roc-auc score over 5-folds (run 5 times) is: 0.5229620617745152


In [10]:
preds

[array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2, 3, 2, ..., 2, 2, 2]),
 array([2,

In [11]:
pred_out = pd.DataFrame(preds)
pred_out.loc[:, 0]

0     2
1     2
2     2
3     2
4     2
5     2
6     2
7     2
8     2
9     2
10    2
11    2
12    2
13    2
14    2
15    2
16    2
17    2
18    2
19    2
20    2
21    2
22    2
23    2
24    2
25    2
26    2
27    2
28    2
29    2
30    2
31    2
32    2
33    2
34    2
35    2
36    2
37    2
38    2
39    2
40    2
41    2
42    2
43    2
44    2
45    2
46    2
47    2
48    2
49    2
Name: 0, dtype: int64

In [17]:
XGB_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ] + 3

submission['quality'] = XGB_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,5
3,2059,6
4,2060,6


In [18]:
submission['quality'].value_counts()

5    635
6    565
7    172
Name: quality, dtype: int64

In [13]:
submission['quality'].value_counts()

5    635
6    565
7    172
Name: quality, dtype: int64

In [8]:
submission['quality'].value_counts()

5    640
6    564
7    168
Name: quality, dtype: int64

In [19]:
submission.to_csv('XGB_baseline_full_FE_11.csv', index = False)