**Model Training for LGBM**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the prepared modelling table; the path is relative to the notebook's
# working directory.
DATASET_CSV = 'model_building_dataset1.csv'
data = pd.read_csv(DATASET_CSV)
data.head()

Unnamed: 0,label,word_len_review,string_len_review,aaa,abc,ability,abit,able,abroad,absolute,...,young,younger,yr,yuck,yum,yummy,yunque,zero,zone,zoo
0,1,0.04158,0.040797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1,0.126299,0.122241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.109148,0.102772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.04262,0.041317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.095634,0.091922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature Selection (Chi-Squared) and Label Encoding

In [4]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [5]:
# Score every feature column (columns 1 onwards) against the label (column 0)
# with a chi-squared test.  k='all' keeps every column, so this cell only
# produces the scores; the thresholding is done in the following cells.
# NOTE(review): the selector is fitted on every row of `data`, i.e. before the
# train/test split further down, so rows that are later held out for testing
# contribute to these scores.
model = SelectKBest(chi2, k='all')
fit = model.fit(X=data.iloc[:, 1:], y=data.iloc[:, 0])
scores = np.around(fit.scores_, decimals=3)
scores

array([7.522, 5.746, 0.392, ..., 0.911, 0.639, 1.64 ])

In [6]:
# Column positions in `data` of the features whose rounded chi2 score is above
# 0.5.  The scores were computed on columns 1 onwards (column 0 is the label),
# so every position is shifted by one.
idx_cols = list(np.where(scores > 0.5)[0] + 1)
idx_cols[:5]

[1, 2, 6, 9, 10]

In [7]:
# (rows, columns) of the full table, before the column subset is taken below.
data.shape

(20491, 5003)

In [8]:
# Keep the label (column 0) together with the feature columns listed in
# `idx_cols`.
# NOTE(review): this rebinds `data`.  Running the cell a second time would
# apply `idx_cols` (positions in the original frame) to the already reduced
# frame.
label_column = data.iloc[:, 0]
kept_features = data.iloc[:, idx_cols]
data = pd.concat([label_column, kept_features], axis='columns')
data.head()

Unnamed: 0,label,word_len_review,string_len_review,abit,absolute,absolutely,absolutley,absolutly,abundant,ac,...,yellow,yoga,yoghurt,yogurt,york,yuck,yummy,zero,zone,zoo
0,1,0.04158,0.040797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1,0.126299,0.122241,0.0,0.0,0.0,0.0,0.0,0.0,0.062472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.109148,0.102772,0.0,0.0,0.0,0.0,0.0,0.0,0.077393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.04262,0.041317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.095634,0.091922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# (rows, columns) after the column subset: the label plus the kept features.
data.shape

(20491, 3340)

In [10]:
# Re-encode the labels -1 / 0 / 1 as the consecutive integers 0 / 1 / 2
# (the model below is configured with num_class = 3).
label_encoding = {-1: 0, 0: 1, 1: 2}

# Series.map silently turns every value that is missing from the mapping into
# NaN.  That is exactly what happens if this cell is executed twice (labels are
# then already 0/1/2, and 2 has no entry).  Check first and fail loudly so that
# `data` is never left half-corrupted.
unexpected_labels = set(data['label']) - set(label_encoding)
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values {unexpected_labels!r}; "
        f"expected only {sorted(label_encoding)}. "
        "Was this cell already run on `data`?"
    )

data['label'] = data['label'].map(label_encoding)
data

Unnamed: 0,label,word_len_review,string_len_review,abit,absolute,absolutely,absolutley,absolutly,abundant,ac,...,yellow,yoga,yoghurt,yogurt,york,yuck,yummy,zero,zone,zoo
0,2,0.041580,0.040797,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.126299,0.122241,0.0,0.0,0.00000,0.0,0.0,0.0,0.062472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.109148,0.102772,0.0,0.0,0.00000,0.0,0.0,0.0,0.077393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0.042620,0.041317,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0.095634,0.091922,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20486,2,0.053015,0.051200,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20487,2,0.016632,0.019469,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20488,0,0.029106,0.029650,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20489,0,0.402287,0.409675,0.0,0.0,0.04139,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Building

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [12]:
# Feature matrix = every column after the first; target = first column (label).
X, y = data.iloc[:, 1:], data.iloc[:, 0]

In [13]:
# Hold out 20% of the rows as the test set; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(16392, 3339) (16392,) (4099, 3339) (4099,)


In [14]:
# Re-assemble the training rows into one frame (label first, then features)
# and renumber the rows 0..n-1.
# drop=True discards the old row labels directly.  The earlier two-step form
# (reset_index() followed by dropping the 'index' column in place) fails if a
# feature column happens to be called 'index', and mutated the frame in place.
train_data = pd.concat([y_train, X_train], axis=1).reset_index(drop=True)
train_data

Unnamed: 0,label,word_len_review,string_len_review,abit,absolute,absolutely,absolutley,absolutly,abundant,ac,...,yellow,yoga,yoghurt,yogurt,york,yuck,yummy,zero,zone,zoo
0,2,0.033784,0.033068,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.017672,0.018429,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.091476,0.092888,0.0,0.092027,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0.012474,0.013004,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0.093035,0.088950,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16387,1,0.061850,0.059077,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16388,0,0.049376,0.047187,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16389,0,0.024428,0.024151,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16390,2,0.007277,0.007654,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# NOTE: X and y are rebound here.  From this cell on they hold only the
# training rows (features / label of `train_data`), no longer the full dataset.
X, y = train_data.iloc[:, 1:], train_data.iloc[:, 0]

In [16]:
# Split the training rows once more: 80% to fit on, 20% as a validation set,
# again stratified on the label and with a fixed seed.
X_trn, X_val, y_trn, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
print(*(part.shape for part in (X_trn, y_trn, X_val, y_val)))

(13113, 3339) (13113,) (3279, 3339) (3279,)


## LGBM

In [17]:
import lightgbm as lgb

In [42]:
# Wrap each feature/label pair in a LightGBM Dataset.
# Inner split: fitting rows and validation rows.
d_trn, d_val = lgb.Dataset(X_trn, label=y_trn), lgb.Dataset(X_val, label=y_val)
# Outer split: complete training set and held-out test set.
d_train, d_test = lgb.Dataset(X_train, label=y_train), lgb.Dataset(X_test, label=y_test)
d_train

<lightgbm.basic.Dataset at 0x238a91da880>

In [None]:
# Alternative parameter set, disabled by wrapping it in a string literal.
# Nothing is assigned here; the `para` used for training is defined in the
# next cell.
'''para = {'boosting_type': 'gbdt','objective': 'multiclass','num_class':3,'subsample': 0.5,'subsample_freq': 1,
        'learning_rate': 0.03,'num_leaves': 2**11-1,'min_data_in_leaf': 2**12-1,'feature_fraction': 0.5,
        'max_bin': 100,'n_estimators': 500,'boost_from_average': False,"random_seed":42}'''

In [25]:
# Training parameters handed to lgb.train.
# Keys use LightGBM's own parameter names throughout.  The previous spelling
# mixed them with scikit-learn-style aliases (n_estimators, max_leaf_nodes,
# random_seed, min_samples_leaf, subsample); the values are unchanged.
para = {
    'boosting': 'gbdt',           # traditional Gradient Boosting Decision Tree
    'objective': 'multiclass',    # softmax objective for multi-class classification
    'num_class': 3,               # labels are encoded as 0, 1, 2
    'num_iterations': 100,        # boosting rounds; a value given here takes precedence
                                  # over lgb.train's num_boost_round argument
    'max_bin': 255,               # max number of bins that feature values are bucketed in
    'learning_rate': 0.1,         # shrinkage rate
    'num_leaves': 31,             # max number of leaves in one tree
    'seed': 1,                    # master random seed
    'max_depth': -1,              # tree depth limit; <= 0 means no limit
    'min_data_in_leaf': 20,       # minimal number of rows in one leaf (guards against over-fitting)
    'bagging_fraction': 1,        # fraction of rows sampled without replacement; 1 = use every row
                                  # (row sampling is only active when bagging_freq > 0)
    'feature_fraction': 1,        # fraction of features sampled for each tree; 1 = use every feature
    'metric': ['multi_logloss', 'multi_error'],
#     'verbosity': 5  # controls the level of LightGBM's verbosity
}

In [26]:
# Fit on the inner training split; the metrics configured in `para` are
# evaluated on the validation split after every boosting round.
# The number of rounds is intentionally not passed here: `para` already carries
# an iteration count, and LightGBM gives values in `params` precedence over the
# `num_boost_round` argument, so the former `num_boost_round=200` had no effect.
model = lgb.train(para, train_set=d_trn, valid_sets=[d_val])

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 184528
[LightGBM] [Info] Number of data points in the train set: 13113, number of used features: 2986
[LightGBM] [Info] Start training from score -1.852355
[LightGBM] [Info] Start training from score -2.239277
[LightGBM] [Info] Start training from score -0.305714
[1]	valid_0's multi_logloss: 0.706021	valid_0's multi_error: 0.263495
[2]	valid_0's multi_logloss: 0.671596	valid_0's multi_error: 0.263495
[3]	valid_0's multi_logloss: 0.64471	valid_0's multi_error: 0.263495
[4]	valid_0's multi_logloss: 0.622801	valid_0's multi_error: 0.26014
[5]	valid_0's multi_logloss: 0.604902	valid_0's multi_error: 0.238182
[6]	valid_0's multi_logloss: 0.590192	valid_0's multi_error: 0.221714
[7]	valid_0's multi_logloss: 0.576809	valid_0's multi_error: 0.21043
[8]	valid_0's multi_logloss: 0.565503	valid_0's multi_error: 0.204636
[9]	valid_0's multi_logloss: 0.555319	valid_0's multi_error: 0.198231
[10]	valid_0's multi_l

In [44]:
# Refit on the complete training set (d_train is built from X_train / y_train);
# this rebinds `model`, replacing the booster from the validation run.
# The number of rounds is intentionally not passed here: `para` already carries
# an iteration count, and LightGBM gives values in `params` precedence over the
# `num_boost_round` argument, so the former `num_boost_round=200` had no effect.
model = lgb.train(para, train_set=d_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 215610
[LightGBM] [Info] Number of data points in the train set: 16392, number of used features: 3240
[LightGBM] [Info] Start training from score -1.852498
[LightGBM] [Info] Start training from score -2.238893
[LightGBM] [Info] Start training from score -0.305739


In [45]:
# Predicted class probabilities for the training and the test rows
# (one row per sample, one column per class).
train_pred, test_pred = model.predict(X_train), model.predict(X_test)
test_pred

array([[0.9319025 , 0.036423  , 0.0316745 ],
       [0.00377742, 0.02444441, 0.97177817],
       [0.00256905, 0.01190497, 0.98552598],
       ...,
       [0.00192   , 0.00219468, 0.99588532],
       [0.70931651, 0.24003072, 0.05065277],
       [0.00273488, 0.02117528, 0.97608983]])

In [47]:
# Predicted class = index of the largest probability in each row.
# A single row-wise argmax replaces the per-row Python loop; the result is
# still a list of integer class indices.
y_train_pred = list(np.argmax(train_pred, axis=1))
y_test_pred = list(np.argmax(test_pred, axis=1))

In [48]:
# Print accuracy and the class-frequency-weighted F1 / precision / recall,
# first for the training rows and then for the held-out test rows.
for heading, truth, predicted in (
    ('Training Scores:-', y_train, y_train_pred),
    ('\nTesting Scores:-', y_test, y_test_pred),
):
    print(heading)
    print('Accuracy Score: ', round(accuracy_score(truth, predicted), 3))
    print('F1 Score: ', round(f1_score(truth, predicted, average='weighted'), 3))
    print('Precision Score: ', round(precision_score(truth, predicted, average='weighted'), 3))
    print('Recall Score: ', round(recall_score(truth, predicted, average='weighted'), 3))

Training Scores:-
Accuracy Score:  0.966
F1 Score:  0.964
Precision Score:  0.966
Recall Score:  0.966

Testing Scores:-
Accuracy Score:  0.849
F1 Score:  0.829
Precision Score:  0.823
Recall Score:  0.849
