In [1]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, roc_auc_score, f1_score, accuracy_score, classification_report

import os

In [None]:
"""
In this notebook, we use LightGBM as the baseline model for future comparison.
"""

In [2]:
# Load the yeast4 dataset and turn the textual class label into an integer:
# 'positive' becomes 1 and 'negative' becomes 0.
data = pd.read_csv('./datasets/yeast4.csv')
label_as_digit = data['Label'].map(
    lambda raw: raw.replace('positive', '1').replace('negative', '0')
)
data['Label'] = label_as_digit.astype('int')
# Summarise columns, dtypes and non-null counts as a quick sanity check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1484 entries, 0 to 1483
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Label   1484 non-null   int64  
 1   Mcg     1484 non-null   float64
 2    Gvh    1484 non-null   float64
 3    Alm    1484 non-null   float64
 4    Mit    1484 non-null   float64
 5    Erl    1484 non-null   float64
 6    Pox    1484 non-null   float64
 7    Vac    1484 non-null   float64
 8    Nuc    1484 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 104.5 KB


In [3]:
# Separate the predictors from the binary target column.
X = data.drop(columns='Label')
y = data['Label']

In [4]:
# Display the feature matrix; the notebook renders a truncated preview of its rows.
X

Unnamed: 0,Mcg,Gvh,Alm,Mit,Erl,Pox,Vac,Nuc
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22
3,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22
4,0.50,0.54,0.48,0.65,0.5,0.0,0.53,0.22
...,...,...,...,...,...,...,...,...
1479,0.36,0.48,0.61,0.26,0.5,0.0,0.50,0.16
1480,0.30,0.37,0.40,0.45,0.5,0.0,0.48,0.41
1481,0.38,0.40,0.39,0.19,0.5,0.0,0.46,0.62
1482,0.58,0.56,0.38,0.39,0.5,0.0,0.54,0.57


In [5]:
# Display the label Series (integer 0/1 after the conversion done at load time).
y

0       0
1       0
2       0
3       0
4       0
       ..
1479    0
1480    0
1481    0
1482    0
1483    1
Name: Label, Length: 1484, dtype: int64

In [6]:
# Hold out 20% of the samples as the test set, then carve 10% of the remainder
# off as a validation set.
#
# The intermediate "train + validation" split gets its own names instead of
# overwriting X / y: previously X and y were replaced by the 80% subset, so
# running this cell a second time silently shrank every split.  With the
# dedicated names the cell is idempotent and yields the same splits on a
# fresh run.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.1, random_state=1
)
# NOTE(review): the splits are not stratified although the positive class is
# rare; consider `stratify=` if the class ratio should be preserved per split.

# lightgbm.Dataset wrappers of each split (for the native lightgbm API); they
# are not referenced by the LGBMClassifier cells visible in this notebook.
train_data = lightgbm.Dataset(X_train, label=y_train)
val_data = lightgbm.Dataset(X_val, label=y_val)
test_data = lightgbm.Dataset(X_test, label=y_test)

In [34]:
def evaluate_model(X_test, y_test, model=None):
    """Print the accuracy and the classification report for a set of predictions.

    Both call forms used in this notebook are supported:

    * ``evaluate_model(X, y_true, model)`` -- predictions are obtained from
      ``model.predict(X)`` and scored against ``y_true``.
    * ``evaluate_model(y_true, y_pred)`` -- ``model`` is omitted and the two
      positional arguments are treated as ground truth and ready-made
      predictions.  Without this form the earlier two-argument calls raise
      ``TypeError`` when the notebook is run top to bottom.

    Parameters
    ----------
    X_test : features to predict on (three-argument form) or the true labels
        (two-argument form).
    y_test : the true labels (three-argument form) or the predicted labels
        (two-argument form).
    model : optional object exposing ``predict``; ``None`` selects the
        two-argument form.

    Returns
    -------
    None.  The metrics are printed to stdout.
    """
    if model is None:
        # Legacy form: the caller already computed the predictions.
        y_true, y_pred = X_test, y_test
    else:
        y_true, y_pred = y_test, model.predict(X_test)
    print('Accuracy of the model: {}\n'.format(accuracy_score(y_true, y_pred)))
    print('Classification report: \n{}\n'.format(classification_report(y_true, y_pred)))

In [8]:
# Hyper-parameters of the baseline LightGBM classifier for the yeast4 data.
# 'is_unbalance' enables LightGBM's built-in handling of imbalanced binary
# labels; 'verbose': -1 keeps the training log quiet.
lgbm_param = dict(
    n_estimators=3000,
    max_depth=100,
    objective='binary',
    num_leaves=400,
    feature_fraction=0.64,
    bagging_fraction=0.8,
    bagging_freq=1,
    learning_rate=0.02,
    verbose=-1,
    force_col_wise=True,
    is_unbalance=True,
    boosting_type='gbdt',
)

# Unfitted estimator; training happens in the next cell.
lgb = lightgbm.LGBMClassifier(**lgbm_param)

In [9]:
# Train the baseline model and score it on the validation split.
lgb.fit(X_train, y_train)
# Kept so the raw validation predictions remain available for inspection.
y_v_pred = lgb.predict(X_val)
# evaluate_model is defined as (features, labels, model); the earlier
# two-argument call raised TypeError on a fresh top-to-bottom run.
evaluate_model(X_val, y_val, lgb)

Accuracy of the model: 0.957983193277311

Classification report: 
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       114
           1       0.50      0.40      0.44         5

    accuracy                           0.96       119
   macro avg       0.74      0.69      0.71       119
weighted avg       0.95      0.96      0.96       119




In [10]:
# Score the baseline model on the held-out test split.
# Kept so the raw test predictions remain available for inspection.
y_t_pred = lgb.predict(X_test)
# evaluate_model is defined as (features, labels, model); the earlier
# two-argument call raised TypeError on a fresh top-to-bottom run.
evaluate_model(X_test, y_test, lgb)

Accuracy of the model: 0.9562289562289562

Classification report: 
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       287
           1       0.20      0.10      0.13        10

    accuracy                           0.96       297
   macro avg       0.58      0.54      0.56       297
weighted avg       0.94      0.96      0.95       297




In [11]:
# Build a positive-only (minority class) view of the test set so that the
# model's behaviour on the rare class can be inspected in isolation.
# Both frames get a fresh 0..n-1 index so they stay row-aligned.
y_ntest = y_test.reset_index(drop=True)
X_ntest = X_test.reset_index(drop=True)

# Boolean mask of the rows labelled 1; this replaces the manual loop over a
# dict of labels that collected the same row positions.
minority_mask = (y_ntest == 1).to_numpy()
X_mtest = X_ntest.loc[minority_mask]
y_mtest = y_ntest.loc[minority_mask]

In [12]:
# Evaluate the baseline model on the positive-only test subset.
# y_m_pred is displayed in the next cell.
y_m_pred = lgb.predict(X_mtest)
# evaluate_model is defined as (features, labels, model); the earlier
# two-argument call raised TypeError on a fresh top-to-bottom run.
evaluate_model(X_mtest, y_mtest, lgb)

Accuracy of the model: 0.1

Classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.10      0.18        10

    accuracy                           0.10        10
   macro avg       0.50      0.05      0.09        10
weighted avg       1.00      0.10      0.18        10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Raw predictions for the positive-only test subset: every 0 is a missed positive.
y_m_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [15]:
# dataset2
data1 = pd.read_csv('./datasets/winequality.csv')
data1['Label'] = data1['Label'].apply(lambda x: x.replace('positive', '1').replace('negative', '0')).astype('int')
# data['Label'] = data['Label'].astype('category')
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Label                1599 non-null   int64  
 1   FixedAcidity         1599 non-null   float64
 2    VolatileAcidity     1599 non-null   float64
 3    CitricAcid          1599 non-null   float64
 4    ResidualSugar       1599 non-null   float64
 5    Chlorides           1599 non-null   float64
 6    FreeSulfurDioxide   1599 non-null   float64
 7    TotalSulfurDioxide  1599 non-null   float64
 8    Density             1599 non-null   float64
 9    PH                  1599 non-null   float64
 10   Sulphates           1599 non-null   float64
 11   Alcohol             1599 non-null   float64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [16]:
# Separate the wine-quality predictors from the target, then preview the predictors.
X1 = data1.drop(columns='Label')
y1 = data1['Label']
X1

Unnamed: 0,FixedAcidity,VolatileAcidity,CitricAcid,ResidualSugar,Chlorides,FreeSulfurDioxide,TotalSulfurDioxide,Density,PH,Sulphates,Alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [17]:
# Display the wine-quality label Series (integer 0/1 after the conversion done at load time).
y1

0       0
1       0
2       0
3       0
4       0
       ..
1594    0
1595    0
1596    0
1597    0
1598    0
Name: Label, Length: 1599, dtype: int64

In [18]:
# Hold out 20% of the wine-quality samples as the test set, then carve 10% of
# the remainder off as a validation set.
#
# The intermediate "train + validation" split gets its own names instead of
# overwriting X1 / y1: previously they were replaced by the 80% subset, so
# running this cell a second time silently shrank every split.  With the
# dedicated names the cell is idempotent and yields the same splits on a
# fresh run.
X1_trainval, X1_test, y1_trainval, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)
X1_train, X1_val, y1_train, y1_val = train_test_split(
    X1_trainval, y1_trainval, test_size=0.1, random_state=1
)
# NOTE(review): the splits are not stratified although the positive class is
# rare; consider `stratify=` if the class ratio should be preserved per split.

# lightgbm.Dataset wrappers of each split (for the native lightgbm API); they
# are not referenced by the LGBMClassifier cells visible in this notebook.
train_data1 = lightgbm.Dataset(X1_train, label=y1_train)
val_data1 = lightgbm.Dataset(X1_val, label=y1_val)
test_data1 = lightgbm.Dataset(X1_test, label=y1_test)

In [19]:
# Hyper-parameters of the baseline LightGBM classifier for the wine-quality
# data.  'is_unbalance' enables LightGBM's built-in handling of imbalanced
# binary labels; 'verbose': -1 keeps the training log quiet.
lgbm_param1 = {
        'n_estimators': 500,
        'max_depth': 1000,
        'objective': 'binary',
        'num_leaves': 1000,
        'feature_fraction': 0.64,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'learning_rate':0.05,
        'verbose': -1,
        'force_col_wise': True,
        'is_unbalance': True,
        'boosting_type': 'gbdt',
}

lgb1 = lightgbm.LGBMClassifier(**lgbm_param1)
lgb1.fit(X1_train, y1_train)
# Kept so the raw validation predictions remain available for inspection.
y1_v_pred = lgb1.predict(X1_val)
# evaluate_model is defined as (features, labels, model); the earlier
# two-argument call raised TypeError on a fresh top-to-bottom run.
evaluate_model(X1_val, y1_val, lgb1)

Accuracy of the model: 0.953125

Classification report: 
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       122
           1       0.50      0.17      0.25         6

    accuracy                           0.95       128
   macro avg       0.73      0.58      0.61       128
weighted avg       0.94      0.95      0.94       128




In [20]:
# Build a positive-only (minority class) view of the wine-quality test set.
# Both frames get a fresh 0..n-1 index so they stay row-aligned.
y1_ntest = y1_test.reset_index(drop=True)
X1_ntest = X1_test.reset_index(drop=True)

# Boolean mask of the rows labelled 1; this replaces the manual loop over a
# dict of labels that collected the same row positions.
minority_mask1 = (y1_ntest == 1).to_numpy()
X1_mtest = X1_ntest.loc[minority_mask1]
y1_mtest = y1_ntest.loc[minority_mask1]

In [22]:
# Evaluate the wine-quality baseline on the positive-only test subset.
# y1_m_pred is displayed in the next cell.
y1_m_pred = lgb1.predict(X1_mtest)
# evaluate_model is defined as (features, labels, model); the earlier
# two-argument call raised TypeError on a fresh top-to-bottom run.
evaluate_model(X1_mtest, y1_mtest, lgb1)

Accuracy of the model: 0.0

Classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00      10.0

    accuracy                           0.00      10.0
   macro avg       0.00      0.00      0.00      10.0
weighted avg       0.00      0.00      0.00      10.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# Raw predictions for the positive-only wine test subset: every 0 is a missed positive.
y1_m_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [31]:
"""
We now try to use the Random OverSampling technique to resolve the imbalance problem.
"""
# Re-attach the labels so rows can be selected per class.
train_set = pd.concat([X_train, y_train], axis=1)
data_0 = train_set[train_set['Label'] == 0]
data_1 = train_set[train_set['Label'] == 1]
# Take the class sizes from the per-class frames themselves.  Unpacking
# value_counts() returned the counts ordered by frequency, not by label, so
# it was only correct while class 0 happened to be the majority.
count_0, count_1 = len(data_0), len(data_1)

# Draw positives with replacement until both classes have the same size.
# The seed is fixed so the oversampled set (and every model trained on it)
# is reproducible across runs; previously each run drew a different sample.
data_1_over = data_1.sample(count_0, replace=True, random_state=42)
data_over = pd.concat([data_0, data_1_over], axis=0)

print(data_over.Label.value_counts())

0    1032
1    1032
Name: Label, dtype: int64


In [32]:
# Split the oversampled training frame back into predictors and target.
X_over = data_over.drop(columns='Label')
y_over = data_over['Label']

In [95]:
# LightGBM settings for the model trained on the oversampled yeast4 data.
lgbm_param2 = dict(
    n_estimators=1000,
    max_depth=1000,
    objective='binary',
    num_leaves=100,
    learning_rate=0.01,
    verbose=-1,
    force_col_wise=True,
    is_unbalance=True,
    boosting_type='gbdt',
)

# Fit on the balanced training data, then score on the untouched validation split.
lgb2 = lightgbm.LGBMClassifier(**lgbm_param2)
lgb2.fit(X=X_over, y=y_over)
evaluate_model(X_test=X_val, y_test=y_val, model=lgb2)

Accuracy of the model: 0.9831932773109243

Classification report: 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       114
           1       1.00      0.60      0.75         5

    accuracy                           0.98       119
   macro avg       0.99      0.80      0.87       119
weighted avg       0.98      0.98      0.98       119




In [96]:
# Score the oversampling-trained model on the held-out test split.
evaluate_model(X_test=X_test, y_test=y_test, model=lgb2)

Accuracy of the model: 0.9629629629629629

Classification report: 
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       287
           1       0.40      0.20      0.27        10

    accuracy                           0.96       297
   macro avg       0.69      0.59      0.62       297
weighted avg       0.95      0.96      0.96       297




In [97]:
# Score the oversampling-trained model on the positive-only test subset.
evaluate_model(X_test=X_mtest, y_test=y_mtest, model=lgb2)

Accuracy of the model: 0.2

Classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.20      0.33        10

    accuracy                           0.20        10
   macro avg       0.50      0.10      0.17        10
weighted avg       1.00      0.20      0.33        10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
# Oversampling helps a little, but the result is still poor: in the recorded
# output only 2 of the 10 positive test samples are predicted as positive.
lgb2.predict(X_mtest) # although useful to some extent, the result is bad

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0])

In [99]:
from data_manipulation.random_sampling import ros

# Oversample the wine-quality training split with the project's `ros` helper
# (presumably random oversampling -- the recorded output shows both classes at
# the same count), then split the returned frame into predictors and target.
data1_over = ros(X1_train, y1_train)
X1_over = data1_over.drop(columns='Label')
y1_over = data1_over['Label']

0    1114
1    1114
Name: Label, dtype: int64


In [112]:
# LightGBM settings for the model trained on the oversampled wine-quality data.
lgbm_param3 = dict(
    n_estimators=500,
    max_depth=1000,
    objective='binary',
    num_leaves=500,
    learning_rate=0.02,
    verbose=-1,
    force_col_wise=True,
    is_unbalance=True,
    boosting_type='gbdt',
)

# Fit on the balanced training data, then score on the untouched validation split.
lgb3 = lightgbm.LGBMClassifier(**lgbm_param3)
lgb3.fit(X=X1_over, y=y1_over)
evaluate_model(X_test=X1_val, y_test=y1_val, model=lgb3)

Accuracy of the model: 0.9375

Classification report: 
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       122
           1       0.25      0.17      0.20         6

    accuracy                           0.94       128
   macro avg       0.60      0.57      0.58       128
weighted avg       0.93      0.94      0.93       128




In [109]:
# Oversampling does not help on this dataset: none of the positive test
# samples are recovered, and the recorded validation accuracy is lower than
# the baseline's (0.9375 vs 0.953125).
evaluate_model(X_test=X1_mtest, y_test=y1_mtest, model=lgb3)

Accuracy of the model: 0.0

Classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00      10.0

    accuracy                           0.00      10.0
   macro avg       0.00      0.00      0.00      10.0
weighted avg       0.00      0.00      0.00      10.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
