In [None]:
# Standard library
import os
import random

# Third-party
import numpy as np
import pandas as pd
from google.colab import drive

# Seed the stdlib and NumPy global RNGs with one shared constant.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

# Mount Google Drive, then make the Drive root the working directory so the
# relative paths used in later cells resolve against it.
drive.mount('/content/drive/')
os.chdir('/content/drive/MyDrive')

Mounted at /content/drive/


In [None]:
cd CZ4034/

/content/drive/MyDrive/CZ4034


In [None]:
# Labelled train/test splits; only their `label` column is used further down.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/shared_data_train_subj.csv",
        "./data/shared_data_test_subj.csv",
    )
)

# Train Stacked Model


In [None]:
# Training-split predictions saved by each base model (SVM, XGBoost, Bayes),
# read in the same order as before.
SVM, XGB, NAV = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./results/svm_pred_train_subj.csv",
        "./results/xgb_pred_train_subj.csv",
        "./results/bayes_pred_train_subj.csv",
    )
)

In [None]:
# Peek at the SVM prediction column (position 1) that gets stacked below.
# NOTE(review): position 0 is presumably the index written by to_csv — confirm.
SVM.iloc[:,1]

0      0
1      0
2      1
3      0
4      1
      ..
775    1
776    0
777    1
778    0
779    0
Name: 0, Length: 780, dtype: int64

In [None]:
# Meta-features for the stacked model: one column of predictions per base
# model, in SVM, NAV, XGB order.
train_X = pd.concat(
    [base_preds.iloc[:, 1] for base_preds in (SVM, NAV, XGB)],
    axis=1,
)
# Ground-truth labels for the training split.
train_y = df_train.label

In [None]:
# Test-split predictions saved by each base model, read in the same order
# as before.
SVM_test, XGB_test, NAV_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./results/svm_pred_test_subj.csv",
        "./results/xgb_pred_test_subj.csv",
        "./results/bayes_pred_test_subj.csv",
    )
)

In [None]:
# Same meta-feature layout as train_X (SVM, NAV, XGB), built from the
# test-split predictions.
test_X = pd.concat(
    [base_preds.iloc[:, 1] for base_preds in (SVM_test, NAV_test, XGB_test)],
    axis=1,
)
# Ground-truth labels for the test split.
test_y = df_test.label

In [None]:
# Give both meta-feature frames the same readable column names; the order
# matches the order in which the prediction columns were concatenated.
train_X.columns = test_X.columns = ['SVM', 'NAV', 'XGB']

# Majority Voting

In [None]:
# Sanity check: sum of the three base-model predictions for the first training
# row (i.e. the number of positive votes, if predictions are 0/1).
sum(train_X.iloc[0,:])

0

In [None]:
# Majority vote over the base models: predict 1 when more than half of the
# columns of test_X voted 1, else 0. Computed with one vectorized row sum
# instead of a Python-level iterrows() loop.
majority = test_X.shape[1] // 2 + 1  # 2 of 3 for the SVM/NAV/XGB stack
# skipna=False keeps the loop's behaviour for missing values: a NaN row sum
# fails the comparison, so the row is voted 0.
# .tolist() keeps `vote` a plain list of ints, as before.
vote = (test_X.sum(axis=1, skipna=False) >= majority).astype(int).tolist()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the majority-vote predictions against the test labels.
class_names = ['neutral', 'polarised']
print(
    classification_report(
        y_true=test_y,
        y_pred=vote,
        target_names=class_names,
        digits=3,
    )
)

              precision    recall  f1-score   support

     neutral      0.641     0.682     0.661       110
   polarised      0.660     0.618     0.638       110

    accuracy                          0.650       220
   macro avg      0.651     0.650     0.650       220
weighted avg      0.651     0.650     0.650       220



# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Logistic-regression meta-learner trained on the base-model predictions.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression().fit(train_X, train_y)
LR_pred = model.predict(test_X)

# Per-class precision/recall/F1 on the test split.
class_names = ['neutral', 'polarised']
print(
    classification_report(
        y_true=test_y,
        y_pred=LR_pred,
        target_names=class_names,
        digits=3,
    )
)

              precision    recall  f1-score   support

     neutral      0.630     0.682     0.655       110
   polarised      0.653     0.600     0.626       110

    accuracy                          0.641       220
   macro avg      0.642     0.641     0.640       220
weighted avg      0.642     0.641     0.640       220



# XGB

In [None]:
!pip install xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# A parameter grid for XGBoost
# can add more range
params = {
    'eta': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'min_child_weight': [1, 5, 10],
}

xgb = XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1)

folds = 3

grid = GridSearchCV(estimator=xgb, param_grid=params, scoring='accuracy', cv=folds, verbose=3)
xgb.fit(train_X, train_y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the baseline (non-grid-searched) XGBoost meta-learner on the test split.
xgb_pred = xgb.predict(test_X)

class_names = ['neutral', 'polarised']
print(
    classification_report(
        y_true=test_y,
        y_pred=xgb_pred,
        target_names=class_names,
        digits=3,
    )
)

              precision    recall  f1-score   support

     neutral      0.630     0.682     0.655       110
   polarised      0.653     0.600     0.626       110

    accuracy                          0.641       220
   macro avg      0.642     0.641     0.640       220
weighted avg      0.642     0.641     0.640       220



In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Evaluate the grid-searched XGBoost model on the test split. `grid` must
# already have been fitted (grid.fit) before this cell runs, otherwise
# predict() raises NotFittedError.
# NOTE: this rebinds xgb_pred, replacing the baseline predictions computed in
# the previous cell.
xgb_pred = grid.predict(test_X)
class_names = ['neutral', 'polarised']
# digits=3 matches the other reports in this notebook so the scores can be
# compared at the same precision.
print(classification_report(test_y, xgb_pred, target_names=class_names, digits=3))

              precision    recall  f1-score   support

     neutral       0.63      0.68      0.66       110
   polarised       0.65      0.60      0.63       110

    accuracy                           0.64       220
   macro avg       0.64      0.64      0.64       220
weighted avg       0.64      0.64      0.64       220



# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# A parameter grid for the random forest (currently unused: the grid search
# further down is commented out)
# can add more range
# params = {'bootstrap': [True, False],
#  'max_depth': [10, 20, 30, None],
#  'max_features': ['auto', 'sqrt'],
#  'min_samples_leaf': [1, 2, 4],
#  'min_samples_split': [2, 5, 10],
#  'n_estimators': [200, 400, 600, 800, 1000]}

# Random-forest meta-learner with its hyper-parameters spelled out.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is
#   deprecated and rejected by newer scikit-learn releases.
# - min_impurity_split (previously passed as None, its default) is no longer
#   an accepted argument in newer scikit-learn releases, so it is omitted.
# - random_state=SEED ties the forest to the notebook-wide seed so re-running
#   this cell gives the same model.
rf_model = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                  class_weight=None,
                                  criterion='gini', max_depth=None,
                                  max_features='sqrt',
                                  max_leaf_nodes=None,
                                  max_samples=None,
                                  min_impurity_decrease=0.0,
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  min_weight_fraction_leaf=0.0,
                                  n_estimators=100, n_jobs=None,
                                  random_state=SEED, verbose=0,
                                  warm_start=False)

folds = 3

# grid = GridSearchCV(estimator=rf_model, param_grid=params, scoring='accuracy', cv=folds, verbose=3)
rf_model.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the random-forest meta-learner on the test split.
pred_y = rf_model.predict(test_X)

class_names = ['neutral', 'polarised']
print(
    classification_report(
        y_true=test_y,
        y_pred=pred_y,
        target_names=class_names,
        digits=3,
    )
)

              precision    recall  f1-score   support

     neutral      0.630     0.682     0.655       110
   polarised      0.653     0.600     0.626       110

    accuracy                          0.641       220
   macro avg      0.642     0.641     0.640       220
weighted avg      0.642     0.641     0.640       220

