In [1]:
import sys
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('axes', grid=True)

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
# Project root. Set the NLP_DISASTER_TWEETS_ROOT environment variable to run the
# notebook from a different checkout; the fallback is the original hard-coded
# location so existing setups keep working unchanged.
root_dir = os.environ.get(
    'NLP_DISASTER_TWEETS_ROOT',
    'C:/Users/delst/OneDrive/Desktop/Code/Workspace/NLP_Disaster_Tweets',
)

# Guarded so that re-running this cell does not keep appending duplicates.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Project-local import: only resolvable once root_dir is on sys.path.
from A_Root_Dir.Configurations.setup_env import setup_environment
config = setup_environment(root_dir)

# File Paths
# Base locations taken from the project config; combined with file names via
# os.path.join in the loading cells.
sdo_pkl = config.sdo_pkl
sdo_parq = config.sdo_parq

# Class Imports
from Modularization.model_select import ModelSelectionReport

---

In [3]:
# Read the training parquet file from the configured parquet location into `df`.
filename = 'f1_preprocessing_train.parquet'
path_to_parq_store = os.path.join(sdo_parq, filename)
df = pd.read_parquet(path=path_to_parq_store)

In [4]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code contained in the
    file. Only use this on artifacts written by this project's own pipeline,
    never on files from an untrusted source.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


# Locations of the three feature artifacts inside the pickle store.
glove_features_tokens_file_path = os.path.join(sdo_pkl, 'glove_features_tokens.pkl')
glove_features_keyword_file_path = os.path.join(sdo_pkl, 'glove_features_keyword.pkl')
numerical_features_file_path = os.path.join(sdo_pkl, 'numerical_features.pkl')

# One helper call per artifact instead of three copy-pasted open/load stanzas.
glove_features_tokens = _load_pickle(glove_features_tokens_file_path)
glove_features_keyword = _load_pickle(glove_features_keyword_file_path)
numerical_features = _load_pickle(numerical_features_file_path)

In [5]:
# Quick sanity check: one shape per line, in the order the artifacts were loaded.
print(
    np.shape(glove_features_tokens),
    np.shape(glove_features_keyword),
    np.shape(numerical_features),
    sep='\n',
)

(7552, 25)
(7552, 25)
(7552, 8)


---

In [6]:
# Build a copy of the numerical features without column index 7 (axis=1 selects
# columns); `numerical_features` itself is left untouched.
# NOTE(review): which feature lives at index 7 is not visible in this notebook -
# confirm against the step that wrote numerical_features.pkl.
numerical_features_dropped = np.delete(arr=numerical_features, obj=7, axis=1)
numerical_features_dropped

array([[ 0.    ,  1.    ,  0.    , ...,  4.    , 32.    ,  0.    ],
       [ 2.    ,  0.    ,  0.    , ...,  9.    , 43.    ,  0.    ],
       [ 1.    ,  0.    ,  0.    , ...,  8.    , 56.    ,  0.    ],
       ...,
       [ 3.    ,  0.    ,  0.    , ..., 11.    , 67.    ,  0.    ],
       [ 0.    ,  1.    ,  0.    , ..., 12.    , 79.    ,  0.6249],
       [ 0.    ,  0.    ,  0.    , ...,  9.    , 53.    ,  0.    ]])

In [7]:
# Design matrix: the three feature groups placed side by side (axis=1 joins
# columns, so every group must have the same number of rows).
X_train = np.concatenate(
    (glove_features_tokens, glove_features_keyword, numerical_features_dropped),
    axis=1,
)
# Labels come from the 'target' column of the parquet frame.
Y_train = df['target'].values

# Features are loaded from pickles and labels from a parquet file, so nothing
# ties them together structurally. Fail fast if the row counts diverge instead
# of training on mismatched data. (Equal counts do not prove equal ordering.)
assert X_train.shape[0] == Y_train.shape[0], (
    f"Feature/label row mismatch: {X_train.shape[0]} feature rows "
    f"vs {Y_train.shape[0]} labels"
)

In [8]:
# Shapes of the assembled design matrix and label vector, one per line.
print(np.shape(X_train), np.shape(Y_train), sep='\n')

(7552, 57)
(7552,)


In [9]:
# Show the original numerical feature array for comparison with
# numerical_features_dropped: np.delete returned a new array, so this one still
# contains every column.
numerical_features

array([[ 0.    ,  1.    ,  0.    , ..., 32.    ,  0.    ,  1.    ],
       [ 2.    ,  0.    ,  0.    , ..., 43.    ,  0.    ,  0.    ],
       [ 1.    ,  0.    ,  0.    , ..., 56.    ,  0.    ,  1.    ],
       ...,
       [ 3.    ,  0.    ,  0.    , ..., 67.    ,  0.    ,  0.    ],
       [ 0.    ,  1.    ,  0.    , ..., 79.    ,  0.6249,  0.    ],
       [ 0.    ,  0.    ,  0.    , ..., 53.    ,  0.    ,  0.    ]])

---

In [10]:
# Seed for every estimator that exposes random_state, so that the scores in the
# report can be reproduced on a fresh kernel. Same value as the CV splitter.
RANDOM_STATE = 42

# Candidate classifiers keyed by the label used in the report.
models = {
    # NOTE(review): left disabled as in the original. MultinomialNB rejects
    # negative feature values, which may be why it was commented out - confirm.
    # 'MultinomialNB': MultinomialNB(),
    'SVM': SVC(),
    'SGD': SGDClassifier(random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(
        max_depth=8, n_estimators=120, random_state=RANDOM_STATE
    ),
    'GBM': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'LGBM': LGBMClassifier(random_state=RANDOM_STATE),
    'XGB': XGBClassifier(random_state=RANDOM_STATE)
}

In [13]:
# 9-fold stratified splitter; shuffling with a fixed seed keeps the fold
# assignment the same from run to run.
skf = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)

# Hand the candidate models, data and splitter to the project's report class,
# then run the evaluation and print the per-model summary.
# NOTE(review): this cell's execution count (13) skips 11-12 - re-run the
# notebook top to bottom on a fresh kernel before sharing.
classification_report = ModelSelectionReport(models, X_train, Y_train, skf)
classification_report.evaluate_models()
classification_report.print_report()

Model: SVM
accuracy: 0.6863
precision: 0.7012
recall: 0.6863
f1_score: 0.6628
training_time: 27.4612 seconds

Model: SGD
accuracy: 0.6777
precision: 0.7464
recall: 0.6777
f1_score: 0.6383
training_time: 1.0570 seconds

Model: RandomForest
accuracy: 0.7893
precision: 0.7930
recall: 0.7893
f1_score: 0.7850
training_time: 22.2316 seconds

Model: GBM
accuracy: 0.7962
precision: 0.7971
recall: 0.7962
f1_score: 0.7936
training_time: 75.3748 seconds

Model: LGBM
accuracy: 0.8048
precision: 0.8058
recall: 0.8048
f1_score: 0.8024
training_time: 2.1031 seconds

Model: XGB
accuracy: 0.7975
precision: 0.7974
recall: 0.7975
f1_score: 0.7957
training_time: 6.3727 seconds

