In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import contractions
import collections
import shap
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.dummy import DummyRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from lime.lime_text import LimeTextExplainer
import timeit
import shap
import eli5
import pickle

In [2]:
# English stopword list from the NLTK corpus reader; it is handed to
# preprocess_text() as `lst_stopwords` when the texts are cleaned.
stop_words = stopwords.words('english')

In [3]:
# Raw features; the cleaning step below reads the 'text' column of this frame.
X_ = pd.read_csv('X_training_all_hurricanes.csv')
# Labels; the fitting step below reads the 'is_hurricane' column of this frame.
# NOTE(review): rows are assumed to be aligned one-to-one with X_ — confirm.
y = pd.read_csv('y_labels_all_hurricanes.csv')

In [4]:
# select or combine lemmatization, stemming, stopword removal to compare performance

def preprocess_text(text, flg_stemm = True, flg_lemm = True, lst_stopwords=None):
    """Clean one text and return it as a single normalized string.

    Steps, in order:
      1. Cast ``text`` to ``str``, strip outer whitespace and delete every
         character that is not a word character, whitespace, ``.`` or ``,``.
      2. Split on whitespace.
      3. If ``lst_stopwords`` is given, drop tokens that match an entry
         exactly (no lower-casing is applied before the match).
      4. If ``flg_stemm`` is truthy, Porter-stem every token.
      5. If ``flg_lemm`` is truthy, WordNet-lemmatize every token (this runs
         on the stemmed tokens when both flags are on).
      6. Re-join with single spaces, remove the space in front of ``,`` and
         ``.``, and pass the result through ``contractions.fix``.

    Parameters
    ----------
    text : object
        Value to clean; converted with ``str()`` first.
    flg_stemm : bool, default True
        Apply ``PorterStemmer`` to each token.
    flg_lemm : bool, default True
        Apply ``WordNetLemmatizer`` to each token.
    lst_stopwords : iterable of str or None, default None
        Tokens to remove. ``None`` disables stopword removal.

    Returns
    -------
    str
        The cleaned text.
    """
    text_clean = re.sub(r'[^\w\s.,]', '', str(text).strip())
    lst_text = text_clean.split()
    if lst_stopwords is not None:
        # Build the lookup once per call so each token is a hash lookup
        # rather than a scan of the whole stopword list.
        stopword_set = frozenset(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopword_set]
    if flg_stemm:
        stemm = nltk.stem.porter.PorterStemmer()
        lst_text = [stemm.stem(word) for word in lst_text]
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    # filter(None, ...) discards any token that ended up empty.
    text_clean = ' '.join(filter(None, lst_text))
    # Re-attach commas/periods that were standalone tokens to the word before.
    text_clean = text_clean.replace(" ,",",").replace(' .', '.')
    # NOTE(review): contractions.fix runs last, i.e. after the regex above has
    # already removed apostrophes and after stopword removal/stemming —
    # confirm this ordering is intended.
    text_clean = contractions.fix(text_clean)
    return text_clean

In [5]:
# Clean every entry of the 'text' column with stemming, lemmatization and
# English stopword removal all switched on, then show how many texts resulted.
X = X_['text'].apply(
    preprocess_text,
    flg_stemm=True,
    flg_lemm=True,
    lst_stopwords=stop_words,
)
X.shape

(26418,)

In [6]:
# Hold out 20% of the cleaned texts (shuffled, fixed seed) for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

# Learn the TF-IDF vocabulary (unigrams and bigrams, min_df of 3) from the
# training split only, then encode the test split with that same vocabulary.
tfidf_vec = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=3)
train_vc = tfidf_vec.fit_transform(X_train)
test_vc = tfidf_vec.transform(X_test)

## Implement the model
The bake-off suggests that the default parameters of the SVM were optimal for the YES/NO task.

In [7]:
# Baseline classifier: SVC with library-default hyperparameters; only the
# random seed is pinned.
svc = SVC(random_state=42)

In [8]:
# Reduce the label frame to its 'is_hurricane' column exactly once. The guard
# keeps this cell re-runnable: without it, a second run would index the
# already-extracted Series by 'is_hurricane' and raise a KeyError.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['is_hurricane']
# Fit the default-parameter SVC on the TF-IDF training matrix.
svc.fit(train_vc, y_train)

In [9]:
# Predict labels for the held-out TF-IDF matrix with the default-parameter SVC.
preds_svc = svc.predict(test_vc)

### With parameters from previous examination

In [10]:
# Second SVC with explicit hyperparameters (C=10, gamma=1, RBF kernel) and the
# same fixed seed as the baseline.
svc_p = SVC(C=10, gamma=1, kernel='rbf', random_state=42)

In [11]:
# Fit on the same TF-IDF training matrix and labels used for the baseline SVC.
svc_p.fit(train_vc, y_train)

In [12]:
# Predict labels for the held-out TF-IDF matrix with the parameterized SVC.
preds_svcp = svc_p.predict(test_vc)

### Linear SVC

In [13]:
# Linear-kernel alternative: LinearSVC with library defaults and a fixed seed.
l_svc = LinearSVC(random_state=42)

In [14]:
# Fit on the same TF-IDF training matrix and labels as the two SVC models.
l_svc.fit(train_vc, y_train)

In [15]:
# Predict labels for the held-out TF-IDF matrix with the LinearSVC.
preds_lsvc = l_svc.predict(test_vc)

## Show comparison

In [16]:
# Empty comparison table; the reporting loop below adds one row per model.
results_df = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1-score'])
# Per-class display names handed to classification_report.
# NOTE(review): the order assumes the negative class comes first among the
# 'is_hurricane' label values — confirm against the label encoding.
target_names = ['Not Hurricane Tweet', 'Hurricane Tweet']

In [17]:
# Pair each display name with that model's test-set predictions. The pairs are
# materialized into a list because a bare zip() is a one-shot iterator: once
# the reporting loop has consumed it, re-running that loop would silently
# iterate over nothing.
models = ['SVC', 'SVC+', 'LinearSVC']
predictions = [preds_svc, preds_svcp, preds_lsvc]
config_obj = list(zip(models, predictions))

In [18]:
# Score each model on the held-out labels and record overall accuracy plus the
# macro-averaged precision / recall / F1 in the comparison table.
for name, preds in config_obj:
    report_dict = metrics.classification_report(
        y_test,
        preds,
        target_names=target_names,
        output_dict=True,
    )
    macro_avg = report_dict['macro avg']
    row_scores = {
        'accuracy': report_dict['accuracy'],
        'precision': macro_avg['precision'],
        'recall': macro_avg['recall'],
        'f1-score': macro_avg['f1-score'],
    }
    # Written cell by cell, in column order, into the row labelled `name`.
    for column, score in row_scores.items():
        results_df.loc[name, column] = score

In [19]:
# Display the comparison table (one row per model).
results_df

Unnamed: 0,accuracy,precision,recall,f1-score
SVC,0.981453,0.981561,0.981508,0.981453
SVC+,0.98221,0.982318,0.982265,0.98221
LinearSVC,0.981264,0.981327,0.981308,0.981264


In [20]:
import os

# Persist the parameterized RBF SVC (`svc_p`). The target directory is created
# first so that a missing ./WINNING_MODEL folder does not abort the save with
# FileNotFoundError after all models have been trained.
# NOTE(review): only the classifier is pickled here; the fitted `tfidf_vec` is
# what encodes text into the features it was trained on — confirm the
# vectorizer is saved elsewhere.
model_path = './WINNING_MODEL/yes_no_svc_plus.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as file:
    pickle.dump(svc_p, file)