In [206]:
#Importing the neccesary packages

#Numpy packages
import numpy as np
import pandas as pd

#NLTK Packages
import nltk
from nltk import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer 

#Sklearn Packages
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

#Imblearn Packages
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

#String Packages
import re
import string

#Word2Vec and simple transformers for RoBERTa
from gensim.models import word2vec
from simpletransformers.classification import ClassificationModel

import pickle

#Instantiating NLTK Stopwords
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brettzimmerman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Importing and Concatonating the Data

In [207]:
harvey = pd.read_csv('data/harvey.tsv', sep='\t')
irma = pd.read_csv('data/irma.tsv', sep='\t')
matthew = pd.read_csv('data/matthew.tsv', sep='\t')
maria = pd.read_csv('data/maria.tsv', sep='\t')
twint = pd.read_csv('data/twint.csv')

In [208]:
#Checking for Null values
canes = [harvey, irma, matthew, maria, twint]
for i in canes:
    print(i.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6378 entries, 0 to 6377
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   tweet_id     6378 non-null   int64 
 1   tweet_text   6378 non-null   object
 2   class_label  6378 non-null   object
dtypes: int64(1), object(2)
memory usage: 149.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6579 entries, 0 to 6578
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   tweet_id     6579 non-null   int64 
 1   tweet_text   6579 non-null   object
 2   class_label  6579 non-null   object
dtypes: int64(1), object(2)
memory usage: 154.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   tweet_id     1157 non-null   int64 
 1   tweet_text   1157 non-null   o

In [209]:
#Creating dataframe hurricanes, concatonating all the previous dataframes together
hurricanes = pd.concat(canes)

In [210]:
hurricanes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19240 entries, 0 to 31
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   tweet_id     19208 non-null  float64
 1   tweet_text   19240 non-null  object 
 2   class_label  19240 non-null  object 
dtypes: float64(1), object(2)
memory usage: 601.2+ KB


In [211]:
#Checking the distribution of Target classes
hurricanes['class_label'].value_counts()

rescue_volunteering_or_donation_effort    4701
other_relevant_information                4214
infrastructure_and_utility_damage         3293
sympathy_and_support                      1587
injured_or_dead_people                    1482
displaced_people_and_evacuations          1129
caution_and_advice                         987
not_humanitarian                           959
requests_or_urgent_needs                   856
emergency                                   32
Name: class_label, dtype: int64

# Data Cleaning

relief = displaced_people_and_evacuations, rescue_volunteering_or_donation_effort, caution_and_advice
danger = injured_or_dead_people, infrastructure_and_utility_damage
emergencies = requests_or_urgent_needs

In [212]:
#There are too many target classes for my ideal model
#I will merge these nine targets down to four
hurricanes['class_label'] = hurricanes['class_label'].replace({'displaced_people_and_evacuations': 'relief'})
hurricanes['class_label'] = hurricanes['class_label'].replace({'rescue_volunteering_or_donation_effort': 'relief'})

hurricanes['class_label'] = hurricanes['class_label'].replace({'caution_and_advice': 'danger'})
hurricanes['class_label'] = hurricanes['class_label'].replace({'injured_or_dead_people': 'danger'})
hurricanes['class_label'] = hurricanes['class_label'].replace({'infrastructure_and_utility_damage': 'danger'})

hurricanes['class_label'] = hurricanes['class_label'].replace({'requests_or_urgent_needs': 'emergency'})

hurricanes['class_label'] = hurricanes['class_label'].replace({'other_relevant_information': 'other'})
hurricanes['class_label'] = hurricanes['class_label'].replace({'sympathy_and_support': 'other'})
hurricanes['class_label'] = hurricanes['class_label'].replace({'not_humanitarian': 'other'})

#Exporting csv file to use for visualizations
hurricanes.to_csv('hurricanes.csv', index=False)

In [213]:
#Much more manageable
hurricanes['class_label'].value_counts()

other        6760
relief       5830
danger       5762
emergency     888
Name: class_label, dtype: int64

In [214]:
#Here, I am label encoding the targets from 0-4 so it can be used by various machine learning models.
le = LabelEncoder()
le.fit(hurricanes['class_label'])
hurricanes['label'] = le.transform(hurricanes['class_label'])
hurricanes.head()

Unnamed: 0,tweet_id,tweet_text,class_label,label
0,9.033888e+17,"Hurricane Harvey killed at least 38 people, bu...",danger,0
1,9.011364e+17,Harvey upped to Category 2 hurricane with 110+...,other,2
2,9.028537e+17,A huge shoutout @TexasGuard for all the work y...,relief,3
3,9.028597e+17,Our thoughts and prayers are with the people h...,other,2
4,9.01406e+17,Homes destroyed on Broadway St in Rockport. Pe...,danger,0


In [215]:
#There is an unbalanced distribution of targets. Therefore, when modeling, I will be trying to maximize F1 Score.
hurricanes['label'].value_counts(normalize=True)

2    0.351351
3    0.303015
0    0.299480
1    0.046154
Name: label, dtype: float64

In [216]:
#Creating a function to clean the Tweets with Regex
def remove_junk(text):
    text = re.sub(r'@[A-Za-z0-9#]+', '', str(text)) #remove @mentions 
    text = re.sub(r'RT[\s]+', '', str(text)) # remove RT
    text = re.sub(r'https?:\/\/\S+', '', text) # removes actual links
    text = re.sub(r'#', '', str(text)) # remove hashtag symbol
    return text

In [217]:
#Creating a function to tokenize text
def tokenize(text):
    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    tokenized_tweet = regex_token.tokenize(text)
    tweet_tokens = [word.lower() for word in tokenized_tweet]
    return tweet_tokens

In [218]:
#Adding punctuation and stopwords to my list so they wont influence the sentiment analysis
new_stopwords = ['hurricane', 'harvey', 'irma', 'matthew', 'maria', 'hurricaneharvey', 'hurricanemaria', 'hurricaneirma']
punctuations = string.punctuation
stopwords.extend(new_stopwords)
stopwords.extend(punctuations)

#Creating a function that removes the previously defined stopwords
def remove_sw_punct(tweet_tokens):
    tweets_clean = [word for word in tweet_tokens if word not in stopwords]
    return tweets_clean

In [219]:
#Creating a function that stems the tokens down to its root
stemmer = PorterStemmer()

def stemming(tweets_clean):
    tweets_stem = [stemmer.stem(token) for token in tweets_clean]         
    return tweets_stem

In [220]:
#Creating a lemmatization function
def pos_tagger(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    words_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]   
    lemma_list = [wd.lemmatize(tag) for wd, tag in words_tags]
    return lemma_list

In [221]:
#Combining the preprocessing into a function for lemmatized text
def lemma_tweet(text):
    processed_tweet = remove_junk(text)
    tweet_tokens = pos_tagger(processed_tweet)
    tweet_lemma = [word.lower() for word in tweet_tokens]
    tweets_lemma = ' '.join(tweet_lemma)
    return tweets_lemma

In [222]:
#Here, I am creating three new columns.
#model_text is partly processed text that will be fed into the model.
#model_text_lemma is also partly processed text, but lemmatization is added.
hurricanes['model_text'] = hurricanes['tweet_text'].apply(remove_junk)
hurricanes['model_text_lemma'] = hurricanes['tweet_text'].apply(lemma_tweet)

In [223]:
#hurricanes dataframe with the three new columns
hurricanes

Unnamed: 0,tweet_id,tweet_text,class_label,label,model_text,model_text_lemma
0,9.033888e+17,"Hurricane Harvey killed at least 38 people, bu...",danger,0,"Hurricane Harvey killed at least 38 people, bu...",hurricane harvey kill at least 38 people but c...
1,9.011364e+17,Harvey upped to Category 2 hurricane with 110+...,other,2,Harvey upped to Category 2 hurricane with 110+...,harvey up to category 2 hurricane with 110+ mp...
2,9.028537e+17,A huge shoutout @TexasGuard for all the work y...,relief,3,A huge shoutout for all the work your USArmy ...,a huge shoutout for all the work your usarmy s...
3,9.028597e+17,Our thoughts and prayers are with the people h...,other,2,Our thoughts and prayers are with the people h...,our thought and prayer be with the people hit ...
4,9.014060e+17,Homes destroyed on Broadway St in Rockport. Pe...,danger,0,Homes destroyed on Broadway St in Rockport. Pe...,homes destroy on broadway st in rockport peopl...
...,...,...,...,...,...,...
27,,Any LI volunteer FFs who can help Breezy Point...,emergency,1,Any LI volunteer FFs who can help Breezy Point...,any li volunteer ffs who can help breezy point...
28,,@GovChristie We're working on putting an army ...,emergency,1,We're working on putting an army together of ...,we 're work on put an army together of everyda...
29,,"Hoping my Hoboken friend, Maria (macabfilms), ...",emergency,1,"Hoping my Hoboken friend, Maria (macabfilms), ...",hoping my hoboken friend maria macabfilms be s...
30,,Quick! I need a title for this Hurricane Sandy...,emergency,1,Quick! I need a title for this Hurricane Sandy...,quick i need a title for this hurricane sandy ...


# Modeling

In [224]:
#Performing a train test split. The model_text column will be used here.
X = hurricanes.drop('label', axis=1)
y = hurricanes['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, train_size = .8)

# Baseline Model Dummy Classifier

In [226]:
dum_pipe = Pipeline([('count', CountVectorizer()),
                    ('model', DummyClassifier(strategy='stratified'))])

model = dum_pipe.fit(X_train['model_text'], y_train)

y_trn_pred = dum_pipe.predict(X_train['model_text'])
y_tst_pred = dum_pipe.predict(X_test['model_text'])
    
print('\t\tThe Train Results')
print(classification_report(y_train, y_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, y_tst_pred))

		The Train Results
              precision    recall  f1-score   support

           0       0.31      0.30      0.30      4610
           1       0.06      0.06      0.06       710
           2       0.35      0.35      0.35      5408
           3       0.30      0.30      0.30      4664

    accuracy                           0.31     15392
   macro avg       0.25      0.25      0.25     15392
weighted avg       0.31      0.31      0.31     15392


		The Test Results
              precision    recall  f1-score   support

           0       0.32      0.31      0.31      1152
           1       0.06      0.06      0.06       178
           2       0.36      0.35      0.36      1352
           3       0.31      0.32      0.32      1166

    accuracy                           0.32      3848
   macro avg       0.26      0.26      0.26      3848
weighted avg       0.32      0.32      0.32      3848



# Simple Model: Logistic Regression

In [73]:
lr_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords, tokenizer=tokenize)),
    ('smt', SMOTE(random_state=30)),
    ('lr', LogisticRegression()),
])

lr_model = lr_pipe.fit(X_train['model_text'], y_train)
    
y_trn_pred = lr_model.predict(X_train['model_text'])
y_tst_pred = lr_model.predict(X_test['model_text'])
    
print('\t\tThe Train Results')
print(classification_report(y_train, y_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, y_tst_pred))
print('Test Score: ', lr_model.score(X_test['model_text'], y_test))
cv_score = cross_val_score(lr_model, X_train['model_text'], y_train)
print('Cross Validation Score: ', cv_score.mean())


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


		The Train Results
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      4610
           1       0.65      0.93      0.77       710
           2       0.88      0.85      0.87      5408
           3       0.91      0.90      0.91      4664

    accuracy                           0.89     15392
   macro avg       0.84      0.90      0.86     15392
weighted avg       0.89      0.89      0.89     15392


		The Test Results
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      1152
           1       0.49      0.71      0.58       178
           2       0.75      0.74      0.74      1352
           3       0.82      0.79      0.80      1166

    accuracy                           0.79      3848
   macro avg       0.73      0.77      0.75      3848
weighted avg       0.79      0.79      0.79      3848

Test Score:  0.7853430353430353


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross Validation Score:  0.779755774988335


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Gridsearch on a Logistic Regression Model

In [74]:
tf_lr_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords, tokenizer=tokenize)),
    ('smt', SMOTE(random_state=30)),
    ('lr', LogisticRegression()),
])
parameters = {
    'tfidf__max_df': (0.25, 0.75),
    'tfidf__min_df': (2, 3),
    'tfidf__ngram_range': [(1, 1),(1, 2)],
    'tfidf__max_features': (8000, 10000, 70000),
    'lr__C': (.7, 1, 1.5),
    'lr__class_weight' : (['balanced']),
    'lr__solver': ('newton-cg', 'sag', 'saga', 'lbfgs'),
    'lr__max_iter': (100, 200, 300),
    'smt__k_neighbors' : (2, 5)
}

grid_search = GridSearchCV(tf_lr_pipe, parameters, cv=2, n_jobs=2, verbose=3, scoring = 'f1_macro')
grid_search.fit(X_train['model_text'], y_train)

tf_lr_best = grid_search.best_estimator_

y_trn_pred = tf_lr_best.predict(X_train['model_text'])
y_tst_pred = tf_lr_best.predict(X_test['model_text'])

print('\t\tThe Train Results')
print(classification_report(y_train, y_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, y_tst_pred))

Fitting 2 folds for each of 1728 candidates, totalling 3456 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   18.1s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  5.1min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  8.3min
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed: 12.0min
[Parallel(n_jobs=2)]: Done 1564 tasks      | elapsed: 16.3min
[Parallel(n_jobs=2)]: Done 2044 tasks      | elapsed: 22.0min
[Parallel(n_jobs=2)]: Done 2588 tasks      | elapsed: 28.1min
[Parallel(n_jobs=2)]: Done 3196 tasks      | elapsed: 36.7min
[Parallel(n_jobs=2)]: Done 3456 out of 3456 | elapsed: 40.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternativ

		The Train Results
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4610
           1       0.71      0.97      0.82       710
           2       0.91      0.86      0.88      5408
           3       0.92      0.92      0.92      4664

    accuracy                           0.90     15392
   macro avg       0.87      0.92      0.89     15392
weighted avg       0.91      0.90      0.90     15392


		The Test Results
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      1152
           1       0.52      0.69      0.59       178
           2       0.75      0.75      0.75      1352
           3       0.83      0.79      0.81      1166

    accuracy                           0.79      3848
   macro avg       0.74      0.77      0.75      3848
weighted avg       0.80      0.79      0.79      3848



In [157]:
#Saving the best parameters according to the gridsearch
with open('logistic_regression.pickle', 'wb') as f:
    pickle.dump(tf_lr_best, f)

# Grid Search on MultinomialNB Classifier

In [236]:
tf_nb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords, tokenizer=tokenize)),
    ('smt', SMOTE(random_state=30)),
    ('mnb', MultinomialNB()),
])
parameters = {
    'tfidf__max_df': (0.25, 0.75),
    'tfidf__min_df': (2, 3),
    'tfidf__ngram_range': [(1, 1),(1, 2)],
    'tfidf__max_features': (8000, 10000, 70000),
    'mnb__alpha': (.05, .5, 2),
    'smt__k_neighbors' : (2, 5)
}

grid_search = GridSearchCV(tf_nb_pipe, parameters, cv=2, n_jobs=2, verbose=3, scoring = 'f1_macro')
grid_search.fit(X_train['model_text'], y_train)

tf_nb_best = grid_search.best_estimator_

y_trn_pred = tf_nb_best.predict(X_train['model_text'])
y_tst_pred = tf_nb_best.predict(X_test['model_text'])
    
print('\t\tThe Train Results')
print(classification_report(y_train, y_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, y_tst_pred))

Fitting 2 folds for each of 144 candidates, totalling 288 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    9.1s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   41.2s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 288 out of 288 | elapsed:  1.6min finished


		The Train Results
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      4610
           1       0.52      0.98      0.68       710
           2       0.92      0.75      0.83      5408
           3       0.88      0.89      0.89      4664

    accuracy                           0.86     15392
   macro avg       0.80      0.89      0.82     15392
weighted avg       0.88      0.86      0.86     15392


		The Test Results
              precision    recall  f1-score   support

           0       0.77      0.84      0.80      1152
           1       0.37      0.77      0.50       178
           2       0.73      0.56      0.63      1352
           3       0.76      0.77      0.77      1166

    accuracy                           0.72      3848
   macro avg       0.66      0.73      0.68      3848
weighted avg       0.73      0.72      0.72      3848



# Trying the best nb parameters on a model using text cleaned with lemmatization

In [237]:
lemma_model = tf_nb_best.fit(X_train['model_text_lemma'], y_train)
    
y_trn_pred = lemma_model.predict(X_train['model_text_lemma'])
y_tst_pred = lemma_model.predict(X_test['model_text_lemma'])
    
print('\t\tThe Train Results')
print(classification_report(y_train, y_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, y_tst_pred))
print('Test Score: ', tf_nb_best.score(X_test['model_text_lemma'], y_test))
cv_score = cross_val_score(tf_nb_best, X_train['model_text_lemma'], y_train)
print('Cross Validation Score: ', cv_score.mean())


		The Train Results
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      4610
           1       0.52      0.98      0.68       710
           2       0.92      0.75      0.82      5408
           3       0.87      0.89      0.88      4664

    accuracy                           0.85     15392
   macro avg       0.80      0.89      0.82     15392
weighted avg       0.87      0.85      0.86     15392


		The Test Results
              precision    recall  f1-score   support

           0       0.77      0.84      0.81      1152
           1       0.36      0.75      0.49       178
           2       0.74      0.57      0.64      1352
           3       0.76      0.77      0.77      1166

    accuracy                           0.72      3848
   macro avg       0.66      0.73      0.68      3848
weighted avg       0.74      0.72      0.72      3848

Test Score:  0.7216735966735967
Cross Validation Score:  0.7160865879468981


In [238]:
#The Scores are worse, but the overfitting is reduced so I will save this as the naive bayes model.
with open('naive_bayes.pickle', 'wb') as f:
    pickle.dump(lemma_model, f)

# Into More Advanced Models: Word2Vec

In [241]:
#Creating the correct text and target labels that Word2Vec is expecting
np.random.seed(0)
target = hurricanes['class_label']
data = hurricanes['model_text'].map(word_tokenize).values

In [242]:
#Taking a look at the unique tokens
total_vocabulary = set(word for tweet in data for word in tweet)
len(total_vocabulary)
print('There are {} unique tokens in the dataset.'.format(len(total_vocabulary)))

There are 28534 unique tokens in the dataset.


In [243]:
#Importing the glove corpus which Word2Vec is trained on
glove = {}
with open('glove.6B/glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [234]:
#Turning Word2Vec into a class so that it can be used in a pipeline with models
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # it can't be used in a scikit-learn pipeline  
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [164]:
#Creating pipelines using Word2Vec and differnt ml models
rf = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
               ('smt', SMOTE(random_state=30)),
              ('Random Forest', RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
                ('smt', SMOTE(random_state=30)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
               ('smt', SMOTE(random_state=30)),
              ('Logistic Regression', LogisticRegression())])

In [165]:
models = [('Random Forest', rf),
          ('Support Vector Machine', svc),
          ('Logistic Regression', lr)]

In [166]:
scores = [(name, cross_val_score(model, data, target, cv=2).mean()) for name, model, in models]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    6.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    6.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of IT

In [167]:
#Not very impressive. Further tuning may be required
scores

[('Random Forest', 0.5876819126819126),
 ('Support Vector Machine', 0.5744282744282745),
 ('Logistic Regression', 0.5581081081081081)]

# RoBERTa Model

pip install simpletransformers

In [182]:
# Create a ClassificationModel using roberta
roberta = ClassificationModel('roberta', 'roberta-base', num_labels=4, use_cuda=False, weight=[1, 3, 1, 1],
                              args={'learning_rate':1e-5, 'num_train_epochs': 2, 'overwrite_output_dir': True})

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out

In [183]:
#Fixing the data to a format the model will accept
transformer_df = hurricanes[['tweet_text', 'label']]
transformer_df = transformer_df.rename(columns={"tweet_text" : "text"})
transformer_df = transformer_df.rename(columns={"label" : "labels"})

In [185]:
#Train test split on teh transformer ready data
train_sample, test_sample = train_test_split(transformer_df, random_state=42, train_size = .8)

In [187]:
#Training roberta on the sample data
roberta.train_model(train_sample)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15392.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 2'), FloatProgress(value=0.0, max=1924.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 1 of 2'), FloatProgress(value=0.0, max=1924.0), HTML(value='')))





(3848, 0.5614642921792588)

In [191]:
#evaluating the model on the test data
def f1_multiclass(labels, preds):
    return f1_score(labels, preds, average='micro')
    
result, model_outputs, wrong_predictions = roberta.eval_model(test_sample, f1=f1_multiclass, acc=accuracy_score)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3848.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=481.0), HTML(value='')))




In [193]:
#Impressive metrics
result

{'mcc': 0.7442880770167848,
 'f1': 0.8196465696465697,
 'acc': 0.8196465696465697,
 'eval_loss': 0.5939578148797774}

In [249]:
#The Scores are worse, but the overfitting is reduced so I will save this as the naive bayes model.
with open('roberta.pickle', 'wb') as f:
    pickle.dump(roberta, f)

danger    = 0
emergency = 1
other     = 2
relief    = 3