In [1]:
import pandas as pd
import numpy as np
import spacy
import re

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
from clean_text import clean_text

## Inital DataFrame
DataFrame of Tweets from 4 hurricanes; training datasets

In [4]:
df = pd.read_csv('full_text_only.csv')
df.shape

(19208, 3)

In [5]:
df.head(3)

Unnamed: 0,tweet_id,tweet_text,class_label
0,783409770493571076,Horrifying. My heart breaks for Haiti. Thinkin...,sympathy_and_support
1,783683862018818049,Our thoughts and prayers are with those suffer...,sympathy_and_support
2,784696725285908481,#BreakingNews Hurricane Matthew kills over 800...,injured_or_dead_people


### Establish y labels 

In [6]:
labels = df['class_label']

In [7]:
values = labels.unique()
values

array(['sympathy_and_support', 'injured_or_dead_people',
       'other_relevant_information',
       'rescue_volunteering_or_donation_effort',
       'infrastructure_and_utility_damage', 'not_humanitarian',
       'requests_or_urgent_needs', 'caution_and_advice',
       'displaced_people_and_evacuations'], dtype=object)

In [8]:
value_obj = {v:i for i, v in enumerate(values)}

In [9]:
y = [value_obj[label] for label in labels]

### Establish dataset X

In [10]:
%%time
X_text = clean_text(df['tweet_text'])

Completed Text with 19208 vectors.
CPU times: user 32.1 s, sys: 172 ms, total: 32.3 s
Wall time: 32.3 s


## Count Vectorize Text

In [11]:
vectorizer = CountVectorizer()

In [12]:
%%time
X = vectorizer.fit_transform(X_text)

CPU times: user 220 ms, sys: 16 µs, total: 220 ms
Wall time: 220 ms


In [13]:
X.shape

(19208, 23102)

## Product Train, Test split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Train and model Multinomial Naive Bayes model

In [15]:
clf = MultinomialNB()

In [16]:
clf.fit(X_train, y_train)

In [17]:
pred = clf.predict(X_test)

In [18]:
# clf.score(pred, y_test)

In [19]:
pred.shape

(4802,)

In [20]:
model_1 = accuracy_score(y_test, pred)

In [21]:
model_1

0.6432736359850062

## Establish prior counts

#### This is wrong; we have data leakage here....

In [22]:
p_df = df.groupby('class_label').count()

In [23]:
p_df = p_df.reset_index()
p_df

Unnamed: 0,class_label,tweet_id,tweet_text
0,caution_and_advice,987,987
1,displaced_people_and_evacuations,1129,1129
2,infrastructure_and_utility_damage,3293,3293
3,injured_or_dead_people,1482,1482
4,not_humanitarian,959,959
5,other_relevant_information,4214,4214
6,requests_or_urgent_needs,856,856
7,rescue_volunteering_or_donation_effort,4701,4701
8,sympathy_and_support,1587,1587


In [24]:
p_df['values'] = p_df.apply(lambda x: value_obj[x['class_label']], axis = 1)

In [25]:
p_df = p_df.sort_values('values')
p_df

Unnamed: 0,class_label,tweet_id,tweet_text,values
8,sympathy_and_support,1587,1587,0
3,injured_or_dead_people,1482,1482,1
5,other_relevant_information,4214,4214,2
7,rescue_volunteering_or_donation_effort,4701,4701,3
2,infrastructure_and_utility_damage,3293,3293,4
4,not_humanitarian,959,959,5
6,requests_or_urgent_needs,856,856,6
0,caution_and_advice,987,987,7
1,displaced_people_and_evacuations,1129,1129,8


In [26]:
p_df['proportion'] = p_df.apply(lambda x: x['tweet_id']/19208, axis = 1)

In [27]:
p_df

Unnamed: 0,class_label,tweet_id,tweet_text,values,proportion
8,sympathy_and_support,1587,1587,0,0.082622
3,injured_or_dead_people,1482,1482,1,0.077155
5,other_relevant_information,4214,4214,2,0.219388
7,rescue_volunteering_or_donation_effort,4701,4701,3,0.244742
2,infrastructure_and_utility_damage,3293,3293,4,0.171439
4,not_humanitarian,959,959,5,0.049927
6,requests_or_urgent_needs,856,856,6,0.044565
0,caution_and_advice,987,987,7,0.051385
1,displaced_people_and_evacuations,1129,1129,8,0.058778


In [28]:
class_prior = list(p_df['proportion'])

## Multinomial Naive Bayes with priors

In [29]:
clf2 = MultinomialNB(class_prior=class_prior)

In [30]:
clf2.fit(X_train, y_train)

In [31]:
pred2 = clf2.predict(X_test)

In [32]:
model_2 = accuracy_score(y_test, pred2)
model_2

0.6430653894210746

In [33]:
model_1, model_2

(0.6432736359850062, 0.6430653894210746)