In [1]:
import pickle
with open('corpus0.pkl','rb') as corpus_0:
    corpus0 = pickle.load(corpus_0)
    
with open('corpus1.pkl','rb') as corpus_1:
    corpus1 = pickle.load(corpus_1)
    
with open('train_data.pkl','rb') as training:
    train = pickle.load(training)

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

plt.style.use(style='seaborn')
%matplotlib inline

In [3]:
#Further cleaning
#Removing Links and underscores
#The text has been stemmed previously
train['text'] = train['text'].str.replace('http\S+',"")
train['text'] = train['text'].str.replace('_' ,' ')

In [4]:
train['text']

0               deed reason earthquak may allah forgiv us
1                    forest fire near la rong sask canada
2       resid ask shelter place notifi offic evacu she...
3             peopl receiv wildfir evacu order california
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7608       two giant crane hold bridg collaps nearbi home
7609    aria ahrari thetawniest control wild fire cali...
7610                                       volcano hawaii
7611    polic investig ebik collid car littl portug eb...
7612    latest home raze northern california wildfir a...
Name: text, Length: 7613, dtype: object

In [5]:
#Tokenizing Text again to account for new cleanup
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

train["tokens"] = train["text"].map(tokenizer.tokenize)

In [6]:
train.head()

Unnamed: 0,id,keyword,location,text,target,tokens,words_count,text_len,unique_word_count,mean_word_length
0,1,-99,-99,deed reason earthquak may allah forgiv us,1,"[deed, reason, earthquak, may, allah, forgiv, us]",7,44,7,5.428571
1,4,-99,-99,forest fire near la rong sask canada,1,"[forest, fire, near, la, rong, sask, canada]",7,37,7,4.428571
2,5,-99,-99,resid ask shelter place notifi offic evacu she...,1,"[resid, ask, shelter, place, notifi, offic, ev...",11,88,9,7.090909
3,6,-99,-99,peopl receiv wildfir evacu order california,1,"[peopl, receiv, wildfir, evacu, order, califor...",6,54,6,8.0
4,7,-99,-99,got sent photo rubi alaska smoke wildfir pour ...,1,"[got, sent, photo, rubi, alaska, smoke, wildfi...",9,55,9,5.222222


In [7]:
train.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7603,7604,7605,7606,7607,7608,7609,7610,7611,7612
id,1,4,5,6,7,8,10,13,14,15,...,10862,10863,10864,10866,10867,10869,10870,10871,10872,10873
keyword,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
location,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
text,deed reason earthquak may allah forgiv us,forest fire near la rong sask canada,resid ask shelter place notifi offic evacu she...,peopl receiv wildfir evacu order california,got sent photo rubi alaska smoke wildfir pour ...,rockyfir updat california hwi close direct due...,flood disast heavi rain caus flash flood stree...,im top hill see fire wood,there emerg evacu happen build across street,im afraid tornado come area,...,offici say quarantin place alabama home possib...,worldnew fallen powerlin glink tram updat fire...,flip side im walmart bomb everyon evacu stay t...,suicid bomber kill saudi secur site mosqu reut...,stormchas violent record break el reno oklahom...,two giant crane hold bridg collaps nearbi home,aria ahrari thetawniest control wild fire cali...,volcano hawaii,polic investig ebik collid car littl portug eb...,latest home raze northern california wildfir a...
target,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


#### Count Vectorizer

def generate_ngrams(s, n):
    
    stop = set(stopwords.words("english")) if stop else {}
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[token[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

#Unigrams
vectorizer = CountVectorizer()
train_countvec = vectorizer.fit_transform(train['text'])
data_table = pd.DataFrame(train_countvec.toarray(),columns= vectorizer.get_feature_names())
data_table

Unnamed: 0,aa,aaaa,aaaaaaallll,aaaaaand,aaarrrgghhh,aace,aamir,aampb,aampw,aan,...,ûïwhen,ûïymcglaun,ûïyou,ûò,ûòthe,ûòåêcnbc,ûó,ûóher,ûókodi,ûûif
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
print(vectorizer.get_feature_names()[0:11])

['aa', 'aaaa', 'aaaaaaallll', 'aaaaaand', 'aaarrrgghhh', 'aace', 'aamir', 'aampb', 'aampw', 'aan', 'aannnnd']


In [10]:
#Unigrams
print(vectorizer.vocabulary_)



In [11]:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
train_countvec2 = vectorizer2.fit_transform(train['text'])
data_table1 = pd.DataFrame(train_countvec2.toarray(),columns= vectorizer2.get_feature_names())
data_table1

Unnamed: 0,aa ayyo,aa batteri,aaaa ok,aaaaaaallll iûªm,aaaaaand there,aace org,aamir javaid,aampb pipelin,aampw help,aan den,...,ûó offici,ûó oh,ûó organ,ûó richmond,ûó rt,ûó stori,ûó wallybait,ûóher upper,ûókodi vine,ûûif lost
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
#Bigrams
print(vectorizer2.vocabulary_)



In [13]:
vectorizer3 = CountVectorizer(analyzer='word', ngram_range=(1, 2))
train_countVec = vectorizer3.fit_transform(train['text'])
count_table = pd.DataFrame(train_countVec.toarray(),columns= vectorizer3.get_feature_names())
count_table

Unnamed: 0,aa,aa ayyo,aa batteri,aaaa,aaaa ok,aaaaaaallll,aaaaaaallll iûªm,aaaaaand,aaaaaand there,aaarrrgghhh,...,ûó richmond,ûó rt,ûó stori,ûó wallybait,ûóher,ûóher upper,ûókodi,ûókodi vine,ûûif,ûûif lost
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
print(vectorizer3.vocabulary_)



In [15]:
test = pd.read_csv('test.csv')

In [16]:
test_countVec = vectorizer3.transform(test['text'])
count_test = pd.DataFrame(test_countVec.toarray(),columns= vectorizer3.get_feature_names())
count_test

Unnamed: 0,aa,aa ayyo,aa batteri,aaaa,aaaa ok,aaaaaaallll,aaaaaaallll iûªm,aaaaaand,aaaaaand there,aaarrrgghhh,...,ûó richmond,ûó rt,ûó stori,ûó wallybait,ûóher,ûóher upper,ûókodi,ûókodi vine,ûûif,ûûif lost
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3259,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3260,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [18]:
X = train_countVec
y = train["target"].values

X_train_count, X_test, y_train_count, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
model = LogisticRegression(class_weight="balanced")
model.fit(X_train_count, y_train_count)

LogisticRegression(class_weight='balanced')

In [20]:
y_pred = model.predict(X_test)

f1score = f1_score(y_test, y_pred)
print(f"Counts Model Score: {f1score * 100} %")

Counts Model Score: 74.77982385908726 %


In [21]:
train

Unnamed: 0,id,keyword,location,text,target,tokens,words_count,text_len,unique_word_count,mean_word_length
0,1,-99,-99,deed reason earthquak may allah forgiv us,1,"[deed, reason, earthquak, may, allah, forgiv, us]",7,44,7,5.428571
1,4,-99,-99,forest fire near la rong sask canada,1,"[forest, fire, near, la, rong, sask, canada]",7,37,7,4.428571
2,5,-99,-99,resid ask shelter place notifi offic evacu she...,1,"[resid, ask, shelter, place, notifi, offic, ev...",11,88,9,7.090909
3,6,-99,-99,peopl receiv wildfir evacu order california,1,"[peopl, receiv, wildfir, evacu, order, califor...",6,54,6,8.000000
4,7,-99,-99,got sent photo rubi alaska smoke wildfir pour ...,1,"[got, sent, photo, rubi, alaska, smoke, wildfi...",9,55,9,5.222222
...,...,...,...,...,...,...,...,...,...,...
7608,10869,-99,-99,two giant crane hold bridg collaps nearbi home,1,"[two, giant, crane, hold, bridg, collaps, near...",8,54,8,5.750000
7609,10870,-99,-99,aria ahrari thetawniest control wild fire cali...,1,"[aria, ahrari, thetawniest, control, wild, fir...",11,88,11,7.090909
7610,10871,-99,-99,volcano hawaii,1,"[volcano, hawaii]",2,18,2,6.500000
7611,10872,-99,-99,polic investig ebik collid car littl portug eb...,1,"[polic, investig, ebik, collid, car, littl, po...",14,113,13,7.142857


In [22]:
#Replace the former train_data pickle with the new one
import pickle
with open('train_data.pkl','wb') as f:
    pickle.dump(train,f)

#### RidgeClassifier

In [25]:
from sklearn import linear_model, preprocessing, model_selection

In [26]:
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(clf, train_countVec, train["target"], cv=3, scoring="f1")
scores

array([0.59785674, 0.53204093, 0.59334764])

In [28]:
test_vectors = vectorizer3.transform(test["text"])


In [29]:
clf.fit(train_countVec, train["target"])

RidgeClassifier()

In [30]:
y_prediction = clf.predict(X_test)

f1score = f1_score(y_test, y_prediction)
print(f"Counts Model Score: {f1score * 100} %")

Counts Model Score: 97.96557120500782 %


In [31]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission["target"] = clf.predict(test_vectors)

In [32]:
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [33]:
sample_submission.to_csv("output.csv", index=False)

In [34]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission["target"] = model.predict(test_vectors)

In [35]:
sample_submission.to_csv("output-1.csv", index=False)