# Data Cleaning

In [132]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [133]:
import re
import string
import numpy as np
import pandas as pd

In [134]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/axelc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/axelc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/axelc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/axelc/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [135]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [136]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [137]:
pd.set_option('display.max_colwidth', None)

In [138]:
df = pd.read_csv('../raw_data/train.csv')
cleaned_df = df.drop(columns=['location', 'keyword'])

In [139]:
cleaned_df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,"13,000 people receive #wildfires evacuation orders in California",1
4,7,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [140]:
def remove_punctuation(text):
    for p in string.punctuation:
        text = text.replace(p, '') 
    return text

In [141]:
def remove_digit(text):
    return ''.join(c for c in text if not c.isdigit())

In [142]:
def expand_words(text):
    expansion_dict = {
        "ain't": "are not", 
        "'s": " is", 
        "aren't": "are not", 
        "don't": "do not",
        "didn't": "did not", 
        "won't": "will not",
        "can't": "cannot"
    }
    for word, exp in expansion_dict.items():
        text = text.replace(word, exp)
    return text

In [143]:
def remove_stopwords(text, language='english'):
    stop_words = set(stopwords.words(language))
    return [w for w in word_tokenize(text) if not w in stop_words]

In [144]:
def lemmatize_text(text):
    lemmatizer  = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text])

In [145]:
cleaned_df['text'] = cleaned_df['text'].apply(lambda text: re.sub(r'http\S+', '', text))
cleaned_df['text'] = cleaned_df['text'].apply(remove_punctuation)
cleaned_df['text'] = cleaned_df['text'].apply(remove_digit)
cleaned_df['text'] = cleaned_df['text'].apply(lambda text: text.lower())
cleaned_df['text'] = cleaned_df['text'].apply(expand_words)
cleaned_df['text'] = cleaned_df['text'].apply(lambda text: text.strip())

cleaned_df['text'] = cleaned_df['text'].apply(remove_stopwords)
cleaned_df['text'] = cleaned_df['text'].apply(lemmatize_text)

cleaned_df.head(20)

Unnamed: 0,id,text,target
0,1,deed reason earthquake may allah forgive u,1
1,4,forest fire near la ronge sask canada,1
2,5,resident asked shelter place notified officer evacuation shelter place order expected,1
3,6,people receive wildfire evacuation order california,1
4,7,got sent photo ruby alaska smoke wildfire pours school,1
5,8,rockyfire update california hwy closed direction due lake county fire cafire wildfire,1
6,10,flood disaster heavy rain cause flash flooding street manitou colorado spring area,1
7,13,im top hill see fire wood,1
8,14,there emergency evacuation happening building across street,1
9,15,im afraid tornado coming area,1


# MultinomialNB Model

In [146]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate, train_test_split

In [147]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))
X = vectorizer.fit_transform(cleaned_df['text'])
y = cleaned_df['target']

Note : 
- Might need to remove @username
- Look at spellcheck
- Look at abreviation checker

In [148]:
vectorizer.get_feature_names()



['aa',
 'aaaa',
 'aaaaaaallll',
 'aaaaaand',
 'aaarrrgghhh',
 'aaceorg',
 'aal',
 'aampb',
 'aampw',
 'aan',
 'aannnnd',
 'aar',
 'aaronthefm',
 'aashiqui',
 'ab',
 'aba',
 'abandon',
 'abandoned',
 'abandonedpics',
 'abandoning',
 'abbandoned',
 'abbott',
 'abbruchsimulator',
 'abbswinston',
 'abbyairshow',
 'abc',
 'abcchicago',
 'abceyewitness',
 'abcnews',
 'abcnorio',
 'abe',
 'aberdeen',
 'aberdeenfanpage',
 'aberdeenfc',
 'aberystwythshrewsbury',
 'abes',
 'abha',
 'abia',
 'ability',
 'abject',
 'ablaze',
 'able',
 'ableg',
 'abninfvet',
 'aboard',
 'abomb',
 'abombed',
 'abomination',
 'abortion',
 'abouts',
 'abrancaballero',
 'absence',
 'absolute',
 'absolutely',
 'absolutsumya',
 'abstorm',
 'abstract',
 'absurd',
 'absurdly',
 'abubaraa',
 'abuse',
 'abused',
 'abuseddesolateamplost',
 'abusing',
 'abysmaljoiner',
 'ac',
 'acaciapenn',
 'academia',
 'acarewornheart',
 'acc',
 'accept',
 'accepte',
 'accepts',
 'access',
 'accident',
 'accidentally',
 'accidentalprophecy',

In [149]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [150]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

MultinomialNB()

In [151]:
cv_score = cross_validate(nb_model, X, y, cv=5, scoring=['f1', 'accuracy'])
print('F1 Score:', cv_score['test_f1'].mean())
print('Accuracy Score:', cv_score['test_accuracy'].mean())

F1 Score: 0.6500120499210136
Accuracy Score: 0.7290239110683924


In [152]:
y_pred = nb_model.predict(X_test)

In [153]:
tmp = [
    "The house is fine",
    "OH NO AN EARTHQUAKE",
    "Rengoku was ablaze at the end"
]

tmp_vec = vectorizer.transform(tmp)
nb_model.predict(tmp_vec)

array([1, 1, 1])

# GRID SEARCH

In [154]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [155]:
X = cleaned_df['text']
y = cleaned_df['target']

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [186]:
pipeline = Pipeline([
    ('vec', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

parameters = {
    'vec__ngram_range': ((1,1), (1,2), (2,2), (1,3), (2,3), (3,3)),
    'vec__min_df': (1, 5, 10),
    'nb__alpha': (0.1,1,10),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "f1", 
                           refit=True, cv=5)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 1, 10), 'vec__min_df': (1, 5, 10),
                         'vec__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3),
                                              (2, 3), (3, 3))},
             scoring='f1', verbose=1)

In [187]:
grid_search.best_params_

{'nb__alpha': 0.1, 'vec__min_df': 1, 'vec__ngram_range': (1, 2)}

In [188]:
grid_search.best_score_

0.7314213862718649