# Data Cleaning

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import re
import string
import numpy as np
import pandas as pd

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
df = pd.read_csv('../raw_data/train.csv')
cleaned_df = df.drop(columns=['location', 'keyword'])

In [None]:
cleaned_df.head()

In [None]:
def remove_punctuation(text):
    for p in string.punctuation:
        text = text.replace(p, '') 
    return text

In [None]:
def remove_digit(text):
    return ''.join(c for c in text if not c.isdigit())

In [None]:
def expand_words(text):
    expansion_dict = {
        "ain't": "are not", 
        "'s": " is", 
        "aren't": "are not", 
        "don't": "do not",
        "didn't": "did not", 
        "won't": "will not",
        "can't": "cannot"
    }
    for word, exp in expansion_dict.items():
        text = text.replace(word, exp)
    return text

In [None]:
def remove_stopwords(text, language='english'):
    stop_words = set(stopwords.words(language))
    return [w for w in word_tokenize(text) if not w in stop_words]

In [None]:
def lemmatize_text(text):
    lemmatizer  = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text])

In [None]:
def remove_repeated_char(words):
    for w in words:
        if re.search(r'(.)\1{2}', w):
            words.remove(w)
    return words

In [None]:
cleaned_df['text'] = cleaned_df['text'].apply(lambda text: re.sub(r'http\S+', '', text))
cleaned_df['text'] = cleaned_df['text'].apply(remove_punctuation)
cleaned_df['text'] = cleaned_df['text'].apply(remove_digit)
cleaned_df['text'] = cleaned_df['text'].apply(lambda text: text.lower())
cleaned_df['text'] = cleaned_df['text'].apply(expand_words)
cleaned_df['text'] = cleaned_df['text'].apply(lambda text: text.strip())

cleaned_df['text'] = cleaned_df['text'].apply(remove_stopwords)
cleaned_df['text'] = cleaned_df['text'].apply(remove_repeated_char)
cleaned_df['text'] = cleaned_df['text'].apply(lemmatize_text)

cleaned_df.head(20)

# MultinomialNB Model

In [221]:
from NLP_Natural_Disasters.data import get_data, clean_data

In [222]:
cleaned_df = clean_data(get_data())
cleaned_df.shape

(7613, 3)

In [223]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate, train_test_split

In [224]:
vectorizer = TfidfVectorizer(max_features=3000, min_df=0.0007, max_df=0.8, ngram_range=(1,2))
X = vectorizer.fit_transform(cleaned_df['text'])
y = cleaned_df['target']

In [225]:
vectorizer.get_feature_names()



['aba',
 'aba woman',
 'abandoned',
 'abandoned aircraft',
 'abc',
 'abc news',
 'abcnews',
 'ablaze',
 'able',
 'absolutely',
 'abstorm',
 'abuse',
 'access',
 'accident',
 'accident expert',
 'accident man',
 'according',
 'account',
 'acre',
 'across',
 'act',
 'action',
 'action year',
 'activity',
 'actually',
 'add',
 'added',
 'added video',
 'address',
 'admits',
 'admits arson',
 'adult',
 'advance',
 'advisory',
 'af',
 'affected',
 'affected fatal',
 'afghan',
 'afghanistan',
 'africa',
 'afternoon',
 'aftershock',
 'aftershock djicemoon',
 'age',
 'ago',
 'agree',
 'ah',
 'ahead',
 'aid',
 'aim',
 'aint',
 'air',
 'air accident',
 'air ambulance',
 'aircraft',
 'aircraft debris',
 'airline',
 'airplane',
 'airplane accident',
 'airplane debris',
 'airport',
 'airport get',
 'aka',
 'al',
 'alabama',
 'alabama home',
 'alarm',
 'alaska',
 'album',
 'alert',
 'alive',
 'allah',
 'alleged',
 'allow',
 'allows',
 'allows parole',
 'almost',
 'alone',
 'along',
 'already',
 'als

In [226]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [227]:
nb_model = MultinomialNB(alpha=0.1)
nb_model.fit(X_train, y_train)

MultinomialNB(alpha=0.1)

In [228]:
cv_score = cross_validate(nb_model, X, y, cv=5, scoring=['f1', 'accuracy'])
print('F1 Score:', cv_score['test_f1'].mean())
print('F1 Score:', cv_score['test_accuracy'].mean())

F1 Score: 0.6366071824456155
F1 Score: 0.7009152694169041


In [210]:
y_pred = nb_model.predict(X_test)

In [218]:
tmp = [
    "The house is fine",
    "OH NO AN EARTHQUAKE",
    "Rengoku was ablaze at the end",
    "This is a nice little fire"
]

tmp_vec = vectorizer.transform(tmp)
nb_model.predict(tmp_vec)

array([0, 1, 0, 0])

# GRID SEARCH

In [44]:
from NLP_Natural_Disasters.data import get_data, clean_data

In [57]:
cleaned_df = clean_data(get_data())
cleaned_df.shape

(7613, 3)

In [58]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [59]:
X = cleaned_df['text']
y = cleaned_df['target']

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [61]:
pipeline = Pipeline([
    ('vec', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

parameters = {
    'vec__ngram_range': ((1,1), (1,2), (2,2), (1,3), (2,3), (3,3)),
    'vec__min_df': (0.0007, 0.0008, 0.0009, 0.001),
    'vec__max_df': (0.8, 0.9, 0.95, 0.99, 1.0),
    'vec__max_features': (1000, 1500, 2000, 2500, 3000),
    'nb__alpha': (0.1,1,10),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "f1",
                           refit=True, cv=5)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 1, 10),
                         'vec__max_df': (0.8, 0.9, 0.95, 0.99, 1.0),
                         'vec__max_features': (1000, 1500, 2000, 2500, 3000),
                         'vec__min_df': (0.0007, 0.0008, 0.0009, 0.001),
                         'vec__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3),
                                              (2, 3), (3, 3))},
             scoring='f1', verbose=1)

In [62]:
grid_search.best_params_

{'nb__alpha': 0.1,
 'vec__max_df': 0.8,
 'vec__max_features': 3000,
 'vec__min_df': 0.0007,
 'vec__ngram_range': (1, 2)}

In [130]:
grid_search.best_score_

0.7424518478861719