In [166]:
import re
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords', 'omw-1.4'])
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

[nltk_data] Downloading package punkt to /Users/jwang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jwang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jwang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/jwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/jwang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [51]:
# load data
category_df = pd.read_csv('disaster_categories.csv')
message_df = pd.read_csv('disaster_messages.csv')

In [100]:
# Merge data
merge_df = category_df.merge(message_df, how = 'inner', on = 'id')
merge_df.head()

Unnamed: 0,id,categories,message,original,genre
0,2,related-1;request-0;offer-0;aid_related-0;medi...,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,related-1;request-0;offer-0;aid_related-1;medi...,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,related-1;request-0;offer-0;aid_related-0;medi...,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,related-1;request-1;offer-0;aid_related-1;medi...,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,related-1;request-0;offer-0;aid_related-0;medi...,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [101]:
# Duplicate data
merge_df = merge_df.drop_duplicates()
merge_df.shape

(26216, 5)

In [176]:
# Drop column which is not needed for modeling and has a bunch of missing values
df = merge_df.drop(['original', 'genre'], axis=1)
df.head()

Unnamed: 0,id,categories,message
0,2,related-1;request-0;offer-0;aid_related-0;medi...,Weather update - a cold front from Cuba that c...
1,7,related-1;request-0;offer-0;aid_related-1;medi...,Is the Hurricane over or is it not over
2,8,related-1;request-0;offer-0;aid_related-0;medi...,Looking for someone but no name
3,9,related-1;request-1;offer-0;aid_related-1;medi...,UN reports Leogane 80-90 destroyed. Only Hospi...
4,12,related-1;request-0;offer-0;aid_related-0;medi...,"says: west side of Haiti, rest of the country ..."


In [145]:
# Normalization and Tokenization  
df.message = [word_tokenize(re.sub(r"[^a-zA-Z0-9]", " ", text).lower()) for text in df.message]
df.categories = [word_tokenize(re.sub(r"[^a-zA-Z0-9]", " ", text).lower()) for text in df.categories]

In [146]:
# Stop words and Lemmatization
def lemmatize_stopwords_text(text):
    return [WordNetLemmatizer().lemmatize(w, pos='v').strip() for w in text if w not in stopwords.words("english")]

df.categories = df.categories.apply(lemmatize_stopwords_text)
df.message = df.message.apply(lemmatize_stopwords_text)

In [195]:
# Bag of words and TF-IDF
vectorizer = TfidfVectorizer(stop_words={'english'}, token_pattern=r"[^a-zA-Z0-9]")
df_1 = vectorizer.fit_transform(df.message.loc[0:10]) 
# df['categories_tfidf'] = [transformer.fit_transform(vect.fit_transform(txt)) for txt in df.categories]
df_1.toarray()

array([[0.98800615, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.15441455, 0.        , 0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.84058946, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.13137494, 0.52549975, 0.        ],
       [0.95630012, 0.        , 0.        , 0.        , 0.        ,
        0.16304645, 0.        , 0.        , 0.24270545],
       [0.90541816, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.42452086, 0.        , 0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.90918147, 0.23074691, 0.        , 0.        , 0.        ,
        0.31002571, 0.        , 0.15501286, 0.        ],


In [162]:
df.message.loc[0]

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pass', 'haiti']