In [27]:
import pickle
import re
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# load data from database
engine = create_engine('sqlite:///{}'.format('../data/disaster_response.db'))
df = pd.read_sql('messages', engine)
X = df.iloc[:, 1].values
y = df.iloc[:, 3:].values

In [19]:
df.head()

Unnamed: 0,id,message,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,direct,1,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Tokenization function to process text data

In [20]:
def tokenize(text):
    # Remove special characters
    no_punct = re.sub('[^A-Za-z0-9]+', ' ', text)
    # Tokenize and change text to lowercase
    tokens = word_tokenize(no_punct.lower())
    # Remove stop words if exist
    no_stop_words = [word for word in tokens if word not in stopwords.words('english')]
    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmed = [WordNetLemmatizer().lemmatize(w) for w in no_stop_words]
    lemmed = [WordNetLemmatizer().lemmatize(w, pos='v') for w in lemmed]
    return lemmed

### Machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 35 categories in the dataset.

In [7]:
pipeline = Pipeline([('count', CountVectorizer(tokenizer=tokenize)),
                    ('tfid', TfidfTransformer()),
                    ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators = 50, random_state=0)))])

In [22]:
# Train-test split (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('count', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
           n_jobs=1))])

### Testing model
Report the f1 score, precision and recall for each output category of the dataset

In [9]:
y_pred = pipeline.predict(X_test)

In [10]:
for i in range(0, 35):
    name = df.iloc[:, 3:].columns
    print(name[i])
    print(classification_report(y_test[:,i], y_pred[:,i]))

related
             precision    recall  f1-score   support

          0       0.69      0.42      0.53      1270
          1       0.84      0.94      0.89      3974

avg / total       0.80      0.82      0.80      5244

request
             precision    recall  f1-score   support

          0       0.90      0.98      0.94      4379
          1       0.81      0.46      0.58       865

avg / total       0.89      0.89      0.88      5244

offer
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5218
          1       0.00      0.00      0.00        26

avg / total       0.99      1.00      0.99      5244

aid_related
             precision    recall  f1-score   support

          0       0.78      0.85      0.82      3075
          1       0.76      0.67      0.71      2169

avg / total       0.77      0.78      0.77      5244

medical_help
             precision    recall  f1-score   support

          0       0.92      1.00      0

  'precision', 'predicted', average, warn_for)


### Improving model using gridsearch

In [11]:
parameters = {'clf__estimator__max_features':['auto', 'sqrt', 'log2'],
             'clf__estimator__n_estimators':[50, 60, 75]}

cv = GridSearchCV(estimator=pipeline, param_grid=parameters)

#### The following cell took a couple of hours to run so to avoid waiting, I ran it once and took the output to know which parameters were the best so I can make a newer pipeline with those parameters.

In [None]:
# cv.fit(X_train, y_train)
# y_pred = cv.predict(X_test)

#### The result parameters were:
#### {'clf__estimator__max_features': 'auto, 'clf__estimator__n_estimators': 75}

### Make the final improved pipeline

In [23]:
pipeline_improved = Pipeline([('count', CountVectorizer(tokenizer=tokenize)),
                    ('tfid', TfidfTransformer()),
                    ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators = 75, max_features = 'auto', random_state=0)))])

pipeline_improved.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('count',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize a...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

In [24]:
y_pred = pipeline_improved.predict(X_test)

In [25]:
for i in range(0, 35):
    name = df.iloc[:, 3:].columns
    print(name[i])
    print(classification_report(y_test[:,i], y_pred[:,i]))

related
              precision    recall  f1-score   support

           0       0.70      0.40      0.51      1203
           1       0.84      0.95      0.89      4002
           2       0.37      0.41      0.39        39

    accuracy                           0.82      5244
   macro avg       0.64      0.59      0.60      5244
weighted avg       0.81      0.82      0.80      5244

request
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      4350
           1       0.84      0.52      0.64       894

    accuracy                           0.90      5244
   macro avg       0.87      0.75      0.79      5244
weighted avg       0.90      0.90      0.89      5244

offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5225
           1       0.00      0.00      0.00        19

    accuracy                           1.00      5244
   macro avg       0.50      0.50      0.50      524

  _warn_prf(average, modifier, msg_start, len(result))



money
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5118
           1       1.00      0.04      0.08       126

    accuracy                           0.98      5244
   macro avg       0.99      0.52      0.53      5244
weighted avg       0.98      0.98      0.97      5244

missing_people
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5175
           1       0.00      0.00      0.00        69

    accuracy                           0.99      5244
   macro avg       0.49      0.50      0.50      5244
weighted avg       0.97      0.99      0.98      5244

refugees
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5055
           1       0.33      0.01      0.02       189

    accuracy                           0.96      5244
   macro avg       0.65      0.50      0.50      5244
weighted avg       0.94      0.96      0.95

### Export your model as a pickle file

In [28]:
pickle.dump(pipeline_improved, open('classifier.pkl', "wb"))