# ML Pipeline

### 1. Import libraries and load data from database.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table("disaster_messages", con=engine)

In [3]:
# Display df dataframe
df = pd.read_sql("SELECT * from disaster_messages",con=engine)

In [4]:
X = df['message']
y = df.iloc[:, 4:]

### 2. Write a tokenization function to process your text data

In [5]:
# Tokenization function
def tokenize(disaster_text):

    """
    Function to tokenize text.
    """

    tokens = word_tokenize(disaster_text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens=[]
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### 3. Build a machine learning pipeline

In [6]:
# Building an ML pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline

In [7]:
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [8]:
# Fit the pipeline
pipeline.fit(X_train, y_train)



### 5. Test your model

In [9]:
# Make a prediction using the test set
y_pred = pipeline.predict(X_test)

In [10]:
# Testing the model
def test_model(y_test, y_pred):

    """
    Function to iterate through columns and call sklearn classification report on each.
    """
    for index, column in enumerate(y_test):
        print(column, classification_report(y_test[column], y_pred[:, index]))

In [11]:
# Testing the model using the test and the prediction
test_model(y_test, y_pred)

('related',)               precision    recall  f1-score   support

           0       0.74      0.27      0.40      1529
           1       0.81      0.97      0.88      4975
           2       0.57      0.08      0.14        50

    accuracy                           0.80      6554
   macro avg       0.71      0.44      0.47      6554
weighted avg       0.79      0.80      0.76      6554

('request',)               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5403
           1       0.88      0.42      0.57      1151

    accuracy                           0.89      6554
   macro avg       0.88      0.70      0.75      6554
weighted avg       0.89      0.89      0.87      6554

('offer',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6533
           1       0.00      0.00      0.00        21

    accuracy                           1.00      6554
   macro avg       0.50      0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('clothing',)               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6475
           1       1.00      0.01      0.02        79

    accuracy                           0.99      6554
   macro avg       0.99      0.51      0.51      6554
weighted avg       0.99      0.99      0.98      6554

('money',)               precision    recall  f1-score   support

           0       0.98      1.00      0.99      6400
           1       0.83      0.03      0.06       154

    accuracy                           0.98      6554
   macro avg       0.91      0.52      0.53      6554
weighted avg       0.97      0.98      0.97      6554

('missing_people',)               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6485
           1       0.00      0.00      0.00        69

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.98      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('tools',)               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6519
           1       0.00      0.00      0.00        35

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      0.99      0.99      6554

('hospitals',)               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6486
           1       0.00      0.00      0.00        68

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.98      0.99      0.98      6554

('shops',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6523
           1       0.00      0.00      0.00        31

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00    

In [12]:
test_model(y_test, y_pred)

('related',)               precision    recall  f1-score   support

           0       0.74      0.27      0.40      1529
           1       0.81      0.97      0.88      4975
           2       0.57      0.08      0.14        50

    accuracy                           0.80      6554
   macro avg       0.71      0.44      0.47      6554
weighted avg       0.79      0.80      0.76      6554

('request',)               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5403
           1       0.88      0.42      0.57      1151

    accuracy                           0.89      6554
   macro avg       0.88      0.70      0.75      6554
weighted avg       0.89      0.89      0.87      6554

('offer',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6533
           1       0.00      0.00      0.00        21

    accuracy                           1.00      6554
   macro avg       0.50      0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('money',)               precision    recall  f1-score   support

           0       0.98      1.00      0.99      6400
           1       0.83      0.03      0.06       154

    accuracy                           0.98      6554
   macro avg       0.91      0.52      0.53      6554
weighted avg       0.97      0.98      0.97      6554

('missing_people',)               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6485
           1       0.00      0.00      0.00        69

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.98      0.99      0.98      6554

('refugees',)               precision    recall  f1-score   support

           0       0.97      1.00      0.98      6348
           1       0.33      0.00      0.01       206

    accuracy                           0.97      6554
   macro avg       0.65      0.50      0.50      6554
weighted avg       0.95      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('buildings',)               precision    recall  f1-score   support

           0       0.96      1.00      0.98      6254
           1       0.81      0.10      0.17       300

    accuracy                           0.96      6554
   macro avg       0.88      0.55      0.58      6554
weighted avg       0.95      0.96      0.94      6554

('electricity',)               precision    recall  f1-score   support

           0       0.98      1.00      0.99      6430
           1       0.50      0.02      0.03       124

    accuracy                           0.98      6554
   macro avg       0.74      0.51      0.51      6554
weighted avg       0.97      0.98      0.97      6554

('tools',)               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6519
           1       0.00      0.00      0.00        35

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('floods',)               precision    recall  f1-score   support

           0       0.95      1.00      0.97      6025
           1       0.91      0.36      0.51       529

    accuracy                           0.95      6554
   macro avg       0.93      0.68      0.74      6554
weighted avg       0.94      0.95      0.93      6554

('storm',)               precision    recall  f1-score   support

           0       0.95      0.99      0.97      5991
           1       0.77      0.46      0.58       563

    accuracy                           0.94      6554
   macro avg       0.86      0.72      0.77      6554
weighted avg       0.94      0.94      0.94      6554

('fire',)               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6497
           1       0.00      0.00      0.00        57

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.98      0.99      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. Improve your model

In [13]:
# Using grid search to find better parameters
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x00000243BC7CEB90>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x00000243BC7CEB90>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(disaster_text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf'

In [14]:
# specifying parameters for grid search
parameters = {
    'clf__estimator__n_estimators' : [50, 150]
}

In [15]:
# creating grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)

cv

In [16]:
# Fit the train datasets
cv.fit(X_train, y_train)



In [17]:
# Using cv to find the best parameter
cv.best_params_

{'clf__estimator__n_estimators': 150}

### 7. Test your model

In [18]:
# Performing predictions on the test dataset using cv
y_pred = cv.predict(X_test)

In [19]:
# Test the model using the test and prediction dataset
test_model(y_test, y_pred)

('related',)               precision    recall  f1-score   support

           0       0.73      0.27      0.39      1529
           1       0.81      0.97      0.88      4975
           2       0.56      0.10      0.17        50

    accuracy                           0.80      6554
   macro avg       0.70      0.45      0.48      6554
weighted avg       0.79      0.80      0.76      6554

('request',)               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5403
           1       0.89      0.42      0.57      1151

    accuracy                           0.89      6554
   macro avg       0.89      0.70      0.75      6554
weighted avg       0.89      0.89      0.87      6554

('offer',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6533
           1       0.00      0.00      0.00        21

    accuracy                           1.00      6554
   macro avg       0.50      0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('search_and_rescue',)               precision    recall  f1-score   support

           0       0.97      1.00      0.99      6367
           1       1.00      0.03      0.05       187

    accuracy                           0.97      6554
   macro avg       0.99      0.51      0.52      6554
weighted avg       0.97      0.97      0.96      6554

('security',)               precision    recall  f1-score   support

           0       0.98      1.00      0.99      6439
           1       0.00      0.00      0.00       115

    accuracy                           0.98      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.97      0.98      0.97      6554

('military',)               precision    recall  f1-score   support

           0       0.97      1.00      0.98      6340
           1       1.00      0.02      0.04       214

    accuracy                           0.97      6554
   macro avg       0.98      0.51      0.51      6554
weighted avg       0.97

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('missing_people',)               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6485
           1       0.00      0.00      0.00        69

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.98      0.99      0.98      6554

('refugees',)               precision    recall  f1-score   support

           0       0.97      1.00      0.98      6348
           1       0.50      0.01      0.02       206

    accuracy                           0.97      6554
   macro avg       0.73      0.50      0.50      6554
weighted avg       0.95      0.97      0.95      6554

('death',)               precision    recall  f1-score   support

           0       0.96      1.00      0.98      6259
           1       0.85      0.12      0.21       295

    accuracy                           0.96      6554
   macro avg       0.91      0.56      0.59      6554
weighted avg       0.96      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('shops',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6523
           1       0.00      0.00      0.00        31

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      0.99      6554

('aid_centers',)               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6482
           1       0.00      0.00      0.00        72

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.98      0.99      0.98      6554

('other_infrastructure',)               precision    recall  f1-score   support

           0       0.96      1.00      0.98      6293
           1       0.00      0.00      0.00       261

    accuracy                           0.96      6554
   macro avg       0.48      0.50      0.49      6554
weighted avg       0

In [20]:
# Displaying the accuracy score
accuracy = (y_pred == y_test).mean()
accuracy

('related',)                   0.799359
('request',)                   0.888770
('offer',)                     0.996796
('aid_related',)               0.766707
('medical_help',)              0.922338
('medical_products',)          0.953158
('search_and_rescue',)         0.972231
('security',)                  0.982301
('military',)                  0.967958
('child_alone',)               1.000000
('water',)                     0.949496
('food',)                      0.926305
('shelter',)                   0.933476
('clothing',)                  0.988251
('money',)                     0.977113
('missing_people',)            0.989472
('refugees',)                  0.968569
('death',)                     0.959414
('other_aid',)                 0.867409
('infrastructure_related',)    0.938816
('transport',)                 0.958346
('buildings',)                 0.957736
('electricity',)               0.981080
('tools',)                     0.994660
('hospitals',)                 0.989625


### 8. Try improving your model further.

In [21]:
from custom_transformer import StartingVerbExtractor
from sklearn import multioutput

#trying to add another feature.

upd_pipeline = Pipeline([
    ('features', FeatureUnion ([

        ('text_pipeline', Pipeline ([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),
        ('starting_verb', StartingVerbExtractor ())
    ])),

    ('clf', multioutput.MultiOutputClassifier (RandomForestClassifier ()))
])

# train SVM classifier
upd_pipeline.fit(X_train, y_train)


[nltk_data] Downloading package punkt to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Just Me\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [22]:
y_pred_upd = upd_pipeline.predict (X_test)
#converting to dataframe
y_pred_upd = pd.DataFrame (y_pred_upd, columns = y_test.columns)

### 9. Export your model as a pickle file

In [23]:
filename = 'classifier.pkl'
pickle.dump(cv, open(filename, 'wb'))