# ML Pipeline

### 1. Import libraries and load data from database.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table("disaster_messages", con=engine)

In [5]:
# Display df dataframe
df = pd.read_sql("SELECT * from disaster_messages",con=engine)

In [7]:
X = df['message']
y = df.iloc[:, 4:]

### 2. Write a tokenization function to process your text data

In [8]:
# Tokenization function
def tokenize(disaster_text):

    """
    Function to tokenize text.
    """

    tokens = word_tokenize(disaster_text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens=[]
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### 3. Build a machine learning pipeline

In [9]:
# Building an ML pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline

In [10]:
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [11]:
# Fit the pipeline
pipeline.fit(X_train, y_train)



### 5. Test your model

In [12]:
# Make a prediction using the test set
y_pred = pipeline.predict(X_test)

In [13]:
# Testing the model
def test_model(y_test, y_pred):

    """
    Function to iterate through columns and call sklearn classification report on each.
    """
    for index, column in enumerate(y_test):
        print(column, classification_report(y_test[column], y_pred[:, index]))

In [14]:
# Testing the model using the test and the prediction
test_model(y_test, y_pred)

('related',)               precision    recall  f1-score   support

           0       0.74      0.27      0.39      1482
           1       0.81      0.97      0.88      5019
           2       0.83      0.09      0.17        53

    accuracy                           0.81      6554
   macro avg       0.80      0.44      0.48      6554
weighted avg       0.80      0.81      0.77      6554

('request',)               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5408
           1       0.89      0.44      0.59      1146

    accuracy                           0.89      6554
   macro avg       0.89      0.71      0.76      6554
weighted avg       0.89      0.89      0.88      6554

('offer',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6525
           1       0.00      0.00      0.00        29

    accuracy                           1.00      6554
   macro avg       0.50      0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('food',)               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5812
           1       0.91      0.40      0.56       742

    accuracy                           0.93      6554
   macro avg       0.92      0.70      0.76      6554
weighted avg       0.93      0.93      0.91      6554

('shelter',)               precision    recall  f1-score   support

           0       0.93      1.00      0.96      5983
           1       0.91      0.21      0.34       571

    accuracy                           0.93      6554
   macro avg       0.92      0.60      0.65      6554
weighted avg       0.93      0.93      0.91      6554

('clothing',)               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6451
           1       0.83      0.05      0.09       103

    accuracy                           0.98      6554
   macro avg       0.91      0.52      0.54      6554
weighted avg       0.98      0.98    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
test_model(y_test, y_pred)

('related',)               precision    recall  f1-score   support

           0       0.74      0.27      0.39      1482
           1       0.81      0.97      0.88      5019
           2       0.83      0.09      0.17        53

    accuracy                           0.81      6554
   macro avg       0.80      0.44      0.48      6554
weighted avg       0.80      0.81      0.77      6554

('request',)               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5408
           1       0.89      0.44      0.59      1146

    accuracy                           0.89      6554
   macro avg       0.89      0.71      0.76      6554
weighted avg       0.89      0.89      0.88      6554

('offer',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6525
           1       0.00      0.00      0.00        29

    accuracy                           1.00      6554
   macro avg       0.50      0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('shelter',)               precision    recall  f1-score   support

           0       0.93      1.00      0.96      5983
           1       0.91      0.21      0.34       571

    accuracy                           0.93      6554
   macro avg       0.92      0.60      0.65      6554
weighted avg       0.93      0.93      0.91      6554

('clothing',)               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6451
           1       0.83      0.05      0.09       103

    accuracy                           0.98      6554
   macro avg       0.91      0.52      0.54      6554
weighted avg       0.98      0.98      0.98      6554

('money',)               precision    recall  f1-score   support

           0       0.98      1.00      0.99      6402
           1       0.88      0.05      0.09       152

    accuracy                           0.98      6554
   macro avg       0.93      0.52      0.54      6554
weighted avg       0.98      0.98   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. Improve your model

In [16]:
# Using grid search to find better parameters
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x0000018C7EF748B0>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x0000018C7EF748B0>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(disaster_text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf'

In [17]:
# specifying parameters for grid search
parameters = {
    'clf__estimator__n_estimators' : [50, 100]
}

In [18]:
# creating grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)

cv

In [19]:
# Fit the train datasets
cv.fit(X_train, y_train)



In [20]:
# Using cv to find the best parameter
cv.best_params_

{'clf__estimator__n_estimators': 100}

### 7. Test your model

In [21]:
# Performing predictions on the test dataset using cv
y_pred = cv.predict(X_test)

In [22]:
# Test the model using the test and prediction dataset
test_model(y_test, y_pred)

('related',)               precision    recall  f1-score   support

           0       0.72      0.26      0.38      1482
           1       0.81      0.97      0.88      5019
           2       0.80      0.08      0.14        53

    accuracy                           0.80      6554
   macro avg       0.78      0.44      0.47      6554
weighted avg       0.79      0.80      0.76      6554

('request',)               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5408
           1       0.89      0.42      0.58      1146

    accuracy                           0.89      6554
   macro avg       0.89      0.71      0.76      6554
weighted avg       0.89      0.89      0.87      6554

('offer',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6525
           1       0.00      0.00      0.00        29

    accuracy                           1.00      6554
   macro avg       0.50      0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('military',)               precision    recall  f1-score   support

           0       0.97      1.00      0.98      6334
           1       0.62      0.05      0.08       220

    accuracy                           0.97      6554
   macro avg       0.80      0.52      0.53      6554
weighted avg       0.96      0.97      0.95      6554

('child_alone',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6554

    accuracy                           1.00      6554
   macro avg       1.00      1.00      1.00      6554
weighted avg       1.00      1.00      1.00      6554

('water',)               precision    recall  f1-score   support

           0       0.95      1.00      0.97      6111
           1       0.93      0.24      0.38       443

    accuracy                           0.95      6554
   macro avg       0.94      0.62      0.68      6554
weighted avg       0.95      0.95      0.93      6554

('food',)               precisi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('weather_related',)               precision    recall  f1-score   support

           0       0.86      0.97      0.91      4723
           1       0.87      0.59      0.71      1831

    accuracy                           0.86      6554
   macro avg       0.86      0.78      0.81      6554
weighted avg       0.86      0.86      0.85      6554

('floods',)               precision    recall  f1-score   support

           0       0.95      1.00      0.97      6008
           1       0.93      0.40      0.56       546

    accuracy                           0.95      6554
   macro avg       0.94      0.70      0.77      6554
weighted avg       0.95      0.95      0.94      6554

('storm',)               precision    recall  f1-score   support

           0       0.94      0.99      0.96      5947
           1       0.78      0.39      0.52       607

    accuracy                           0.93      6554
   macro avg       0.86      0.69      0.74      6554
weighted avg       0.93      0

In [23]:
# Displaying the accuracy score
accuracy = (y_pred == y_test).mean()
accuracy

('related',)                   0.802106
('request',)                   0.890601
('offer',)                     0.995575
('aid_related',)               0.769759
('medical_help',)              0.923406
('medical_products',)          0.950565
('search_and_rescue',)         0.971468
('security',)                  0.980470
('military',)                  0.967043
('child_alone',)               1.000000
('water',)                     0.947208
('food',)                      0.926457
('shelter',)                   0.927373
('clothing',)                  0.985047
('money',)                     0.977418
('missing_people',)            0.987489
('refugees',)                  0.967043
('death',)                     0.956057
('other_aid',)                 0.874123
('infrastructure_related',)    0.935154
('transport',)                 0.953158
('buildings',)                 0.947818
('electricity',)               0.980317
('tools',)                     0.993592
('hospitals',)                 0.989930


### 8. Try improving your model further.

In [24]:
from custom_transformer import StartingVerbExtractor
from sklearn import multioutput

#trying to add another feature.

upd_pipeline = Pipeline([
    ('features', FeatureUnion ([

        ('text_pipeline', Pipeline ([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),
        ('starting_verb', StartingVerbExtractor ())
    ])),

    ('clf', multioutput.MultiOutputClassifier (RandomForestClassifier ()))
])

# train SVM classifier
upd_pipeline.fit(X_train, y_train)


[nltk_data] Downloading package punkt to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Just Me\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [25]:
y_pred_upd = upd_pipeline.predict (X_test)
#converting to dataframe
y_pred_upd = pd.DataFrame (y_pred_upd, columns = y_test.columns)

### 9. Export your model as a pickle file

In [26]:
filename = 'classifier.pkl'
pickle.dump(cv, open(filename, 'wb'))