# ML Pipeline

### 1. Import libraries and load data from database.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table("disaster_messages", con=engine)

In [3]:
# Display df dataframe
df = pd.read_sql("SELECT * from disaster_messages",con=engine)

In [4]:
X = df['message']
y = df.iloc[:, 4:]

### 2. Write a tokenization function to process your text data

In [5]:
# Tokenization function
def tokenize(disaster_text):

    """
    Function to tokenize text.
    """

    tokens = word_tokenize(disaster_text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens=[]
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### 3. Build a machine learning pipeline

In [6]:
# Building an ML pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline

In [7]:
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [8]:
# Fit the pipeline
pipeline.fit(X_train, y_train)



### 5. Test your model

In [9]:
# Make a prediction using the test set
y_pred = pipeline.predict(X_test)

In [10]:
# Testing the model
def test_model(y_test, y_pred):

    """
    Function to iterate through columns and call sklearn classification report on each.
    """
    for index, column in enumerate(y_test):
        print(column, classification_report(y_test[column], y_pred[:, index]))

In [11]:
# Testing the model using the test and the prediction
test_model(y_test, y_pred)

('related',)               precision    recall  f1-score   support

           0       0.73      0.27      0.39      1508
           1       0.81      0.97      0.88      4999
           2       0.50      0.11      0.18        47

    accuracy                           0.80      6554
   macro avg       0.68      0.45      0.48      6554
weighted avg       0.79      0.80      0.76      6554

('request',)               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5414
           1       0.90      0.42      0.57      1140

    accuracy                           0.89      6554
   macro avg       0.89      0.70      0.75      6554
weighted avg       0.89      0.89      0.87      6554

('offer',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6528
           1       0.00      0.00      0.00        26

    accuracy                           1.00      6554
   macro avg       0.50      0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('military',)               precision    recall  f1-score   support

           0       0.97      1.00      0.98      6337
           1       0.67      0.02      0.04       217

    accuracy                           0.97      6554
   macro avg       0.82      0.51      0.51      6554
weighted avg       0.96      0.97      0.95      6554

('child_alone',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6554

    accuracy                           1.00      6554
   macro avg       1.00      1.00      1.00      6554
weighted avg       1.00      1.00      1.00      6554

('water',)               precision    recall  f1-score   support

           0       0.95      1.00      0.97      6120
           1       0.89      0.22      0.35       434

    accuracy                           0.95      6554
   macro avg       0.92      0.61      0.66      6554
weighted avg       0.94      0.95      0.93      6554

('food',)               precisi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('weather_related',)               precision    recall  f1-score   support

           0       0.86      0.96      0.91      4703
           1       0.87      0.61      0.72      1851

    accuracy                           0.86      6554
   macro avg       0.86      0.79      0.81      6554
weighted avg       0.86      0.86      0.86      6554

('floods',)               precision    recall  f1-score   support

           0       0.95      1.00      0.97      6036
           1       0.91      0.39      0.55       518

    accuracy                           0.95      6554
   macro avg       0.93      0.69      0.76      6554
weighted avg       0.95      0.95      0.94      6554

('storm',)               precision    recall  f1-score   support

           0       0.94      0.99      0.97      5943
           1       0.81      0.41      0.54       611

    accuracy                           0.94      6554
   macro avg       0.88      0.70      0.75      6554
weighted avg       0.93      0

In [12]:
test_model(y_test, y_pred)

('related',)               precision    recall  f1-score   support

           0       0.73      0.27      0.39      1508
           1       0.81      0.97      0.88      4999
           2       0.50      0.11      0.18        47

    accuracy                           0.80      6554
   macro avg       0.68      0.45      0.48      6554
weighted avg       0.79      0.80      0.76      6554

('request',)               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5414
           1       0.90      0.42      0.57      1140

    accuracy                           0.89      6554
   macro avg       0.89      0.70      0.75      6554
weighted avg       0.89      0.89      0.87      6554

('offer',)               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6528
           1       0.00      0.00      0.00        26

    accuracy                           1.00      6554
   macro avg       0.50      0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('water',)               precision    recall  f1-score   support

           0       0.95      1.00      0.97      6120
           1       0.89      0.22      0.35       434

    accuracy                           0.95      6554
   macro avg       0.92      0.61      0.66      6554
weighted avg       0.94      0.95      0.93      6554

('food',)               precision    recall  f1-score   support

           0       0.92      0.99      0.96      5820
           1       0.84      0.35      0.49       734

    accuracy                           0.92      6554
   macro avg       0.88      0.67      0.72      6554
weighted avg       0.91      0.92      0.90      6554

('shelter',)               precision    recall  f1-score   support

           0       0.93      1.00      0.96      5968
           1       0.85      0.23      0.36       586

    accuracy                           0.93      6554
   macro avg       0.89      0.61      0.66      6554
weighted avg       0.92      0.93      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('weather_related',)               precision    recall  f1-score   support

           0       0.86      0.96      0.91      4703
           1       0.87      0.61      0.72      1851

    accuracy                           0.86      6554
   macro avg       0.86      0.79      0.81      6554
weighted avg       0.86      0.86      0.86      6554

('floods',)               precision    recall  f1-score   support

           0       0.95      1.00      0.97      6036
           1       0.91      0.39      0.55       518

    accuracy                           0.95      6554
   macro avg       0.93      0.69      0.76      6554
weighted avg       0.95      0.95      0.94      6554

('storm',)               precision    recall  f1-score   support

           0       0.94      0.99      0.97      5943
           1       0.81      0.41      0.54       611

    accuracy                           0.94      6554
   macro avg       0.88      0.70      0.75      6554
weighted avg       0.93      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. Improve your model

In [13]:
# Using grid search to find better parameters
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x000001B4C637EC20>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x000001B4C637EC20>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(disaster_text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf'

In [14]:
# specifying parameters for grid search
parameters = {
    'clf__estimator__n_estimators' : [50, 150]
}

In [15]:
# creating grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)

cv

In [None]:
# Fit the train datasets
cv.fit(X_train, y_train)



In [None]:
# Using cv to find the best parameter
cv.best_params_

### 7. Test your model

In [None]:
# Performing predictions on the test dataset using cv
y_pred = cv.predict(X_test)

In [None]:
# Test the model using the test and prediction dataset
test_model(y_test, y_pred)

In [None]:
# Displaying the accuracy score
accuracy = (y_pred == y_test).mean()
accuracy

### 8. Try improving your model further.

In [None]:
from custom_transformer import StartingVerbExtractor
from sklearn import multioutput

#trying to add another feature.

upd_pipeline = Pipeline([
    ('features', FeatureUnion ([

        ('text_pipeline', Pipeline ([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),
        ('starting_verb', StartingVerbExtractor ())
    ])),

    ('clf', multioutput.MultiOutputClassifier (RandomForestClassifier ()))
])

# train SVM classifier
upd_pipeline.fit(X_train, y_train)


In [None]:
y_pred_upd = upd_pipeline.predict (X_test)
#converting to dataframe
y_pred_upd = pd.DataFrame (y_pred_upd, columns = y_test.columns)

### 9. Export your model as a pickle file

In [None]:
filename = 'classifier.pkl'
pickle.dump(cv, open(filename, 'wb'))