*Updated July 11, 2020*

# ML Pipeline Preparation

This notebook outlines the preparation of the machine learning pipeline, which is streamlined in the train_classifer.py script.

In [1]:
# imports
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import warnings
import re
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
warnings.simplefilter('ignore')

print("Imports successful. ")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mehrnazsiavoshi/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Imports successful. 


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mehrnazsiavoshi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
# load the 'Messages' table from the SQLite database into a DataFrame

# path is relative to the directory the notebook is run from
database_filepath = "../data/DisasterResponse.db"
engine = create_engine('sqlite:///' + database_filepath)
df = pd.read_sql_table('Messages',engine)

# preview the first rows
df.head()

Unnamed: 0,id,message,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,direct,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,Is the Hurricane over or is it not over,direct,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,8,Looking for someone but no name,direct,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,"says: west side of Haiti, rest of the country ...",direct,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# first run of the program gave a NaN error, so every row that holds
# a NaN or an infinite value in any column is removed
# (axis is passed by keyword: the positional form `.any(1)` is
# deprecated and is rejected by pandas >= 2.0)

df = df[~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

In [46]:
# explanatory variable (feature): the raw message text
x = df['message']

# dependent variables (targets): every remaining column, i.e. the categories
y = df.drop(['id', 'message', 'genre'], axis = 1)

In [47]:
# tokenization function 
def tokenize(text):
    """
    Normalizes, tokenizes, and stems a message
    
    Input: text - a single message string
    
    Output: list of lower-cased, stemmed word tokens with English
    stop words removed
    """
    # normalizing text: lower-case it, then replace every character that
    # is not a letter or digit with a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenizing
    tokens = word_tokenize(text)
    
    # a set gives constant-time membership tests; the list returned by
    # stopwords.words() would otherwise be scanned once per token
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    
    # removal of stop words, then stemming of the remaining tokens
    return [stemmer.stem(word) for word in tokens if word not in stop_words]

In [48]:
# machine learning pipeline that takes in message and outputs
# classification results on the 36 variables/categories

# random_state is fixed (same seed as the train/test split) so the
# fitted forests, and therefore the reported metrics, are reproducible
pipeline = Pipeline([('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(random_state = 13)))])

In [49]:
# split data into training and test sets; the fixed random_state makes
# the split repeatable (no test_size is given, so the library default applies)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=13)

# fit the pipeline on the training data only
pipe_fit = pipeline.fit(x_train, y_train)

In [50]:
# model test with the f1 score, precision, and recall for each category

def evaulation(ArrayL, ArrayP, names):
    """
    Evaluates the ML pipeline one output column at a time
    
    Inputs: ArrayL - 2-D array-like of real labels (rows are samples,
    columns are categories)
    ArrayP - 2-D array-like of predicted labels, same layout as ArrayL
    names - category names; names[i] labels column i of both arrays
    
    Output: all_metrics - DataFrame indexed by names with the columns
    Accuracy, Precision, Recall and F1 Score
    """
    # np.asarray lets callers pass DataFrames or nested lists as well as
    # NumPy arrays, which the [:, i] indexing below requires
    true_labels = np.asarray(ArrayL)
    predicted_labels = np.asarray(ArrayP)
    
    rows = []
    for i in range(len(names)):
        actual = true_labels[:, i]
        predicted = predicted_labels[:, i]
        
        rows.append([
            accuracy_score(actual, predicted),
            precision_score(actual, predicted),
            recall_score(actual, predicted),
            f1_score(actual, predicted),
        ])
    
    # building the frame from the row list (rather than from a NumPy
    # array) also works when names is empty
    all_metrics = pd.DataFrame(data=rows, index=names, columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    
    return all_metrics

In [51]:
# evaluate the training set

y_train_predictions = pipeline.predict(x_train)

# category names, in the same column order as y
names = list(y.columns.values)

print(evaulation(np.array(y_train), y_train_predictions, names))

# as expected for the training set, all values are very close to 1;
# this only shows the model fits the data it was trained on, so the
# test-set scores are the ones that show how well it generalizes

                        Accuracy  Precision    Recall  F1 Score
related                 0.997951   0.998929  0.998395  0.998662
request                 0.999283   0.999403  0.996426  0.997912
offer                   0.999898   1.000000  0.976190  0.987952
aid_related             0.998771   0.999261  0.997785  0.998522
medical_help            0.999488   0.999343  0.994114  0.996721
medical_products        0.999744   0.998968  0.995885  0.997424
search_and_rescue       0.999846   0.998158  0.996324  0.997240
security                0.999641   0.997059  0.982609  0.989781
military                0.999898   0.998454  0.998454  0.998454
water                   0.999898   1.000000  0.998431  0.999215
food                    0.999846   1.000000  0.998639  0.999319
shelter                 0.999898   0.999425  0.999425  0.999425
clothing                0.999949   0.996656  1.000000  0.998325
money                   0.999949   0.997753  1.000000  0.998875
missing_people          0.999949   1.000

In [52]:
# evaluate the test set

y_test_predictions = pipeline.predict(x_test)

print(evaulation(np.array(y_test), y_test_predictions, names))

# accuracy values are all relatively high, but precision, recall, and 
# F1 all have room for improvement; some categories (e.g. offer,
# security) score 0 on all three despite an accuracy above 0.98

                        Accuracy  Precision    Recall  F1 Score
related                 0.822960   0.840681  0.946912  0.890640
request                 0.896266   0.849445  0.480287  0.613623
offer                   0.994775   0.000000  0.000000  0.000000
aid_related             0.784386   0.762525  0.707023  0.733726
medical_help            0.919318   0.788462  0.073874  0.135091
medical_products        0.950054   0.785714  0.064516  0.119241
search_and_rescue       0.973260   0.714286  0.055556  0.103093
security                0.980483   0.000000  0.000000  0.000000
military                0.968342   0.818182  0.042254  0.080357
water                   0.955894   0.916667  0.304786  0.457467
food                    0.946980   0.832442  0.650418  0.730258
shelter                 0.936069   0.804598  0.365217  0.502392
clothing                0.985400   0.772727  0.158879  0.263566
money                   0.976179   1.000000  0.031250  0.060606
missing_people          0.988474   1.000

In [54]:
# export the fitted model as a pickle file

# pickle.dumps('classifier.pkl') would only serialize the file-name
# string and write nothing to disk; dump the fitted pipeline to the file
# NOTE: pickle stores functions by reference, so `tokenize` must be
# defined or importable wherever this file is loaded
model_filepath = 'classifier.pkl'
with open(model_filepath, 'wb') as model_file:
    pickle.dump(pipeline, model_file)