# ML Pipeline

### 1. Import libraries and load data from database.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Fetch the NLTK resources named here ('punkt' and 'wordnet').
# Presumably these back word_tokenize and WordNetLemmatizer imported above --
# confirm against the installed NLTK version.
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Just
[nltk_data]     Me\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load data from the database.
# The SQLite URL uses a relative path, so DisasterResponse.db is looked up
# relative to the current working directory.
engine = create_engine('sqlite:///DisasterResponse.db')
# Read the whole "disaster_messages" table into a DataFrame.
df = pd.read_sql_table("disaster_messages", con=engine)

In [3]:
# Load the disaster_messages table with a plain SQL query and display it.
# The assignment is kept so that `df` used by later cells is unchanged; the
# trailing expression is what actually renders the preview in the notebook.
df = pd.read_sql("SELECT * from disaster_messages",con=engine)
df.head()

In [4]:
# Feature: the text in the 'message' column.
X = df['message']
# Targets: every column from position 4 onward.
# NOTE(review): assumes the first four columns are non-label fields --
# confirm against the disaster_messages table schema.
y = df.iloc[:, 4:]

### 2. Write a tokenization function to process your text data

In [5]:
# Tokenization function
def tokenize(disaster_text):

    """
    Function to tokenize text.

    The text is split into word tokens; each token is lower-cased and
    stripped of surrounding whitespace, then lemmatized.

    Normalization happens *before* lemmatization because the WordNet
    lemmatizer matches case-sensitively: a capitalised inflected form
    (e.g. "Houses") would otherwise pass through unchanged and only be
    lower-cased afterwards, never reduced to its lemma.

    Args:
        disaster_text (str): Raw message text.

    Returns:
        list of str: Lower-cased, lemmatized tokens in their original order.
    """

    lemmatizer = WordNetLemmatizer()

    return [
        lemmatizer.lemmatize(tok.lower().strip())
        for tok in word_tokenize(disaster_text)
    ]

### 3. Build a machine learning pipeline

In [6]:
# Building an ML pipeline
pipeline = Pipeline([
    # Bag-of-words counts, using the custom tokenize() function as tokenizer.
    ('vect', CountVectorizer(tokenizer=tokenize)),
    # Re-weight the raw counts with TF-IDF.
    ('tfidf', TfidfTransformer()),
    # Multi-output wrapper around a RandomForestClassifier with default settings.
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline

In [7]:
# Splitting data into train and test sets.
# A fixed random_state keeps the split identical on every run, so the
# baseline and tuned models are always evaluated on the same test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit every step of the pipeline (vect, tfidf, clf) on the training split
pipeline.fit(X_train, y_train)



### 5. Test your model

In [None]:
# Predict labels for the held-out test messages with the fitted pipeline
y_pred = pipeline.predict(X_test)

In [None]:
# Testing the model
def test_model(y_test, y_pred):

    """
    Function to iterate through columns and call sklearn classification report on each.

    Args:
        y_test (pandas.DataFrame): True labels, one column per category.
        y_pred (array-like): Predicted labels whose columns are in the same
            order as ``y_test``. May be a NumPy array (as returned by
            ``predict``) or a DataFrame.

    Returns:
        None. One report per column is printed to standard output.
    """
    # Positional column slicing below requires an ndarray. np.asarray is a
    # no-op for arrays and also accepts a DataFrame of predictions.
    predictions = np.asarray(y_pred)

    for index, column in enumerate(y_test):
        print(column, classification_report(y_test[column], predictions[:, index]))

In [None]:
# Print a per-column classification report for the baseline pipeline's
# test-set predictions
test_model(y_test, y_pred)

In [None]:
# NOTE(review): same call with the same arguments as the cell before it;
# it prints the identical reports again.
test_model(y_test, y_pred)

### 6. Improve your model

In [None]:
# List the pipeline's parameters; the keys shown here are the names a
# grid-search parameter dictionary has to use
pipeline.get_params()

In [None]:
# specifying parameters for grid search
parameters = {
    # Candidate n_estimators values for the estimator wrapped by the 'clf' step.
    'clf__estimator__n_estimators' : [50, 150]
}

In [None]:
# creating grid search object
# Only param_grid is given, so every other GridSearchCV setting keeps its default.
cv = GridSearchCV(pipeline, param_grid=parameters)

# Echo the configured (not yet fitted) search object.
cv

In [None]:
# Run the grid search on the training split
cv.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the grid search
cv.best_params_

### 7. Test your model

In [None]:
# Predict on the test split with the fitted grid-search object.
# This rebinds y_pred, replacing the baseline pipeline's predictions.
y_pred = cv.predict(X_test)

In [None]:
# Print a per-column classification report for the grid-search predictions
test_model(y_test, y_pred)

In [None]:
# Displaying the accuracy score.
# Compares predictions with the true labels element-wise and takes .mean().
# NOTE(review): with y_test as a DataFrame this presumably yields one
# accuracy value per category column rather than a single number -- confirm.
accuracy = (y_pred == y_test).mean()
accuracy

### 8. Try improving your model further.

In [None]:
from custom_transformer import StartingVerbExtractor
from sklearn import multioutput

# Try adding another feature: the TF-IDF text features are combined with the
# output of StartingVerbExtractor before classification.

upd_pipeline = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        # Bag-of-words counts followed by TF-IDF weighting.
        ('text_pipeline', Pipeline(steps=[
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
        ])),
        # Custom transformer imported from custom_transformer.
        ('starting_verb', StartingVerbExtractor()),
    ])),
    ('clf', multioutput.MultiOutputClassifier(RandomForestClassifier())),
])

# Train the extended pipeline; the classifier is a random forest, not an SVM.
upd_pipeline.fit(X_train, y_train)


In [None]:
# Predict on the test split with the extended pipeline
y_pred_upd = upd_pipeline.predict (X_test)
# Wrap the predictions in a DataFrame labelled with y_test's column names
y_pred_upd = pd.DataFrame (y_pred_upd, columns = y_test.columns)

### 9. Export your model as a pickle file

In [None]:
# Serialize the grid-search object (cv) to disk.
filename = 'classifier.pkl'
# A context manager guarantees the file is flushed and closed even if
# pickling fails, instead of leaving the handle to garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(cv, model_file)