# ML Pipeline Preparation
### 1. Import libraries and load data from database.

In [13]:
# import libraries
import pandas as pd
import numpy as np

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from sqlalchemy import create_engine

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
import signal
 
from contextlib import contextmanager
 
import requests
 
 
DELAY = INTERVAL = 4 * 60  # interval time in seconds
MIN_DELAY = MIN_INTERVAL = 2 * 60
KEEPALIVE_URL = "https://nebula.udacity.com/api/v1/remote/keep-alive"
TOKEN_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/keep_alive_token"
TOKEN_HEADERS = {"Metadata-Flavor":"Google"}
 
 
def _request_handler(headers):
    def _handler(signum, frame):
        requests.request("POST", KEEPALIVE_URL, headers=headers)
    return _handler
 
 
@contextmanager
def active_session(delay=DELAY, interval=INTERVAL):
    """
    Example:
 
    from workspace_utils import active_session
 
    with active_session():
        # do long-running work here
    """
    token = requests.request("GET", TOKEN_URL, headers=TOKEN_HEADERS).text
    headers = {'Authorization': "STAR " + token}
    delay = max(delay, MIN_DELAY)
    interval = max(interval, MIN_INTERVAL)
    original_handler = signal.getsignal(signal.SIGALRM)
    try:
        signal.signal(signal.SIGALRM, _request_handler(headers))
        signal.setitimer(signal.ITIMER_REAL, delay, interval)
        yield
    finally:
        signal.signal(signal.SIGALRM, original_handler)
        signal.setitimer(signal.ITIMER_REAL, 0)
 
 
def keep_awake(iterable, delay=DELAY, interval=INTERVAL):
    """
    Example:
 
    from workspace_utils import keep_awake
 
    for i in keep_awake(range(5)):
        # do iteration with lots of work here
    """
    with active_session(delay, interval): yield from iterable

In [3]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('tDisasterResponse', engine)
X = df['message']
Y = df[df.columns[4:]]

In [4]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Write a tokenization function to process text data

In [5]:
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for token in tokens:
        cleaned_token = lemmatizer.lemmatize(token).lower().strip()
        clean_tokens.append(cleaned_token)
    return clean_tokens

### 3. Build a machine learning pipeline

In [6]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator = RandomForestClassifier(n_jobs = -1)))
])

### 4. Train pipeline

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)

In [8]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))])

### 5. Test model

In [9]:
y_pred = pipeline.predict(X_test)

In [10]:
categories = list(df.columns[4:])

In [11]:
for category in range(len(categories)):
    print("Classification report for: ", categories[category])
    print(classification_report(y_test.iloc[:, category].values, y_pred[:, category]))

Classification report for:  related
             precision    recall  f1-score   support

          0       0.60      0.37      0.46      1526
          1       0.82      0.93      0.87      4974
          2       0.73      0.30      0.42        54

avg / total       0.77      0.79      0.77      6554

Classification report for:  request
             precision    recall  f1-score   support

          0       0.89      0.98      0.93      5467
          1       0.80      0.40      0.53      1087

avg / total       0.88      0.88      0.87      6554

Classification report for:  offer
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      6532
          1       0.00      0.00      0.00        22

avg / total       0.99      1.00      0.99      6554

Classification report for:  aid_related
             precision    recall  f1-score   support

          0       0.74      0.87      0.80      3893
          1       0.75      0.55      0.64      2

  'precision', 'predicted', average, warn_for)


### 6. Improve model

In [14]:
parameters = {
    'clf__estimator__max_depth': [5, 10, 20],
    'clf__estimator__min_samples_leaf': [2, 5, 10],
    'clf__estimator__min_samples_split': [8, 10, 20],
    'clf__estimator__n_estimators': [100, 200]
}

cv = GridSearchCV(pipeline, parameters)

In [13]:
with active_session():
    new_mod = cv.fit(X_train, y_train)

In [15]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__bootstrap', 'clf__estimator__class_weight', 'clf__estimator__criterion', 'clf__estimator__max_depth', 'clf__estimator__max_features', 'clf__estimator__max_leaf_nodes', 'clf__estimator__min_impurity_decrease', 'clf__estimator__min_impurity_split', 'clf__estimator__min_samples_leaf', 'clf__estimator__min_samples_split', 'clf__estimator__min_weight_fraction_leaf', 'clf__estimator__n_estimators', 'clf__estimator__n_jobs', 'clf__estimator__oob_score', 'clf__estimator__random_state', 'clf__estimator__verbose', 'clf__estimator__

### 7. Test model

In [16]:
print(cv.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [17]:
y_pred= new_mod.predict(X_test)

NameError: name 'new_mod' is not defined

In [25]:
for category in range(len(categories)):
    print("Classification report for: ", categories[category])
    print(classification_report(y_test.iloc[:, category].values, y_pred[:, category]))

Classification report for:  related
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1574
          1       0.75      1.00      0.86      4934
          2       0.00      0.00      0.00        46

avg / total       0.57      0.75      0.65      6554

Classification report for:  request
             precision    recall  f1-score   support

          0       0.83      1.00      0.91      5444
          1       0.00      0.00      0.00      1110

avg / total       0.69      0.83      0.75      6554

Classification report for:  offer
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      6526
          1       0.00      0.00      0.00        28

avg / total       0.99      1.00      0.99      6554

Classification report for:  aid_related
             precision    recall  f1-score   support

          0       0.59      1.00      0.74      3869
          1       0.96      0.01      0.02      2

  'precision', 'predicted', average, warn_for)


### 9. Export model as a pickle file

In [None]:
filename = "MLpipeline.sav"
pickle.dump(pipeline, open(filename, 'wb'))