# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# !pip install --upgrade pip
# !pip install xgboost

In [46]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

import time
import re
import pickle
import string


import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, TweetTokenizer

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix, make_scorer, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/dima806/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dima806/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dima806/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
ls -lotr *db

-rw-rw-r-- 1 dima806 6438912 Nov  4 19:33 InsertDatabaseName.db
-rw-rw-r-- 1 dima806 6795264 Nov  4 19:34 DisasterResponse.db
-rw-rw-r-- 1 dima806 6438912 Nov 16 16:35 DisasterMessagesDatabase.db


In [5]:
! ls -lotr workspace/data/*db

-rw-r--r-- 1 dima806 6438912 Nov 18 07:02 workspace/data/DisasterMessagesDatabase.db


In [6]:
# load data from database
engine = create_engine('sqlite:///DisasterMessagesDatabase.db')
df = pd.read_sql_table('DisasterMessagesDatabase', engine)
print(df.shape)
X = df.message.values
y = df.drop(['id', 'message', 'original', 'genre'], axis=1).values
#y = df[['related', 'request']].values

(26015, 40)


In [7]:
df.shape, X.shape, y.shape

((26015, 40), (26015,), (26015, 36))

In [8]:
df.genre.unique()

array(['direct', 'social', 'news'], dtype=object)

In [9]:
df.head(2)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,True,False,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False


In [10]:
genre_counts = df.groupby('genre').count()['message']
genre_counts

genre
direct    10696
news      12960
social     2359
Name: message, dtype: int64

In [11]:
genre_names = list(genre_counts.index)
genre_names

['direct', 'news', 'social']

In [12]:
df.genre.unique()

array(['direct', 'social', 'news'], dtype=object)

In [13]:
df.drop(['id', 'message', 'original', 'genre'], axis=1).columns.tolist()

['related',
 'request',
 'offer',
 'aid_related',
 'medical_help',
 'medical_products',
 'search_and_rescue',
 'security',
 'military',
 'child_alone',
 'water',
 'food',
 'shelter',
 'clothing',
 'money',
 'missing_people',
 'refugees',
 'death',
 'other_aid',
 'infrastructure_related',
 'transport',
 'buildings',
 'electricity',
 'tools',
 'hospitals',
 'shops',
 'aid_centers',
 'other_infrastructure',
 'weather_related',
 'floods',
 'storm',
 'fire',
 'earthquake',
 'cold',
 'other_weather',
 'direct_report']

### 2. Write a tokenization function to process your text data

In [14]:
# Stop words: union of the NLTK and scikit-learn English lists.
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
# Punctuation-only tokens to discard: every ASCII punctuation character plus a few
# extra marks. The closing curly quote used to be listed twice, which left the
# opening curly quote unfiltered; both quotes are listed now. "-" is already part
# of string.punctuation and is kept only so the list length stays the same.
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”"]
len(STOPLIST), len(SYMBOLS)

(378, 36)

In [15]:
def tokenize_stem(text):
    """Tokenize a message and reduce every kept token to its Snowball stem.

    A token is dropped when either its surface form or its stem is a stop word
    (STOPLIST) or a punctuation token (SYMBOLS). Checking the surface form matters
    because stemming can turn a stop word into a string that is no longer in the
    stop list (e.g. "because" -> "becaus"), which a stem-only check lets through.

    Args:
        text (str): raw message text.
    Returns:
        clean_tokens (list of str): lower-cased, stemmed tokens.
    """
    tokens = TweetTokenizer().tokenize(text.lower())
    stemmer = SnowballStemmer('english') # better than PorterStemmer(), see http://www.nltk.org/howto/stem.html

    clean_tokens = []
    for tok in tokens:
        # Filter on the unstemmed token first so stop words are caught reliably.
        if tok in STOPLIST or tok in SYMBOLS:
            continue
        clean_tok = stemmer.stem(tok).strip()
        # Keep the stem-level check as well: a stem may itself be a stop word.
        if clean_tok in STOPLIST or clean_tok in SYMBOLS:
            continue
        clean_tokens.append(clean_tok)

    return clean_tokens

In [16]:
def tokenize_lemma(text):
    """Tokenize a message and lemmatize its tokens.

    The text is lower-cased, split with TweetTokenizer, and each token is
    lemmatized with WordNetLemmatizer. Lemmas found in STOPLIST or SYMBOLS
    are discarded.

    Args:
        text (str): raw message text.
    Returns:
        list of str: the remaining lemmas, in their original order.
    """
    lemmatizer = WordNetLemmatizer()

    kept = []
    for raw_token in TweetTokenizer().tokenize(text.lower()):
        lemma = lemmatizer.lemmatize(raw_token).strip()
        if lemma in STOPLIST or lemma in SYMBOLS:
            continue
        kept.append(lemma)

    return kept

In [17]:
df['message'].apply(lambda x: len(tokenize_lemma(x))).sum()

346837

In [18]:
df.loc[3,'message'], tokenize_lemma(df.loc[3,'message'])

('UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.',
 ['report',
  'leogane',
  '80-90',
  'destroyed',
  'hospital',
  'st',
  'croix',
  'functioning',
  'need',
  'supply',
  'desperately'])

### 3. Build a machine learning pipeline
- You'll find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [None]:
# def scoring_func(y, y_pred):
#     '''
#     Returns mean accuracy score
    
#     Args:
#         y (np.array of floats):
#         y_pred (np.array of floats):
#     Returns:
#         score (float): score
#     '''
#     return (y == y_pred).mean()

In [19]:
def scoring_func(y, y_pred):
    '''
    Returns the mean of the per-column weighted F-scores (used as the objective
    of the CV search).

    Args:
        y (np.array): true labels, one column per output (indexed as y[:, i])
        y_pred (np.array): predicted labels with the same column layout
    Returns:
        score (float): weighted F1 averaged over all columns of y_pred
    '''
    per_label_f1 = []
    for col in range(y_pred.shape[1]):
        per_label_f1.append(f1_score(y[:, col], y_pred[:, col], average='weighted'))

    # Alternative that was considered: f1_score(y, y_pred, average='micro')
    return np.mean(per_label_f1)

In [20]:
# def display_results(y_test, y_test_pred, y_train, y_train_pred):
#     test_labels = np.unique(y_test_pred)
#     test_accuracy = scoring_func(y_test, y_test_pred)
#     train_labels = np.unique(y_train_pred)
#     train_accuracy = scoring_func(y_train, y_train_pred)

#     print("Test labels:", test_labels)
#     print("Test accuracy:", test_accuracy)
#     print("Train labels:", train_labels)
#     print("Train accuracy:", train_accuracy)

We start with a [Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html#naive-bayes) classifier, which [provides a nice baseline](https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568) for topic classification:

In [23]:
def make_pipeline():
    """Build the baseline pipeline: token counts -> TF-IDF -> one MultinomialNB per output.

    Returns:
        Pipeline: an unfitted scikit-learn pipeline with steps 'vect', 'tfidf' and 'clf'.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize_lemma, stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(MultinomialNB())),
    ]
    return Pipeline(steps)

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [24]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
seed = 83
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

In [25]:
pipeline = make_pipeline()

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [26]:
%%time
pipeline.fit(X_train, y_train)

CPU times: user 4.8 s, sys: 15.5 ms, total: 4.81 s
Wall time: 4.82 s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...ssifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
           n_jobs=1))])

In [27]:
%%time
y_test_pred = pipeline.predict(X_test)

CPU times: user 1.1 s, sys: 3.92 ms, total: 1.1 s
Wall time: 1.1 s


In [28]:
print(classification_report(y_test[:,0], y_test_pred[:,0]))

             precision    recall  f1-score   support

      False       0.22      0.00      0.00      1214
       True       0.77      1.00      0.87      3989

avg / total       0.64      0.77      0.67      5203



In [29]:
best_score = scoring_func(y_test, y_test_pred)
best_score

0.895266497966738

### 6. Improve your model
Use grid search to find better parameters. 

In [32]:
# Hyper-parameter search space. Keys use scikit-learn's `<step>__<parameter>` naming;
# the classifier sits inside MultiOutputClassifier, hence the additional `estimator__`
# level (add here __estimator as in https://goo.gl/bDiZKM).
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__max_df': [0.125, 0.25, 0.5, 1.0],
    'vect__min_df': [1, 2, 5, 10],
    'vect__max_features': [None, 5000, 10000, 20000],
    'tfidf__use_idf': [True, False],
    'clf__estimator__alpha': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0],
}

n_iter = 100  # number of random picks

# 3-fold randomized search scored with the mean weighted F1 defined in scoring_func.
search_options = dict(
    param_distributions=parameters,
    n_iter=n_iter,
    cv=3,
    verbose=2,
    scoring=make_scorer(scoring_func),
    random_state=83,
)
cv = RandomizedSearchCV(pipeline, **search_options)

In [33]:
cv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__max_features=5000, vect__max_df=1.0, tfidf__use_idf=True, clf__estimator__alpha=1.0 
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__max_features=5000, vect__max_df=1.0, tfidf__use_idf=True, clf__estimator__alpha=1.0, total=   4.3s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__max_features=5000, vect__max_df=1.0, tfidf__use_idf=True, clf__estimator__alpha=1.0 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__max_features=5000, vect__max_df=1.0, tfidf__use_idf=True, clf__estimator__alpha=1.0, total=   4.2s
[CV] vect__ngram_range=(1, 1), vect__min_df=1, vect__max_features=5000, vect__max_df=1.0, tfidf__use_idf=True, clf__estimator__alpha=1.0 
[CV]  vect__ngram_range=(1, 1), vect__min_df=1, vect__max_features=5000, vect__max_df=1.0, tfidf__use_idf=True, clf__estimator__alpha=1.0, total=   4.2s
[CV] vect__ngram_range=(1, 1), vect__min_df=2, vect__max_features=5000, vect__max_df=0.125, tfidf__use_idf=False, clf__estimator__alpha=0.05 
[CV]  vect__ngram_range=(1, 1), vect__min_df=2, vect__max_features=5000, vect__max_df=0.125, tfidf__use_idf=False, clf__estimator__alpha=0.05, total=   4.2s
[CV] vect__ngram_range=(1, 1), vect__min_df=2, vect__max_features=5000, vect__max_df=0.125, tfidf__use_idf=False, clf__estimator__alpha=0.05 
[CV]  vect__ngram_range=(1, 1), vect__min_df=2, vect__max_features=5000, vect__max_df=0.125, tfidf__use_idf=False, 

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed: 41.0min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...ssifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
           n_jobs=1))]),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'vect__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vect__max_df': [0.125, 0.25, 0.5, 1.0], 'vect__min_df': [1, 2, 5, 10], 'vect__max_features': [None, 5000, 10000, 20000], 'tfidf__use_idf': [True, False], 'clf__estimator__alpha': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0]},
          pre_dispatch='2*n_jobs', random_state=83, refit=True,
          return_train_score='warn', scoring=make_scorer(scoring_func),
       

In [34]:
cv.best_score_, best_score

(0.9034911089642884, 0.895266497966738)

In [35]:
# Keep the tuned pipeline only if it beats the baseline score.
# NOTE(review): cv.best_score_ is a cross-validated score on the training data, while
# best_score was initially measured on the held-out test set, so the two numbers are
# not strictly comparable.
if cv.best_score_ > best_score:
    best_score = cv.best_score_
    best_model = cv.best_estimator_
elif 'best_model' not in globals():
    # The search did not improve on the baseline and nothing was selected yet:
    # fall back to the fitted baseline pipeline so that `best_model` is always
    # defined (previously this path raised a NameError on the next line).
    best_model = pipeline
best_model

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=20000, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
       ...sifier(estimator=MultinomialNB(alpha=0.02, class_prior=None, fit_prior=True),
           n_jobs=1))])

In [36]:
cv.best_params_

{'clf__estimator__alpha': 0.02,
 'tfidf__use_idf': False,
 'vect__max_df': 0.5,
 'vect__max_features': 20000,
 'vect__min_df': 5,
 'vect__ngram_range': (1, 2)}

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [37]:
%%time
y_test_pred = best_model.predict(X_test)

CPU times: user 1.2 s, sys: 3 µs, total: 1.2 s
Wall time: 1.22 s


In [39]:
# Target column names, computed once instead of rebuilding the reduced frame on
# every loop iteration.
category_names = df.drop(['id', 'message', 'original', 'genre'], axis=1).columns
for i in range(y_test.shape[1]):
    print('>>>', i, category_names[i])
    print(classification_report(y_test[:,i], y_test_pred[:,i]))

>>> 0 related
             precision    recall  f1-score   support

      False       0.35      0.11      0.17      1214
       True       0.78      0.94      0.85      3989

avg / total       0.68      0.75      0.69      5203

>>> 1 request
             precision    recall  f1-score   support

      False       0.87      0.88      0.88      4306
       True       0.41      0.39      0.40       897

avg / total       0.79      0.80      0.80      5203

>>> 2 offer
             precision    recall  f1-score   support

      False       1.00      1.00      1.00      5178
       True       0.00      0.00      0.00        25

avg / total       0.99      0.99      0.99      5203

>>> 3 aid_related
             precision    recall  f1-score   support

      False       0.61      0.75      0.67      3080
       True       0.45      0.29      0.35      2123

avg / total       0.54      0.56      0.54      5203

>>> 4 medical_help
             precision    recall  f1-score   support

      Fal

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

We tried `XGBClassifier()` instead of Naive Bayes and added some simple text statistics as extra features, inspired by [this scikit-learn example](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer.html#sphx-glr-auto-examples-compose-plot-column-transformer-py):

In [47]:
class TextStats(BaseEstimator, TransformerMixin):
    """Stateless transformer producing simple size statistics per document.

    The result is a list of dicts, the input format consumed by DictVectorizer.
    """

    def fit(self, x, y=None):
        # Nothing is learned from the data; fit exists only for the estimator interface.
        return self

    def transform(self, posts):
        """Return one dict per text: character count, number of '.' characters
        and number of whitespace-separated words."""
        stats = []
        for text in posts:
            stats.append({
                'length': len(text),
                'num_sentences': text.count('.'),  # periods serve as a sentence proxy
                'num_words': len(text.split()),
            })
        return stats

In [63]:
def make_new_pipeline():
    """Build the extended pipeline: TF-IDF features joined with text statistics, fed to XGBoost.

    Returns:
        Pipeline: an unfitted pipeline with a 'features' FeatureUnion (branches
        'text' and 'body_stats') followed by a 'clf' MultiOutputClassifier
        wrapping XGBClassifier.
    """
    # Branch 1: bag of words re-weighted by TF-IDF.
    bag_of_words = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize_lemma, stop_words='english')),
        ('tfidf', TfidfTransformer()),
    ])

    # Branch 2: per-message statistics (list of dicts) turned into a feature matrix.
    text_statistics = Pipeline([
        ('stats', TextStats()),
        ('vect', DictVectorizer()),
    ])

    features = FeatureUnion([
        ('text', bag_of_words),
        ('body_stats', text_statistics),
    ])

    return Pipeline([
        ('features', features),
        ('clf', MultiOutputClassifier(XGBClassifier())),
    ])

In [64]:
new_pipeline = make_new_pipeline()

In [65]:
%%time
new_pipeline.fit(X_train, y_train)

CPU times: user 2min 23s, sys: 2.65 s, total: 2min 26s
Wall time: 2min 27s


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, ma...eg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
           n_jobs=1))])

In [66]:
y_test_pred_new = new_pipeline.predict(X_test)

In [67]:
scoring_func(y_test, y_test_pred_new)

0.8965308036416216

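There is no significant improvement (the XGBoost pipeline scores about 0.897 on the test set versus about 0.902 for the tuned Naive Bayes model), so we continue with the tuned Naive Bayes model.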

### 9. Export your model as a pickle file

In [68]:
# Persist the selected model and immediately reload it as a sanity check.
# Context managers guarantee the file is flushed and closed before it is read back.
filename = 'finalized_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# The reloaded model should reproduce the test score of best_model.
result = scoring_func(y_test, loaded_model.predict(X_test))
print(result)

0.9023774314044368


### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [69]:
%%writefile train_classifier.py

import sys

import pandas as pd
import numpy as np
from sqlalchemy import create_engine

import time
import re
import pickle
import string


import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, TweetTokenizer

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix, make_scorer, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

def load_data(database_filepath, table_name='DisasterMessagesDatabase'):
    """Load messages and their category labels from a SQLite database.

    Args:
        database_filepath (str): path to the SQLite database file.
        table_name (str): table to read; defaults to the table name used so far.
    Returns:
        X (np.array): values of the 'message' column.
        y (np.array): values of all columns except 'id', 'message', 'original'
            and 'genre'.
        category_names (list of str): names of the columns in y, in the same order.
    """
    engine = create_engine('sqlite:///'+database_filepath)
    df = pd.read_sql_table(table_name, engine)
    X = df.message.values
    # Build the label frame once and derive both the values and the names from it.
    categories = df.drop(['id', 'message', 'original', 'genre'], axis=1)
    y = categories.values
    category_names = categories.columns.tolist()
    return X, y, category_names

# Stop words: union of the NLTK and scikit-learn English lists.
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
# Punctuation-only tokens to discard: every ASCII punctuation character plus a few
# extra marks. The closing curly quote used to be listed twice, which left the
# opening curly quote unfiltered; both quotes are listed now.
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”"]

def tokenize(text):
    """Split a message into lemmatized tokens, without stop words and punctuation.

    Args:
        text (str): raw message text.
    Returns:
        list of str: lower-cased lemmas that are neither in STOPLIST nor in SYMBOLS.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = (lemmatize(word).strip()
              for word in TweetTokenizer().tokenize(text.lower()))
    return [lemma for lemma in lemmas
            if lemma not in STOPLIST and lemma not in SYMBOLS]


def build_model():
    """Assemble the classification pipeline with fixed hyper-parameters.

    Steps: 'vect' (uni- and bigram counts using `tokenize`), 'tfidf' (term-frequency
    normalisation with IDF disabled) and 'clf' (one MultinomialNB per output).

    Returns:
        Pipeline: the unfitted scikit-learn pipeline.
    """
    vectorizer = CountVectorizer(
        tokenizer=tokenize,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.5,
        max_features=20000,
    )
    term_frequencies = TfidfTransformer(use_idf=False)
    classifier = MultiOutputClassifier(MultinomialNB(alpha=0.02))

    return Pipeline([
        ('vect', vectorizer),
        ('tfidf', term_frequencies),
        ('clf', classifier),
    ])

def evaluate_model(model, X_test, y_test, category_names):
    """Print a classification report for every output category.

    Args:
        model: fitted estimator exposing `predict`.
        X_test: inputs passed to `model.predict`.
        y_test: true labels, indexed column-wise as y_test[:, i].
        category_names: iterable of labels, one per column of y_test.
    """
    predictions = model.predict(X_test)
    for column, label in enumerate(category_names):
        print('>>>', label)
        print(classification_report(y_test[:, column], predictions[:, column]))

def save_model(model, model_filepath):
    """Serialize `model` to `model_filepath` with pickle.

    Args:
        model: any picklable object (here: the trained pipeline).
        model_filepath (str): destination path; an existing file is overwritten.
    """
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(model_filepath, 'wb') as model_file:
        pickle.dump(model, model_file)


def main():
    """Command-line entry point: load data, train, evaluate and save the model.

    Expects exactly two command-line arguments: the path of the SQLite database
    and the path of the pickle file to write. With any other number of arguments
    a usage message is printed and nothing else happens.
    """
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        # Fixed seed so the split, and therefore the reported metrics, are
        # reproducible from one run to the next.
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=83)

        print('Building model...')
        model = build_model()

        print('Training model...')
        model.fit(X_train, Y_train)

        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '
              'as the first argument and the filepath of the pickle file to '
              'save the model to as the second argument. \n\nExample: python '
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


if __name__ == '__main__':
    main()

Overwriting train_classifier.py


In [70]:
ls -lotr train_classifier.py

-rw-r--r-- 1 dima806 4076 Nov 18 10:42 train_classifier.py
