# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [None]:
# import libraries


import pandas as pd
import numpy as np
from sqlalchemy import create_engine

## Tokenizing
import re
import nltk
nltk.download('stopwords') 
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
import pickle

In [None]:
engine = create_engine('sqlite:///DisasterResponseDatabase.db') # database
df = pd.read_sql("SELECT * FROM DisasterResponseTable", engine) # dataframe
X = df['message'] # X data
# To get the Y data, list all the column names and keep every column after the first four
colnames = df.columns.tolist() 
Ycolnames = colnames[4:] 
category_names = Ycolnames
y = df[Ycolnames]

In [None]:
# Count the messages in each genre, then display the genre labels
genre_counts = df.groupby('genre').count()['message']
genre_names = list(genre_counts.index)
genre_names

### 2. Write a tokenization function to process your text data

In [None]:
def tokenize(text):
    """
    Turn a raw message into a list of cleaned tokens.

    Input: text (str) - the raw message
    Output: a list of tokens that are lower-cased, stripped of punctuation,
            filtered of English stop words and lemmatized as verbs
    """

    # Normalize text: lower-case it and replace every non-alphanumeric
    # character with a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Split the normalized text into words
    words = word_tokenize(text)

    # Fetch the stop-word list once and keep it as a set: evaluating
    # stopwords.words() inside the filter would repeat the call, and a
    # linear scan of the list, for every single token.
    stop_words = set(stopwords.words("english"))

    lemmatizer = WordNetLemmatizer()

    # Drop stop words, lemmatize what is left (treating each word as a verb)
    # and trim any leading/trailing white space
    return [
        lemmatizer.lemmatize(word, pos='v').strip()
        for word in words
        if word not in stop_words
    ]

In [None]:
# Sanity check: tokenize the first message and print the raw text next to its tokens
for message in X[0:1]:
    tokens = tokenize(message)
    print(message)
    print(tokens, '\n')
    

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [None]:
# Three-step pipeline: token counts (using the custom tokenizer), TF-IDF
# weighting, then a random forest wrapped in MultiOutputClassifier so that
# all target columns are predicted at once.
pipeline = Pipeline([
                    ('vect', CountVectorizer(tokenizer=tokenize)),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
                    ])

In [None]:
pipeline.get_params() # First, view the model parameters

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [None]:
# Fixed seed so the train/test partition, and therefore every score computed
# from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)  # split data
pipeline.fit(X_train, y_train)  # train classifier

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [None]:
# Predict every category for the held-out messages
y_pred = pipeline.predict(X_test)

In [None]:
column_names = y_test.columns.tolist()

# Share of correct predictions, computed separately for every category
accuracy = (y_pred == y_test).mean()

print('Total average Accuracy is:', accuracy.mean())
print('Accuracy of each group: \n',  accuracy, sep='')

# Walk the categories together with the matching column of the prediction
# matrix (transposing puts one predicted column on each row).
for col, predicted in zip(column_names, y_pred.T):
    print('Results for column: ', col)
    print(classification_report(y_test[col], predicted))
    


Although the accuracy is high, the precision, recall, and f1-score are very low. This might be due to imbalanced data (far more 0 values than 1 values). We can use class_weight='balanced' in the classifier.

### 6. Improve your model
Use grid search to find better parameters. 

In [None]:
# Only a small number of parameter values is searched because of the high computational time!
parameters = {
        'clf__estimator__class_weight': ['balanced'],
        'clf__estimator__min_samples_split': [4],
        'clf__estimator__n_estimators':[10, 25]
        }

cv =  GridSearchCV(pipeline, param_grid=parameters)
# Display the parameter names exposed by the search object
cv.get_params().keys()

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [None]:
# Run the grid search on the training split
cv.fit(X_train, y_train)


In [16]:
def evaluate_model(model, X_test, y_test, category_names):
    """ Score a fitted model on the test data, one category at a time
    Input: (fitted model: model), (test messages: X_test),
            (test labels, indexable by category name: y_test),
            (list of category names, in the column order of the predictions: category_names)
    Output: dataframe with one row per category holding accuracy, precision,
            recall and f1; the averages and the full table are also printed
    """

    predictions = model.predict(X_test)

    # One (accuracy, precision, recall, f1) tuple per category; column i of
    # the prediction matrix is compared with the i-th category name.
    per_category = []
    for position, name in enumerate(category_names):
        actual = y_test[name]
        predicted = predictions[:, position]
        per_category.append((
            (actual == predicted).mean(),
            precision_score(actual, predicted),
            recall_score(actual, predicted),
            f1_score(actual, predicted),
        ))

    scores = pd.DataFrame({
        'Category': category_names,
        'Accuracy': [row[0] for row in per_category],
        'Precision': [row[1] for row in per_category],
        'Recall': [row[2] for row in per_category],
        'f1': [row[3] for row in per_category],
    })

    # Report the mean of every metric, then the full table
    for label, column in (('Average Accuracy is:', 'Accuracy'),
                          ('Average precision is:', 'Precision'),
                          ('Average recall is:', 'Recall'),
                          ('Average f1-score is:', 'f1')):
        print(label, scores[column].mean())
    print(scores)

    return scores

# Score the tuned random-forest grid search on the held-out split
evaluate_model(cv, X_test, y_test, category_names)


Average Accuracy is: 0.946183355857
Average precision is: 0.550376565996
Average recall is: 0.246378361349
Average f1-score is: 0.300539295648
                  Category  Accuracy  Precision    Recall        f1
0                  related  0.819957   0.850859  0.925055  0.886407
1                  request  0.895026   0.738562  0.602131  0.663405
2                    offer  0.996033   0.000000  0.000000  0.000000
3              aid_related  0.770369   0.708011  0.749349  0.728094
4             medical_help  0.922490   0.539535  0.220952  0.313514
5         medical_products  0.954379   0.733333  0.134557  0.227390
6        search_and_rescue  0.972383   0.500000  0.044199  0.081218
7                 security  0.982148   0.250000  0.008696  0.016807
8                 military  0.969179   0.561644  0.194313  0.288732
9                    water  0.954226   0.807692  0.392523  0.528302
10                    food  0.942173   0.814558  0.633423  0.712661
11                 shelter  0.940189   0.

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,Category,Accuracy,Precision,Recall,f1
0,related,0.819957,0.850859,0.925055,0.886407
1,request,0.895026,0.738562,0.602131,0.663405
2,offer,0.996033,0.0,0.0,0.0
3,aid_related,0.770369,0.708011,0.749349,0.728094
4,medical_help,0.92249,0.539535,0.220952,0.313514
5,medical_products,0.954379,0.733333,0.134557,0.22739
6,search_and_rescue,0.972383,0.5,0.044199,0.081218
7,security,0.982148,0.25,0.008696,0.016807
8,military,0.969179,0.561644,0.194313,0.288732
9,water,0.954226,0.807692,0.392523,0.528302


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [47]:
# Same text features as before, with a decision tree as the classifier
pipeline_DT = Pipeline([
                    ('vect', CountVectorizer(tokenizer=tokenize)),
                    ('tfidf', TfidfTransformer()),
                    ('clf_DT', DecisionTreeClassifier())
                    ])

# pipeline_DT.get_params() # view the model parameters

# Grid of decision-tree settings to search over (keys use the 'clf_DT' step name)
parameters = {
        'clf_DT__class_weight': ['balanced'],
        'clf_DT__criterion': ['gini', 'entropy'],
        'clf_DT__min_samples_split':[2, 5]}


cv_DT =  GridSearchCV(pipeline_DT, param_grid=parameters)

# Display the parameter names exposed by the search object
cv_DT.get_params().keys()


dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__vect', 'estimator__tfidf', 'estimator__clf_DT', 'estimator__vect__analyzer', 'estimator__vect__binary', 'estimator__vect__decode_error', 'estimator__vect__dtype', 'estimator__vect__encoding', 'estimator__vect__input', 'estimator__vect__lowercase', 'estimator__vect__max_df', 'estimator__vect__max_features', 'estimator__vect__min_df', 'estimator__vect__ngram_range', 'estimator__vect__preprocessor', 'estimator__vect__stop_words', 'estimator__vect__strip_accents', 'estimator__vect__token_pattern', 'estimator__vect__tokenizer', 'estimator__vect__vocabulary', 'estimator__tfidf__norm', 'estimator__tfidf__smooth_idf', 'estimator__tfidf__sublinear_tf', 'estimator__tfidf__use_idf', 'estimator__clf_DT__class_weight', 'estimator__clf_DT__criterion', 'estimator__clf_DT__max_depth', 'estimator__clf_DT__max_features', 'estimator__clf_DT__max_leaf_nodes', 'estimator__clf_DT__min_impurity_decrease', 'estimator__clf_DT_

In [48]:
# Run the decision-tree grid search on the training split
cv_DT.fit(X_train, y_train)

# Score the tuned decision tree on the held-out split
evaluate_model(cv_DT, X_test, y_test, category_names)

Average Accuracy is: 0.917764505863
Average precision is: 0.275816171006
Average recall is: 0.227219070103
Average f1-score is: 0.247374777635
                  Category  Accuracy  Precision    Recall        f1
0                  related  0.700183   0.837820  0.750452  0.791733
1                  request  0.837656   0.528336  0.513321  0.520721
2                    offer  0.992524   0.000000  0.000000  0.000000
3              aid_related  0.675313   0.622865  0.528821  0.572003
4             medical_help  0.879921   0.185096  0.146667  0.163656
5         medical_products  0.927678   0.204819  0.155963  0.177083
6        search_and_rescue  0.957431   0.139706  0.104972  0.119874
7                 security  0.974519   0.117647  0.069565  0.087432
8                 military  0.956363   0.218045  0.137441  0.168605
9                    water  0.943088   0.564103  0.565421  0.564761
10                    food  0.921422   0.656552  0.641509  0.648943
11                 shelter  0.900671   0.

Unnamed: 0,Category,Accuracy,Precision,Recall,f1
0,related,0.700183,0.83782,0.750452,0.791733
1,request,0.837656,0.528336,0.513321,0.520721
2,offer,0.992524,0.0,0.0,0.0
3,aid_related,0.675313,0.622865,0.528821,0.572003
4,medical_help,0.879921,0.185096,0.146667,0.163656
5,medical_products,0.927678,0.204819,0.155963,0.177083
6,search_and_rescue,0.957431,0.139706,0.104972,0.119874
7,security,0.974519,0.117647,0.069565,0.087432
8,military,0.956363,0.218045,0.137441,0.168605
9,water,0.943088,0.564103,0.565421,0.564761


### 9. Export your model as a pickle file

In [49]:
 # Persist `cv` (the random-forest grid search, not `cv_DT`) to disk.
 # NOTE(review): this cell starts with a stray one-space indent; presumably the
 # notebook front end tolerates it, but a plain .py script would not — confirm.
 with open('classifier.pkl', 'wb') as f:
        pickle.dump(cv, f) 

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

# 11. Refactoring

In [None]:
def load_data(database_filepath):
    """ Read the DisasterResponseTable and split it into features and targets
    Input: database_filepath - the value handed to create_engine
    Output: X (the 'message' column), y (every column after the first four),
            and the list of y's column names
    """
    connection = create_engine(database_filepath)
    table = pd.read_sql("SELECT * FROM DisasterResponseTable", connection)

    # Every column after the first four is treated as a target category
    target_names = table.columns.tolist()[4:]

    return table['message'], table[target_names], target_names
        


def tokenize(text):
    """
    Turn a raw message into a list of cleaned tokens.

    Input: text (str) - the raw message
    Output: a list of tokens that are lower-cased, stripped of punctuation,
            filtered of English stop words and lemmatized as verbs
    """

    # Normalize text: lower-case it and replace every non-alphanumeric
    # character with a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Split the normalized text into words
    words = word_tokenize(text)

    # Fetch the stop-word list once and keep it as a set: evaluating
    # stopwords.words() inside the filter would repeat the call, and a
    # linear scan of the list, for every single token.
    stop_words = set(stopwords.words("english"))

    lemmatizer = WordNetLemmatizer()

    # Drop stop words, lemmatize what is left (treating each word as a verb)
    # and trim any leading/trailing white space
    return [
        lemmatizer.lemmatize(word, pos='v').strip()
        for word in words
        if word not in stop_words
    ]



def build_model(pipeline_num=1):
    """ Assemble the text-classification pipeline
    Input: pipeline_num (1: RandomForestClassifier), (any other value: DecisionTreeClassifier)
    Output: unfitted pipeline; its parameters are printed as a side effect
    """

    # Only the final step and the banner text differ between the two variants
    if pipeline_num == 1:
        step_name = 'clf'
        base_estimator = RandomForestClassifier(random_state=42,
                                                class_weight='balanced',
                                                min_samples_split=4,
                                                n_estimators=25)
        banner = '\n Pipelie parameters are: \n'
    else:
        step_name = 'clf_DT'
        base_estimator = DecisionTreeClassifier()
        banner = '\nPipelie parameters are: \n'

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        (step_name, MultiOutputClassifier(base_estimator)),
    ])

    print(banner, pipeline.get_params())  # view the model parameters

    return pipeline



def cal_score(y_test, y_pred):
    """ Scoring criterion for the grid search
    Input: true labels and predicted labels, one column per category
    Output: plain average of the per-column f1 scores
    """
    n_columns = np.shape(y_pred)[1]
    truth = np.array(y_test)  # allows positional column access on the true labels

    per_column_f1 = [f1_score(truth[:, j], y_pred[:, j]) for j in range(n_columns)]

    return sum(per_column_f1)/len(per_column_f1)
    
    
def build_model_gridSearch(pipeline):
    """ Function to wrap a pipeline in a grid search
    Input: pipeline - must contain a step named 'clf' that wraps an estimator,
           because the grid below addresses 'clf__estimator__*' parameters
           (build_model(pipeline_num=1) produces such a pipeline).
           Edit the grid in the body to search other values.
    Output: unfitted grid search object, scored with cal_score
    """

    parameters = {
                'clf__estimator__class_weight': ['balanced'],
                'clf__estimator__min_samples_split': [2, 5, 8],
                'clf__estimator__n_estimators':[10, 25, 50]
                }

    # Candidates are ranked by cal_score, wrapped as a scikit-learn scorer
    scoring = make_scorer(cal_score)
    cv =  GridSearchCV(pipeline, param_grid=parameters, verbose=1, scoring=scoring)

    return cv



def evaluate_model(model, X_test, y_test, category_names):
    """ Compute per-category test metrics for a fitted model
    Input: (fitted model: model), (test messages: X_test),
            (test labels, indexable by category name: y_test),
            (list of category names, in the column order of the predictions: category_names)
    Output: dataframe with one row per category and the columns
            Category, Accuracy, Precision, Recall and f1
    """

    predicted = model.predict(X_test)

    # Columns of the result table, filled one category at a time
    table = {'Category': category_names,
             'Accuracy': [],
             'Precision': [],
             'Recall': [],
             'f1': []}

    for idx, category in enumerate(category_names):
        # Column idx of the predictions belongs to the idx-th category name
        truth, guess = y_test[category], predicted[:, idx]
        table['Accuracy'].append((truth == guess).mean())
        table['Precision'].append(precision_score(truth, guess))
        table['Recall'].append(recall_score(truth, guess))
        table['f1'].append(f1_score(truth, guess))

    return pd.DataFrame(table)


def display_results(model, scores):
    """ Print a summary of a fitted grid search and its test scores
    Input: (fitted search object with best_params_ and cv_results_: model),
           (dataframe with Accuracy, Precision, Recall and f1 columns: scores)
    Output: nothing is returned; the averages, the score table, the best
            parameters and the raw search results are printed
    """
    print('------------------- Results for the best model with X_test -------------------')

    # Label printed in front of the mean of each score column
    averages = (
        ('Average Accuracy is:', 'Accuracy'),
        ('Average precision is:', 'Precision'),
        ('Average recall is:', 'Recall'),
        ('Average f1-score is:', 'f1'),
    )
    for label, column in averages:
        print(label, scores[column].mean())
    print(scores)

    print('\n .... Best parameters for the model are:\n')
    for name, value in model.best_params_.items():
        print('\t' + name +': ', value)

    print('\n ..... General model information:')
    print(model.cv_results_)

    
           
def save_model(model, model_filepath):
    """ Pickle a model to disk
    Input: (object to store: model), (destination file path: model_filepath)
    Output: nothing is returned; the file is created or overwritten
    """
    with open(model_filepath, 'wb') as handle:
        pickle.dump(model, handle)


        
        
        
        

import sys
database_filepath = 'sqlite:///DisasterResponseDatabase.db'
print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, y, category_names = load_data(database_filepath)

# Only the first 100 rows are used (presumably to keep this trial run short);
# remove these two lines to train on the full dataset.
X = X.head(100)
y = y.head(100)

# Fixed seed so repeated runs train and evaluate on the same rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_name = 'RandomForest'
gridsearch = 1  # 1: tune with grid search; any other value: fit the plain pipeline


if gridsearch==1:

    print('\n Building model', model_name, ' with grid search ....\n')
    model= build_model(pipeline_num=1)

    cv = build_model_gridSearch(model)

    print('\n Training model', model_name, ' with grid search ....\n')
    cv.fit(X_train, y_train)

    print('\n Evaluating model', model_name, 'with grid search.... \n')
    scores = evaluate_model(cv, X_test, y_test, category_names)

    # The whole search object is pickled, not just the best estimator
    print('\n Saving model', model_name, '....\n')
    save_model(cv, 'classifier.pkl')

    display_results(cv, scores)

    print('\n\n .... Trained model saved!')


else:

    print('\n Building model', model_name, '....\n')
    model= build_model(pipeline_num=1)
    print('\n Training model', model_name, '....\n')
    model.fit(X_train, y_train)
    print('\n Evaluating model', model_name, '.... \n')
    scores = evaluate_model(model, X_test, y_test, category_names)
    print(scores)
    print('\n Saving model', model_name, '....\n')
    save_model(model, 'classifier.pkl')
    print('\n\n .... Trained model saved!')

        


In [None]:
def read_from_pickle(path):
    """ Yield every object stored back-to-back in a pickle file
    Input: path of the pickle file
    Output: generator producing the unpickled objects in file order; it ends
            when the end of the file is reached
    """
    # NOTE: unpickling can execute arbitrary code - only open files you trust.
    with open(path, 'rb') as file:
        while True:
            # Keep the try limited to the load itself, so that only running
            # out of input ends the generator; an EOFError raised by the
            # consumer at the yield is no longer swallowed.
            try:
                item = pickle.load(file)
            except EOFError:
                return
            yield item

In [None]:
# Print the repr of every object stored in the saved model file
for item in read_from_pickle('classifier.pkl'):
        print(repr(item))
