In [20]:
import sys
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import pickle

import nltk
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report

from textblob import TextBlob

In [21]:
rel_database_filepath = '../data/Disaster_response.db'

def load_data(rel_database_filepath):
    engine = create_engine(f'sqlite:///{rel_database_filepath}')
    df = pd.read_sql_query("SELECT * FROM response_message", engine)
    # 'related' columns has value = 2 which is usual ==> replace it with 1
    df['related'] = np.where(df['related']==2, 1,df['related'])  
    return df['message'], df[df.columns[4:]], df.columns[4:]


def tokenize(text):
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    orig_text = text
    text = re.sub('[^A-Za-z0-9]',' ', text)
    text = text.lower()
    tok_texts = word_tokenize(text) 
    text = [stemmer.stem(lemmatizer.lemmatize(w.strip())) for w in tok_texts if w.strip() not in stopwords.words('english')]
    return text


class TextPolarizer(BaseEstimator, TransformerMixin):

    def getPolarity(self, text):
        return TextBlob(text).sentiment.polarity

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.getPolarity)
        return pd.DataFrame(X_tagged)


def build_model():
    pipeline = Pipeline(steps=[
                   ('features', FeatureUnion([
                       (('tfidf', TfidfVectorizer(tokenizer=tokenize))),
                       ('txt_polar', TextPolarizer())
                   ])),
                   ('clf', MultiOutputClassifier(AdaBoostClassifier()))
        ])
    """Example code to show that I can use GridSearch to tune Parameter."""
    # parameters = {'clf__estimator__min_samples_leaf': [1,2], 
    #             'clf__estimator__min_samples_split': [2,3]}

    # model = GridSearchCV(pipeline, param_grid=parameters)
    return pipeline




def evaluate_model(model, X_test, Y_test, category_names):
    print(classification_report(y_pred=model.predict(X_test), y_true=Y_test, target_names=category_names))
    pass


def save_model(model, model_filepath):
    pickle.dump(model, open(model_filepath, 'wb'))
    pass


In [22]:
import joblib

In [23]:
rel_database_filepath, model_filepath = '../data/Disaster_response.db', './clf_model.pkl'
print('Loading data...\n    DATABASE: {}'.format(rel_database_filepath))
X, Y, category_names = load_data(rel_database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# print('Building model...')
# model = build_model()

# # print("checking category name: ", category_names)
model = joblib.load("../models/clf_model.pkl")
# print('Training model...')
# model.fit(X_train, Y_train)

print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)

# print('Saving model...\n    MODEL: {}'.format(model_filepath))
# save_model(model, model_filepath)

# print('Trained model saved!')

Loading data...
    DATABASE: ../data/Disaster_response.db
Evaluating model...
                        precision    recall  f1-score   support

               related       0.96      0.99      0.97      4028
               request       0.98      0.88      0.93       901
                 offer       1.00      0.75      0.86        24
           aid_related       0.96      0.94      0.95      2188
          medical_help       0.99      0.80      0.89       442
      medical_products       1.00      0.84      0.91       293
     search_and_rescue       1.00      0.81      0.89       149
              security       1.00      0.73      0.84        95
              military       0.99      0.82      0.90       163
           child_alone       0.00      0.00      0.00         0
                 water       0.98      0.90      0.94       349
                  food       0.98      0.91      0.94       621
               shelter       0.99      0.89      0.93       449
              clothing  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [29]:
pd.DataFrame(data=model.predict(['Wildfire burns multiple houses in Creed past week']), columns=category_names).head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
