In [38]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fergus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Fergus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [39]:
# load data from database
database_name = "myDisasterDatabase.db"
output_model_file = "disaster_model"
engine_name = 'sqlite:///' + database_name
engine = create_engine(engine_name)
df =pd.read_sql("SELECT * FROM messages_table", engine)
X = df['message']
Y = df.drop(['id','message','original','genre'], axis=1)

###  Functions

In [40]:
def tokenize(text):
    """
    returns a tokenized, lemmatized and normalized version of text

    Args:
        text (str): input text to be tokenized, lemmatized and normalized 
    
    Returns:
        Tokenized, lemmatized and normalized version of text
    """
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [41]:
def print_score(y_actual, y_pred, measure):
    """
    Creates a pretty print of the results of sklearns classification report comparing y_actual and y_pred

    Args:
        y_actual (dataframe): expected values
        y_pred (dataframe): predicted values
        measure (str): choice of measure ('weighted avg','micro avg','macro avg' )
    
 
    """
    print("\t\tWeighted Average Scores Over Each Output Class\n")
    print("\t\tPrecision\tRecall\t\tF1_Score")
    for column_name, column in y_actual.iteritems():
        report  = classification_report(y_actual[column_name], y_pred[column_name], output_dict=True )
        prec = report[measure]['precision']
        recall =  report[measure]['recall']
        f1 = report[measure]['f1-score']
        print("%20.2f %15.2f % 15.2f" % (prec, recall, f1) + "\t\t" + column_name )

### Machine learning pipeline


In [42]:
#Choosing a straighforward single tree model to make training tractable in terms of time
DTC = DecisionTreeClassifier(random_state = 11)

pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=DTC))
    ])

In [43]:
#Split the input data
X_train, X_test, y_train, y_test = train_test_split(X, Y)

#Check check that dataframes are of expected size
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(19635,)
(6545,)
(19635, 36)
(6545, 36)


In [44]:
y_test.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
800,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14787,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
6430,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24843,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16219,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [45]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...on_leaf=0.0, presort=False, random_state=11,
            splitter='best'),
           n_jobs=None))])

### Test model

In [46]:
#Make predictions with the model
y_pred = pipeline.predict(X_test)
#convert numpy output to dataframe and add columns
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.columns = y_test.columns

#Convert predictions and correct y values to float for faciliate comparison
y_pred_df = y_pred_df.astype('float64')
y_test = y_test.astype('float64')
y_pred_df.head(5)

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
print_score(y_test, y_pred_df, 'weighted avg')

		Weighted Average Scores Over Each Output Class

		Precision	Recall		F1_Score
                0.74            0.74            0.74		related
                0.85            0.86            0.85		request
                0.99            0.99            0.99		offer
                0.70            0.70            0.70		aid_related
                0.89            0.89            0.89		medical_help
                0.94            0.94            0.94		medical_products
                0.96            0.96            0.96		search_and_rescue
                0.97            0.97            0.97		security
                0.96            0.96            0.96		military
                1.00            1.00            1.00		child_alone
                0.95            0.95            0.95		water
                0.93            0.94            0.93		food
                0.94            0.94            0.94		shelter
                0.98            0.99            0.99		clothing
                0.97     

### Using Gridsearch to optimise


In [48]:
parameters = {'clf__estimator__criterion': ["gini", "entropy"],
              'clf__estimator__splitter': ["best", "random"],
              'clf__estimator__max_depth': randint(3, 6),
              'clf__estimator__min_samples_split': randint(2,6)}

#to faciliate easy experimentation, allow for reduced input data to gridsearch
gridsearch_percent_of_dataset = 20

#calculate reduced sample size according to 'gridsearch_percent_of_dataset'
len_full_dataset = len(X_train)
sample_size = int((len_full_dataset/100)*gridsearch_percent_of_dataset)

grid_obj = RandomizedSearchCV(pipeline,parameters,n_iter=5, cv=5 )
grid_obj.fit(X_train[:sample_size], y_train[:sample_size])

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...on_leaf=0.0, presort=False, random_state=11,
            splitter='best'),
           n_jobs=None))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'clf__estimator__criterion': ['gini', 'entropy'], 'clf__estimator__splitter': ['best', 'random'], 'clf__estimator__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000AEF0BCCC0>, 'clf__estimator__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000AEF0BC780>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          ret

###  Retest

In [49]:
#Retest after gridsearch

optimised_classifier = grid_obj.best_estimator_
#Refit the classifier
optimised_classifier.fit(X_train[:sample_size], y_train[:sample_size])
# Make predictions and score using the optimised model
predictions = optimised_classifier.predict(X_test)
predictions = pd.DataFrame(predictions)
predictions.columns = y_test.columns
predictions = predictions.astype('float64')

print_score(y_test, predictions, 'weighted avg')

		Weighted Average Scores Over Each Output Class

		Precision	Recall		F1_Score
                0.71            0.76            0.68		related
                0.85            0.86            0.83		request
                0.99            1.00            0.99		offer
                0.69            0.67            0.63		aid_related
                0.89            0.92            0.89		medical_help
                0.95            0.95            0.94		medical_products
                0.97            0.97            0.97		search_and_rescue
                0.97            0.98            0.97		security
                0.96            0.97            0.96		military
                1.00            1.00            1.00		child_alone
                0.95            0.96            0.95		water
                0.95            0.95            0.95		food
                0.94            0.95            0.94		shelter
                0.99            0.99            0.99		clothing
                0.97     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


                0.98            0.99            0.98		hospitals
                0.99            0.99            0.99		shops
                0.98            0.99            0.99		aid_centers
                0.93            0.96            0.94		other_infrastructure
                0.83            0.81            0.79		weather_related
                0.94            0.94            0.94		floods
                0.93            0.94            0.93		storm
                0.99            0.99            0.99		fire
                0.97            0.97            0.97		earthquake
                0.98            0.98            0.98		cold
                0.93            0.95            0.93		other_weather
                0.83            0.84            0.82		direct_report


### Export to pickle file

In [134]:
pickle.dump(optimised_classifier, open(output_model_file, 'wb'))