In [42]:
import re
import sys
import time
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords', 'omw-1.4'])
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /Users/jwang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jwang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jwang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/jwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/jwang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql('SELECT * FROM jwang_disaster_pipeline', engine)
X = df.message.values
y = df.drop(['id', 'message'], axis=1).values

In [12]:
def tokenize(text):
    tokens = word_tokenize(re.sub(r"[^a-zA-Z0-9]", " ", text))
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(w, pos='v').lower().strip() for w in tokens if w not in stopwords.words("english")]

    return clean_tokens


In [35]:
pipeline = Pipeline(
        [
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ('clf', MultiOutputClassifier(RandomForestClassifier()))
        ]
    )
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x134db7dc0>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x134db7dc0>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_i

In [36]:
parameters = {
    'clf__estimator__n_estimators': [100],
    'clf__min_samples_split': [2]
}

In [37]:
cv = GridSearchCV(pipeline, param_grid=parameters)

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)
cv.fit(X_train, Y_train)
predicted = cv.predict(X_test)
print(predicted)

[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [44]:
start = time.time()
print(classification_report(Y_test, predicted))
print(predicted)
end = time.time()
exec_time = end - start
print("The execution time is: {}".format(exec_time))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89      4021
           1       0.84      0.51      0.63       933
           2       0.00      0.00      0.00        23
           3       0.76      0.70      0.73      2180
           4       0.65      0.09      0.16       395
           5       0.77      0.06      0.12       271
           6       0.79      0.12      0.20       130
           7       0.00      0.00      0.00        86
           8       0.70      0.04      0.08       156
           9       0.00      0.00      0.00         0
          10       0.93      0.40      0.56       356
          11       0.82      0.62      0.71       563
          12       0.81      0.37      0.51       479
          13       0.89      0.10      0.17        84
          14       1.00      0.03      0.05       111
          15       1.00      0.03      0.06        64
          16       0.38      0.02      0.04       152
          17       0.83    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
