# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [2]:
# import libraries
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.pipeline import Pipeline
import pandas as pd
from sqlalchemy import create_engine
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /Users/ethango/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ethango/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
data/DisasterResponse.db

In [3]:
# load data from database
engine = create_engine('sqlite:///InsertDatabaseName.db')
df = pd.read_sql_table('df',engine)
X = df.message.values
Y = df.iloc[:, 4:].values

In [4]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2. Tokenization function

In [5]:
def tokenize(text):
    token = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in token:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### 3. Building a machine learning pipeline

In [6]:
pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])

### 4. Train pipeline

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
     X, Y, test_size=0.33, random_state=42)

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

### 5. Testing Model

In [8]:
# Must convert to dataframe first
y_pred = pipeline.predict(X_test)
y_pred = pd.DataFrame(y_pred, columns = df.columns[4:])
y_test = pd.DataFrame(y_test, columns = df.columns[4:])

for i, var in enumerate(df.columns[4:]):
    print("For label " + var + " this is the result.")
    print(classification_report(y_test.iloc[:,i], 
                                y_pred.iloc[:,i]))

For label related this is the result.
              precision    recall  f1-score   support

         0.0       0.38      0.02      0.05      2042
         1.0       0.76      0.99      0.86      6546
         2.0       0.10      0.02      0.03        61

    accuracy                           0.75      8649
   macro avg       0.41      0.34      0.31      8649
weighted avg       0.66      0.75      0.66      8649

For label request this is the result.
              precision    recall  f1-score   support

         0.0       0.83      0.99      0.91      7194
         1.0       0.39      0.02      0.04      1455

    accuracy                           0.83      8649
   macro avg       0.61      0.51      0.47      8649
weighted avg       0.76      0.83      0.76      8649

For label offer this is the result.
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8615
         1.0       0.00      0.00      0.00        34

    accuracy    

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      8542
         1.0       0.00      0.00      0.00       107

    accuracy                           0.99      8649
   macro avg       0.49      0.50      0.50      8649
weighted avg       0.98      0.99      0.98      8649

For label other_infrastructure this is the result.
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      8263
         1.0       0.00      0.00      0.00       386

    accuracy                           0.95      8649
   macro avg       0.48      0.50      0.49      8649
weighted avg       0.91      0.95      0.93      8649

For label weather_related this is the result.
              precision    recall  f1-score   support

         0.0       0.75      0.97      0.84      6256
         1.0       0.64      0.13      0.22      2393

    accuracy                           0.74      8649
   macro avg       0.69      0.

### 6. Improve the model

In [9]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x7fbea85e0950>,
                   vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                          ccp_alpha=0.0,
                                                          class_weight=None,
                                                          criterion='gini',
                                                          max_depth=No

In [10]:

parameters = {
    'tfidf__smooth_idf': [False, True],
    'clf__estimator__n_estimators': [1,10,20,30],
    'clf__estimator__n_jobs': [1,2,3],
    
}

cv = GridSearchCV(pipeline, param_grid = parameters)

### 7. Testing the model

In [11]:
cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        p

In [12]:
y_pred = cv.predict(X_test)
y_pred = pd.DataFrame(y_pred, columns = df.columns[4:])
for i, var in enumerate(df.columns[4:]):
    print("For label " + var + " this is the result.")
    print(classification_report(y_test.iloc[:,i], 
                                y_pred.iloc[:,i]))

For label related this is the result.
              precision    recall  f1-score   support

         0.0       0.34      0.04      0.07      2042
         1.0       0.76      0.98      0.85      6546
         2.0       0.00      0.00      0.00        61

    accuracy                           0.75      8649
   macro avg       0.37      0.34      0.31      8649
weighted avg       0.65      0.75      0.66      8649

For label request this is the result.
              precision    recall  f1-score   support

         0.0       0.84      0.99      0.91      7194
         1.0       0.47      0.05      0.09      1455

    accuracy                           0.83      8649
   macro avg       0.65      0.52      0.50      8649
weighted avg       0.78      0.83      0.77      8649

For label offer this is the result.
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8615
         1.0       0.00      0.00      0.00        34

    accuracy    

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      8452
         1.0       0.14      0.01      0.01       197

    accuracy                           0.98      8649
   macro avg       0.56      0.50      0.50      8649
weighted avg       0.96      0.98      0.97      8649

For label missing_people this is the result.
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      8544
         1.0       0.00      0.00      0.00       105

    accuracy                           0.99      8649
   macro avg       0.49      0.50      0.50      8649
weighted avg       0.98      0.99      0.98      8649

For label refugees this is the result.
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      8319
         1.0       0.00      0.00      0.00       330

    accuracy                           0.96      8649
   macro avg       0.48      0.50      0.49 

In [14]:
cv.best_params_

{'clf__estimator__n_estimators': 30,
 'clf__estimator__n_jobs': 2,
 'tfidf__smooth_idf': True}

In [15]:
cv.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   