# ML Pipeline Preparation and Exploration

### 1. Import libraries and load data from database.

In [126]:
# import libraries
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
import time

#load nltk libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

#load sklearn text transformation libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#load sklearn ML libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# load data from database
engine = create_engine('sqlite:///DisasterMessages.db')
df = pd.read_sql('SELECT * FROM DisasterMessages',engine)
X = df['message'].values #sklearn requires algorithm to be in numpy array format
Y = df.iloc[:,4:].values #sklearn requires algorithm to be in numpy array format

In [21]:
# create small sample to optimise code
df_sample = df.sample(1000)
X_sample = df_sample['message'].values
Y_sample = df_sample.iloc[:,4:].values

In [117]:
df.head(1)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Define a tokenization function to process the text data

In [6]:
def tokenize(text):
    #remove punctuation
    pattern = '[^A-Za-z0-9]'
    text = re.sub(pattern, ' ', text)
    
    #convert to lowercase
    text = text.lower().strip()
    
    #tokenize
    words = word_tokenize(text)
    
    #remove stopwords (common words that don't add much meaning)
    stop_words = stopwords.words('english')
    words = [word for word in words if word not in stop_words]
    
    #lemmatize nouns (convert words to their roots)
    words = [WordNetLemmatizer().lemmatize(word, pos='n') for word in words]
    
    #lemmatize verbs
    words = [WordNetLemmatizer().lemmatize(word, pos='v') for word in words]
    
    #stem words (reduce words to their stem)
    words = [PorterStemmer().stem(word) for word in words]
    
    return words

### 3. Build machine learning pipeline

In [7]:
#create pipeline
pipeline = Pipeline([
    ('vect',CountVectorizer(tokenizer = tokenize)),
    ('tfidf',TfidfTransformer()),
    ('clf',MultiOutputClassifier(RandomForestClassifier()))    
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [26]:
# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_sample, Y_sample, test_size=0.33)

# fit pipeline
model = pipeline.fit(X_train,y_train)

### 5. Test model

In [11]:
# predict test set
y_pred = model.predict(X_test)

# create function to iterate through columns and display classification report
def display_results(y_test,y_pred):
    
    number_of_columns = y_test.shape[1]
    for i in range(number_of_columns):
        print(df.columns[4+i].capitalize())
        print(classification_report(y_test[:,i],y_pred[:,i]))
        
display_results(y_test,y_pred)

Related
              precision    recall  f1-score   support

           0       0.64      0.09      0.16        74
           1       0.77      0.98      0.87       251
           2       0.00      0.00      0.00         5

    accuracy                           0.77       330
   macro avg       0.47      0.36      0.34       330
weighted avg       0.73      0.77      0.70       330

Request
              precision    recall  f1-score   support

           0       0.90      0.99      0.95       283
           1       0.89      0.36      0.52        47

    accuracy                           0.90       330
   macro avg       0.90      0.68      0.73       330
weighted avg       0.90      0.90      0.88       330

Offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       329
           1       0.00      0.00      0.00         1

    accuracy                           1.00       330
   macro avg       0.50      0.50      0.50       33

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

### 6. Improve your model - use grid search 

In [12]:
# random forest grid search paramters
parameters = {
    'clf__estimator__criterion': ['gini','entropy'],
    'clf__estimator__max_features': ['sqrt','log2']
}

In [13]:
# initialise and fit grid search
cv = GridSearchCV(pipeline, param_grid=parameters)

model_gs = cv.fit(X_train,y_train)

In [14]:
# show best model parameters
model_gs.best_params_

{'clf__estimator__criterion': 'entropy',
 'clf__estimator__max_features': 'log2'}

### 7. Test model

In [171]:
# predict on test data and display results
y_pred_gs = cv.predict(X_test)

number_of_columns = y_test.shape[1]
for i in range(number_of_columns):
    print(df.columns[4+i].capitalize())
    print(classification_report(y_test[:,i],y_pred_gs[:,i]))

Related
              precision    recall  f1-score   support

           0       0.68      0.28      0.40        82
           1       0.79      0.96      0.87       246
           2       0.00      0.00      0.00         2

    accuracy                           0.78       330
   macro avg       0.49      0.41      0.42       330
weighted avg       0.76      0.78      0.74       330

Request
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       269
           1       0.81      0.28      0.41        61

    accuracy                           0.85       330
   macro avg       0.83      0.63      0.67       330
weighted avg       0.85      0.85      0.82       330

Offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       327
           1       0.00      0.00      0.00         3

    accuracy                           0.99       330
   macro avg       0.50      0.50      0.50       33

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

### 8. Improve model

In [70]:
# define function to automate testing different classification models
def build_model(X, y, model, params = {}, test_size = 0.33):
    """
    This function takes in feature and target data, a classification algorithm to test and
    a dictionary of parameters for grid search to iterate through.
    It returns the average accuracy over the 36 categories.
    """
    start = time.time()
    
    pipeline = Pipeline([
        ('vect',CountVectorizer(tokenizer = tokenize)),
        ('tfidf',TfidfTransformer()),
        ('clf',MultiOutputClassifier(model()))    
    ])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    cv = GridSearchCV(pipeline, param_grid=params)
    
    model = cv.fit(X_train,y_train)
    
    y_pred = model.predict(X_test)
    
    accuracy = 0
    for i in range(36):
        accuracy += accuracy_score(y_test[:,i],y_pred[:,i])
    
    avg_accuracy = accuracy/36
    
    display_results(y_test,y_pred)
    
    end = time.time()
    run_time = end-start
    print(f'time taken: {run_time}')
    
    return model, avg_accuracy, run_time

In [74]:
rf_test, rf_score, rf_run_time = build_model(X_sample,Y_sample, model = RandomForestClassifier, params = parameters)

Related
              precision    recall  f1-score   support

           0       0.62      0.15      0.24        67
           1       0.81      0.98      0.89       261
           2       0.00      0.00      0.00         2

    accuracy                           0.80       330
   macro avg       0.48      0.38      0.38       330
weighted avg       0.77      0.80      0.75       330

Request
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       271
           1       0.79      0.39      0.52        59

    accuracy                           0.87       330
   macro avg       0.84      0.68      0.72       330
weighted avg       0.86      0.87      0.85       330

Offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       329
           1       0.00      0.00      0.00         1

    accuracy                           1.00       330
   macro avg       0.50      0.50      0.50       33

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [50]:
# install and import xgboost
!pip install xgboost
from xgboost import XGBClassifier

Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 6.1 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1


In [71]:
# test xgboost classifier
xgb_test, xgb_score, xgb_run_time = build_model(X_sample,Y_sample, model = XGBClassifier)

Related
              precision    recall  f1-score   support

           0       0.40      0.23      0.29        73
           1       0.79      0.90      0.84       254
           2       0.00      0.00      0.00         3

    accuracy                           0.74       330
   macro avg       0.40      0.38      0.38       330
weighted avg       0.70      0.74      0.71       330

Request
              precision    recall  f1-score   support

           0       0.87      0.96      0.91       256
           1       0.80      0.50      0.62        74

    accuracy                           0.86       330
   macro avg       0.84      0.73      0.77       330
weighted avg       0.86      0.86      0.85       330

Offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       330

    accuracy                           1.00       330
   macro avg       1.00      1.00      1.00       330
weighted avg       1.00      1.00      1.00       33

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [112]:
class FirstWordIsVerb(BaseEstimator, TransformerMixin):
    """
    Define custom transformer that returns True if a body of text starts with a verb
    else it returns false
    """
    def starting_verb(self,text):
        pos_tags = pos_tag(tokenize(text))
        first_word, first_tag = pos_tags[0]
        if first_tag in ['VB','VBP']:
            return True
        return False
    
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [113]:
def build_model_2(X, y, model, params = {}, test_size = 0.33):
     """
    This function takes in feature and target data, a classification algorithm to test and
    a dictionary of parameters for grid search to iterate through.
    It returns the average accuracy over the 36 categories.
    It differs from the above function because it adds an extra feature - whether the first word
    is a verb.
    """
    start = time.time()
    pipeline = Pipeline([
        ('features', FeatureUnion([
                    ('text_pipeline',Pipeline([
                                        ('vect',CountVectorizer(tokenizer = tokenize)),
                                        ('tfidf',TfidfTransformer())    
                                            ])
                    ),
                     ('first_word_is_verb', FirstWordIsVerb())
                                        ])),
         ('clf',MultiOutputClassifier(model()))
    ])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    cv = GridSearchCV(pipeline, param_grid=params)
    
    model = cv.fit(X_train,y_train)
    
    y_pred = model.predict(X_test)
    
    accuracy = 0
    for i in range(36):
        accuracy += accuracy_score(y_test[:,i],y_pred[:,i])
    
    avg_accuracy = accuracy/36
    
    display_results(y_test,y_pred)
    
    end = time.time()
    run_time = end-start
    print(f'time taken: {run_time}')
    
    return model, avg_accuracy, run_time

In [121]:
# train xgboost model on new pipeline
xgb_model_2, xgb_model_2_score, xgb_model_2_run_time = build_model_2(X_train,Y_train, model = XGBClassifier)

Related
              precision    recall  f1-score   support

           0       0.61      0.37      0.46        76
           1       0.82      0.93      0.87       250
           2       0.00      0.00      0.00         4

    accuracy                           0.79       330
   macro avg       0.48      0.43      0.44       330
weighted avg       0.76      0.79      0.77       330

Request
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       281
           1       0.60      0.49      0.54        49

    accuracy                           0.88       330
   macro avg       0.76      0.72      0.73       330
weighted avg       0.87      0.88      0.87       330

Offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       330

    accuracy                           1.00       330
   macro avg       1.00      1.00      1.00       330
weighted avg       1.00      1.00      1.00       33

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [122]:
xgb_model_2_score, xgb_model_1_score

(0.9441077441077442, 0.9384680134680133)

#### Comment
- New pipeline scores higher than old pipeline.

In [131]:
# use Randomized search to optimise xgboost parameters within the new pipeline
parameters = {'clf__estimator__learning_rate': [0.2, 0.3, 0.4],
              'clf__estimator__subsample': [0.8,0.9,1],
              'clf__estimator__min_child_weight': [1,2,3]
              }

pipeline = Pipeline([
        ('features', FeatureUnion([
                    ('text_pipeline',Pipeline([
                                        ('vect',CountVectorizer(tokenizer = tokenize)),
                                        ('tfidf',TfidfTransformer())    
                                            ])
                    ),
                     ('first_word_is_verb', FirstWordIsVerb())
                                        ])),
         ('clf',MultiOutputClassifier(XGBClassifier()))
    ])
    
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

cv = RandomizedSearchCV(pipeline, param_distributions=parameters)

model = cv.fit(X_train,y_train)

In [132]:
# define best model parameters
model.best_params_

{'clf__estimator__subsample': 1,
 'clf__estimator__min_child_weight': 1,
 'clf__estimator__learning_rate': 0.4}

In [133]:
# fit xgboost model according to best parameters
pipeline = Pipeline([
        ('features', FeatureUnion([
                    ('text_pipeline',Pipeline([
                                        ('vect',CountVectorizer(tokenizer = tokenize)),
                                        ('tfidf',TfidfTransformer())    
                                            ])
                    ),
                     ('first_word_is_verb', FirstWordIsVerb())
                                        ])),
         ('clf',MultiOutputClassifier(XGBClassifier(learning_rate=0.4, min_child_weight=1, subsample=1)))
    ])
    
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

model = pipeline.fit(X_train,y_train)

In [135]:
# test accuracy
y_pred = model.predict(X_test)

accuracy = 0
for i in range(36):
    accuracy += accuracy_score(y_test[:,i],y_pred[:,i])
    
avg_accuracy = accuracy/36

print(avg_accuracy)

0.9505541429085116


#### Comment
- Use the new pipeline, using an xgboost model and optimised parameters as final model.