## ML Pipeline

What it does...


### Import libraries and load data

In [1]:
# import libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

nltk.download('wordnet')
nltk.download('punkt') 
nltk.download('stopwords') 
# nltk.download('corpus')


from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize, punkt
from nltk.stem import WordNetLemmatizer


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
# from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier


from sklearn.pipeline import Pipeline
from sqlalchemy import create_engine

# a static value to detect hyperlinks
URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
STOP_WORDS = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JakubBelow\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JakubBelow\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JakubBelow\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load datasets
engine = create_engine('sqlite:///DB/disaster_messages.db')
df = pd.read_sql('DB/disaster_messages', con=engine)

In [3]:
df.head(1)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Process data and engineer features

One class does not have any instances in the dataset. It will be removed to 

In [4]:
# remove the 'child_alone' feature that doesn't seem to appear at all in the dataset
features = df.loc[:,'related':].columns.to_list()
features.remove('child_alone')

In [5]:
def tokenize(text):
    """
    Desc: Returns cleaned and lemmatized tokens from a text to be used by an NLP vectorizer
    
        Parameters:
            text (str): a document to be processed (e.g. a twitter message)
        Returns:
            clean_tokens (list[str]): a list of cleaned and lemmatized word tokens
    """
    # 
    urls = re.findall(URL_REGEX, text)
    
    for url in urls:
        text = text.replace(url, '<url>')
        
    #
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    
    #
    clean_tokens = [lemmatizer.lemmatize(tok).lower().strip() for tok in tokens]
    clean_tokens = [tok for tok in clean_tokens if tok not in STOP_WORDS]
        
    return clean_tokens

In [6]:
# define X and y
X = df['message']
y = df[features]

In [7]:
len(features)

35

### Build the pipeline

In [8]:
# 
def model_pipeline():
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=MultinomialNB()))
    ])
    
    return pipeline

pipeline = model_pipeline()

### Train the pipeline

In [9]:
%%time
# split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

# train classifier
pipeline.fit(X_train, y_train)

CPU times: total: 3.34 s
Wall time: 7.95 s


In [10]:
# w/ all features -> CPU times: total: 16min 33s; Wall time: 37min 4s
# w/o child alone -> CPU times: total: 17min 50s; Wall time: 35min 35s  -- WTF!!
# w/o stop words -> CPU times: total: 17min 42sWall time: 41min 38s
# model 1 (RandomForestClassifier()) -> CPU times: total: 17min 42sWall time: 41min 38s
# model 2 (MultinomialNB()) -> CPU times: total: 3.16 s Wall time: 6.23 s
# model 3 (SGDClassifier()) -> CPU times: total: 3.09 s Wall time: 7.25 s
# model 4 (LogisticRegression()) -> CPU times: total: 7.84 s Wall time: 30.1 s

In [90]:
# y_pred = pipeline.predict(X_test)
# pred_df = pd.DataFrame(y_pred, columns=y_test.columns)

In [91]:
# report_df = pd.DataFrame(columns=['precision', 'recall', 'f1-score'])
# report_df

In [92]:
# pred_df.columns
# report_df.index

In [93]:
# # TODO: create a function

# for col in pred_df.columns:
#     scores = classification_report(y_test[col], pred_df[col], output_dict=True)['weighted avg']
#     precision, recall, f1_score, _ = [score for score in scores.values()]
#     report_df.loc[len(report_df)] = [precision, recall, f1_score]

# report_df.index = pred_df.columns
# report_df

In [None]:
('clf', MultiOutputClassifier(estimator=MultinomialNB()))

The f1 score returns a warning for some classes, the reason being lack of thereof in predictions. 

In [11]:
def model_pipeline(model):
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=model))
    ])
    
    return pipeline

# pipeline = model_pipeline()

In [12]:
# In case it's necessary to test m

models = [
    LogisticRegression(max_iter=1000),
    MultinomialNB(),
    SGDClassifier()
]

for model in models:
    pipeline = model_pipeline(model)
    print(f'{model}:\n')
    # train classifier
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    pred_df = pd.DataFrame(y_pred, columns=y_test.columns)
    report_df = pd.DataFrame(columns=['precision', 'recall', 'f1-score'])

    for col in pred_df.columns:
        scores = classification_report(y_test[col], pred_df[col], output_dict=True, zero_division=0)['weighted avg']
        precision, recall, f1_score, _ = [score for score in scores.values()]
        report_df.loc[len(report_df)] = [precision, recall, f1_score]

    report_df.index = pred_df.columns
    report_df
    
    print('success:\n', report_df.mean(), '\n\n')
    

LogisticRegression(max_iter=1000):

success:
 precision    0.937788
recall       0.946578
f1-score     0.935932
dtype: float64 


MultinomialNB():

success:
 precision    0.905353
recall       0.932952
f1-score     0.908464
dtype: float64 


SGDClassifier():

success:
 precision    0.939006
recall       0.948371
f1-score     0.937878
dtype: float64 




SGDClassifier seems to pereform the best. Since we only have binary features, the feature scaling is not a problem here.

There are values that don't appear in the test dataset and/or are not ever predicted. This is caused by the dataset being imbalanced (i.e. there are drastically less instances of a certain class than other classess). One way to combat this issue is to oversample minority classes by creating synthetic examples.

TODO: show counts of predicted classess

TODO: show counts earlier

TODO: consider removing some features that are not even in the dataset - do it all earlier

In [107]:
# # Use grid search to find better parameters.
# parameters = {'clf__estimator__n_estimators': [50, 100],
#               'clf__estimator__max_depth':[30, 50],
#               'clf__estimator__min_samples_split': [3, 4]
#              }

# cv = GridSearchCV(pipeline, param_grid=parameters)

In [20]:
%%time
# Use grid search to find better parameters.
# parameters = {'clf__estimator__n_estimators': [50, 100],
#               'clf__estimator__max_depth':[30, 50],
#               'clf__estimator__min_samples_split': [3, 4]
#              }
# Use grid search to find better parameters.

pipeline = model_pipeline(SGDClassifier())
parameters = {#'clf__estimator__penalty' : ['l1', 'l2', 'elasticnet'],
#               'clf__estimator__C' : np.logspace(-4, 4, 20),
#               'clf__estimator__solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
                'clf__estimator__loss': ['hinge', 'log_loss', 'squared_hinge', 'perceptron']
#               'clf__estimator__max_iter' : [100, 500, 1000]
             }


cv = GridSearchCV(pipeline, param_grid=parameters)
best_clf = cv.fit(X_train, y_train)







CPU times: total: 1min 49s
Wall time: 5min 6s


In [None]:
# {'clf__estimator__penalty': 'l2'}   ->    CPU times: total: 2min 40s           Wall time: 11min
# {'clf__estimator__max_iter': 500}   ->    CPU times: total: 54.6 s             Wall time: 1min 54s


In [19]:
best_clf.best_params_

{'clf__estimator__max_iter': 500}

TODO: dodać liczbę przewidywań klas

The model does not always predict the same class, however XXX out of 35 classes are never predicted for the training data set. This is a potential pain point for the next iteration of the model. Oversampling might possibly help with the issue. Otherwise, a business decision may be made to forfeit these features for the time being altogether.


In [None]:
# Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj = GridSearchCV(pipeline, parameters)

# Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train,y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_ 

In [108]:
# test the model
cv.fit(X_train, y_train)

ValueError: Invalid parameter 'max_depth' for estimator LogisticRegression(max_iter=1000). Valid parameters are: ['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'].

In [None]:
# get the scores in the df
df = pd.Dataframe(grid.cv_results_)

In [195]:

print(classification_report(y_test, pipeline.predict(X_test)))

ValueError: multiclass-multioutput is not supported

In [90]:
pipeline.predict, X_test)

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)