# ML Pipeline

### Imports


In [6]:
import re
import pandas as pd
import numpy as np
from collections import Counter

import nltk
for package in ["punkt", "wordnet", "stopwords"]:
    nltk.download(package)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.multioutput import MultiOutputClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.externals import joblib

import lightgbm as lgbm

from sqlalchemy import create_engine

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kamilkrzyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kamilkrzyk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kamilkrzyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Constants

In [7]:
DATA_INPUT_FILENAME = "Disasters.db"
SQL_TABLE_NAME = "DisastersData"
MODEL_OUTPUT_FILENAME = "model.pkl"
RANDOM_SEED = 1500

### Loading Data

In [8]:
engine = create_engine("sqlite:///{}".format(DATA_INPUT_FILENAME))
df_data = pd.read_sql_table("{}".format(SQL_TABLE_NAME), engine)
df_data.head()

Unnamed: 0,id,message,genre,category_related,category_request,category_offer,category_aid_related,category_medical_help,category_medical_products,category_search_and_rescue,...,category_aid_centers,category_other_infrastructure,category_weather_related,category_floods,category_storm,category_fire,category_earthquake,category_cold,category_other_weather,category_direct_report
0,2,Weather update - a cold front from Cuba that c...,direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,direct,1,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Preparing datasets

In [9]:
X = df_data["message"]
Y = df_data[[col for col in df_data.columns if "category" in col]]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

### Tokenize function
- lower case
- token creation
- stopword reduction
- lemmatizing
- stemming
- removing extra blank lanes

In [10]:
def tokenize(text):
    raw_text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    
    tokens = word_tokenize(raw_text)
    tokens = [t for t in tokens if t not in stopwords.words("english")]
    
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = tok.strip()
        clean_tok = stemmer.stem(clean_tok)
        clean_tok = lemmatizer.lemmatize(clean_tok)
        clean_tokens.append(clean_tok)
        
    return clean_tokens

In [11]:
for message in df_data["message"][:20]:
    print(tokenize(message))

['weather', 'updat', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti']
['hurrican']
['look', 'someon', 'name']
['un', 'report', 'leogan', 'destroy', 'hospit', 'st', 'croix', 'function', 'need', 'suppli', 'desper']
['say', 'west', 'side', 'haiti', 'rest', 'countri', 'today', 'tonight']
['inform', 'nation', 'palac']
['storm', 'sacr', 'heart', 'jesu']
['plea', 'need', 'tent', 'water', 'silo', 'thank']
['would', 'like', 'receiv', 'messag', 'thank']
['croix', 'de', 'bouquet', 'health', 'issu', 'worker', 'santo', 'area', 'croix', 'de', 'bouquet']
['noth', 'eat', 'water', 'starv', 'thirsti']
['petionvil', 'need', 'inform', 'regard']
['thomassin', 'number', 'area', 'name', 'pyron', 'would', 'like', 'water', 'thank', 'god', 'fine', 'desper', 'need', 'water', 'thank']
['let', 'togeth', 'need', 'food', 'delma', 'didin', 'area']
['inform', 'number', 'order', 'particip', 'see', 'use']
['comite', 'delma', 'rue', 'street', 'janvier', 'impass', 'charit', 'peopl', 'temporari', 'shelter', 'dire', 'need'

In [12]:
all_tokens = []
for message in df_data["message"]:
    all_tokens.extend(tokenize(message))

In [13]:
word_counts = Counter(all_tokens)
word_counts.most_common()

[('water', 3046),
 ('peopl', 3017),
 ('food', 2904),
 ('help', 2858),
 ('need', 2755),
 ('plea', 2086),
 ('u', 1927),
 ('earthquak', 1923),
 ('area', 1667),
 ('like', 1635),
 ('would', 1491),
 ('flood', 1439),
 ('said', 1351),
 ('http', 1257),
 ('countri', 1251),
 ('thank', 1175),
 ('know', 1119),
 ('govern', 1116),
 ('also', 1114),
 ('hous', 1063),
 ('rain', 1060),
 ('haiti', 1044),
 ('inform', 1043),
 ('one', 1022),
 ('work', 1000),
 ('live', 995),
 ('find', 971),
 ('year', 957),
 ('sandi', 931),
 ('provid', 928),
 ('tent', 886),
 ('affect', 886),
 ('go', 882),
 ('get', 870),
 ('includ', 869),
 ('aid', 864),
 ('famili', 854),
 ('region', 852),
 ('nation', 847),
 ('suppli', 844),
 ('good', 840),
 ('relief', 813),
 ('child', 812),
 ('commun', 790),
 ('day', 785),
 ('two', 774),
 ('health', 753),
 ('report', 729),
 ('distribut', 709),
 ('messag', 697),
 ('well', 697),
 ('mani', 693),
 ('villag', 692),
 ('school', 691),
 ('give', 681),
 ('use', 667),
 ('caus', 663),
 ('state', 657),
 ('c

### Verification metric

In [16]:
def get_classification_result(Y_expected, Y_pred, class_names):
    def _metric_per_class(Y_expected, Y_pred, metric):
        return [metric(Y_expected.iloc[:, i], Y_pred[:, i], average="weighted") for i in range(Y_pred.shape[1])]

    metrics = {}
    for metric_name, metric in zip(["f1_score", "precision", "recall"], [f1_score, precision_score, recall_score]):
        metrics[metric_name] = _metric_per_class(Y_expected, Y_pred, metric)

    df_metrics = pd.DataFrame(data=metrics)
    df_metrics.index = class_names
    return df_metrics

### Baseline model
Model setup

In [17]:
baseline = Pipeline([   
    ("text_pipeline", Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ("tfidf", TfidfTransformer())
    ])),
    ("clf", MultiOutputClassifier(DummyClassifier()))
])

Model training

In [18]:
baseline.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('text_pipeline', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1...=DummyClassifier(constant=None, random_state=None, strategy='stratified'),
           n_jobs=None))])

Model Result

In [19]:
Y_pred = baseline.predict(X_test)
baseline_results = get_classification_result(Y_test, Y_pred, Y.columns)

In [20]:
baseline_results

Unnamed: 0,f1_score,precision,recall
category_related,0.644204,0.641042,0.64753
category_request,0.712881,0.713193,0.712569
category_offer,0.992078,0.993506,0.990654
category_aid_related,0.505997,0.50514,0.506962
category_medical_help,0.860647,0.863035,0.858287
category_medical_products,0.909179,0.908011,0.910357
category_search_and_rescue,0.946094,0.944079,0.948121
category_security,0.961068,0.96181,0.960328
category_military,0.930415,0.928932,0.931909
category_water,0.884315,0.886725,0.881938


In [22]:
baseline_results.mean()

f1_score     0.875476
precision    0.875798
recall       0.875186
dtype: float64

### More Capable Model #1

Model setup

In [23]:
model_basic = Pipeline([   
    ("text_pipeline", Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ("tfidf", TfidfTransformer())
    ])),
    ("clf", MultiOutputClassifier(RandomForestClassifier(random_state=RANDOM_SEED)))
])

Model training

In [24]:
model_basic.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('text_pipeline', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1..._score=False, random_state=1500, verbose=0,
            warm_start=False),
           n_jobs=None))])

Model results

In [28]:
Y_pred = model_basic.predict(X_test)
model_basic_results = get_classification_result(Y_test, Y_pred, [c[9:] for c in Y.columns])

In [29]:
model_basic_results

Unnamed: 0,f1_score,precision,recall
related,0.792849,0.790176,0.804501
request,0.877233,0.882903,0.889376
offer,0.995139,0.993526,0.996758
aid_related,0.75032,0.754169,0.754911
medical_help,0.902811,0.901822,0.92695
medical_products,0.934235,0.939363,0.952317
search_and_rescue,0.960288,0.964226,0.972344
security,0.971471,0.968441,0.980545
military,0.94894,0.95032,0.96338
water,0.951797,0.953678,0.95823


In [30]:
model_basic_results.mean()

f1_score     0.931911
precision    0.932438
recall       0.943974
dtype: float64

### More Capable Model #2

Model setup

In [31]:
model_basic_tuned = Pipeline([   
    ("text_pipeline", Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ("tfidf", TfidfTransformer())
    ])),
    ("clf", MultiOutputClassifier(RandomForestClassifier()))
])

model_basic_tuned_parameters = {
    "clf__estimator__max_depth": [None, 5, 15, 25],
    "clf__estimator__min_samples_split": [2, 3, 4],
    "clf__estimator__random_state": [RANDOM_SEED]
}

model_basic_tuned_cv = GridSearchCV(model_basic_tuned, param_grid=model_basic_tuned_parameters, verbose=2)

Model training

In [32]:
model_basic_tuned_cv.fit(X_train, Y_train)
model_basic_tuned_cv.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] clf__estimator__max_depth=None, clf__estimator__min_samples_split=2, clf__estimator__random_state=1500 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__max_depth=None, clf__estimator__min_samples_split=2, clf__estimator__random_state=1500, total= 1.7min
[CV] clf__estimator__max_depth=None, clf__estimator__min_samples_split=2, clf__estimator__random_state=1500 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.8min remaining:    0.0s


[CV]  clf__estimator__max_depth=None, clf__estimator__min_samples_split=2, clf__estimator__random_state=1500, total= 1.7min
[CV] clf__estimator__max_depth=None, clf__estimator__min_samples_split=2, clf__estimator__random_state=1500 
[CV]  clf__estimator__max_depth=None, clf__estimator__min_samples_split=2, clf__estimator__random_state=1500, total= 1.6min
[CV] clf__estimator__max_depth=None, clf__estimator__min_samples_split=3, clf__estimator__random_state=1500 
[CV]  clf__estimator__max_depth=None, clf__estimator__min_samples_split=3, clf__estimator__random_state=1500, total= 1.5min
[CV] clf__estimator__max_depth=None, clf__estimator__min_samples_split=3, clf__estimator__random_state=1500 
[CV]  clf__estimator__max_depth=None, clf__estimator__min_samples_split=3, clf__estimator__random_state=1500, total= 1.5min
[CV] clf__estimator__max_depth=None, clf__estimator__min_samples_split=3, clf__estimator__random_state=1500 
[CV]  clf__estimator__max_depth=None, clf__estimator__min_samples_sp

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 78.6min finished


Pipeline(memory=None,
     steps=[('text_pipeline', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1..._score=False, random_state=1500, verbose=0,
            warm_start=False),
           n_jobs=None))])

Model results

In [36]:
Y_pred = model_basic_tuned_cv.predict(X_test)
model_basic_tuned_results = get_classification_result(Y_test, Y_pred, [c[9:] for c in Y.columns])

In [37]:
model_basic_tuned_results

Unnamed: 0,f1_score,precision,recall
related,0.792849,0.790176,0.804501
request,0.877233,0.882903,0.889376
offer,0.995139,0.993526,0.996758
aid_related,0.75032,0.754169,0.754911
medical_help,0.902811,0.901822,0.92695
medical_products,0.934235,0.939363,0.952317
search_and_rescue,0.960288,0.964226,0.972344
security,0.971471,0.968441,0.980545
military,0.94894,0.95032,0.96338
water,0.951797,0.953678,0.95823


In [38]:
model_basic_tuned_results.mean()

f1_score     0.931911
precision    0.932438
recall       0.943974
dtype: float64

### Advanced Model

Model setup

In [47]:
model_lgbm = Pipeline([
    ("text_pipeline", Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ("tfidf", TfidfTransformer())
    ])),
    ("clf", MultiOutputClassifier(lgbm.LGBMClassifier()))
])

parameters = {
    "text_pipeline__vect__ngram_range": ((1, 1), (1, 2)),
    "text_pipeline__vect__max_df": (0.75, 1.0),
    "text_pipeline__vect__max_features": (None, 5000),
    "clf__estimator__num_leaves": [5, 10, 20],
    "clf__estimator__max_depth": [10, 15, 25],
    "clf__estimator__seed": [RANDOM_SEED]
}

model_lgbm_cv = GridSearchCV(model_lgbm, param_grid=parameters, verbose=2)

Model training

In [48]:
model_lgbm_cv.fit(X_train, Y_train)
model_lgbm_cv.best_estimator_

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1), total= 1.6min
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.5min remaining:    0.0s


[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1), total= 1.6min
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1), total= 1.7min
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__

[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2), total= 1.4min
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2), total= 1.4min
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect_

[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 2), total= 2.0min
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 1), total= 1.5min
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__

[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1), total= 1.7min
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1), total= 1.8min
[CV] clf__estimator__max_depth=10, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=10, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__

[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2), total= 1.4min
[CV] clf__estimator__max_depth=15, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2), total= 1.4min
[CV] clf__estimator__max_depth=15, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__

[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 2), total= 2.8min
[CV] clf__estimator__max_depth=15, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 1), total= 1.5min
[CV] clf__estimator__max_depth=15, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipel

[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1), total= 1.7min
[CV] clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1), total= 1.6min
[CV] clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipel

[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2), total= 1.6min
[CV] clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2), total= 1.7min
[CV] clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=15, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__

[CV]  clf__estimator__max_depth=25, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 2), total= 1.6min
[CV] clf__estimator__max_depth=25, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=25, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 1), total= 1.4min
[CV] clf__estimator__max_depth=25, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=25, clf__estimator__num_leaves=5, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect_

[CV]  clf__estimator__max_depth=25, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1), total= 1.5min
[CV] clf__estimator__max_depth=25, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1) 
[CV]  clf__estimator__max_depth=25, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 1), total= 1.5min
[CV] clf__estimator__max_depth=25, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__vect__max_features=None, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=25, clf__estimator__num_leaves=10, clf__estimator__seed=1500, text_pipeline__vect__max_df=1.0, text_pipeline__

[CV]  clf__estimator__max_depth=25, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2), total= 1.7min
[CV] clf__estimator__max_depth=25, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=25, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2), total= 1.7min
[CV] clf__estimator__max_depth=25, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipeline__vect__max_features=5000, text_pipeline__vect__ngram_range=(1, 2) 
[CV]  clf__estimator__max_depth=25, clf__estimator__num_leaves=20, clf__estimator__seed=1500, text_pipeline__vect__max_df=0.75, text_pipel

[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed: 549.9min finished


Pipeline(memory=None,
     steps=[('text_pipeline', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, ...=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0),
           n_jobs=None))])

Model results

In [40]:
Y_pred = model_lgbm_cv.predict(X_test)
model_lgbm_results = get_classification_result(Y_test, Y_pred, [c[9:] for c in Y.columns])

In [41]:
model_lgbm_results

Unnamed: 0,f1_score,precision,recall
related,0.82235,0.83045,0.83807
request,0.90844,0.909965,0.91379
offer,0.99882,0.998805,0.998856
aid_related,0.809726,0.811906,0.81194
medical_help,0.938891,0.941192,0.947358
medical_products,0.966633,0.969808,0.9712
search_and_rescue,0.975727,0.977441,0.979783
security,0.979746,0.981891,0.984169
military,0.980901,0.981643,0.982644
water,0.98034,0.980155,0.980736


In [42]:
model_lgbm_results.mean()

f1_score     0.956492
precision    0.958807
recall       0.960884
dtype: float64

#### Test

In [43]:
result = model_lgbm_cv.predict(["I like pancakes!"])
indices = [i for i, v in enumerate(result.ravel()) if v == 1]
np.array([c[9:] for c in Y.columns])[indices]

array([], dtype='<U22')

In [44]:
result = model_lgbm_cv.predict(["There was a fire at my house. We need medical aid!"])
indices = [i for i, v in enumerate(result.ravel()) if v == 1]
np.array([c[9:] for c in Y.columns])[indices]

array(['related', 'request', 'aid_related', 'medical_help',
       'medical_products', 'weather_related', 'fire'], dtype='<U22')

In [51]:
result = model_lgbm_cv.predict(["The whole town is flooded. " +
                                "It keeps raining and we are stuck on the roof freezing cold."])
indices = [i for i, v in enumerate(result.ravel()) if v == 1]
np.array([c[9:] for c in Y.columns])[indices]

array(['related', 'buildings', 'weather_related', 'floods', 'storm',
       'cold'], dtype='<U22')

### Saving model

In [52]:
joblib.dump(model_lgbm_cv, MODEL_OUTPUT_FILENAME)

['model.pkl']