# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import itertools
import re
import nltk
import xgboost as xgb
from sqlalchemy import create_engine
from nltk.corpus import stopwords as nl_stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from nltk.stem import WordNetLemmatizer
  
nltk.download('punkt')
nltk.download('stopwords')

import gensim

import dill
import pickle
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amatamune\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amatamune\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# load data
engine = create_engine('sqlite:///database/disaster_response.db')
df = pd.read_sql_table('categorized_messages', con=engine)
Y = df.drop(['id','message','original','genre',
            'related','request','offer','aid_related','direct_report',
            ], axis=1)

#Get service counts
service_cnts = Y.sum(axis=0).sort_values(ascending=False)
requests_and_offers = df[['request','offer']].sum(axis=0)
top10 = service_cnts.iloc[:10]
print(requests_and_offers)

request    4464
offer       118
dtype: int64


In [9]:
top10.values

array([7286, 3441, 2917, 2452, 2440, 2308, 2149, 2081, 1705, 1669],
      dtype=int64)

### Installing missing dependencies:

In [None]:
#!pip install gensim

In [None]:
#!pip install xgboost

In [None]:
#!pip install dill

In [None]:
!pip install --upgrade pandas

In [None]:
!pip install --upgrade sklearn

In [3]:
# Load data from database
engine = create_engine('sqlite:///database/disaster_response.db')
df = pd.read_sql_table('categorized_messages', con=engine)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Separate into explanotary variable (tweet text) 
# and the target (message classes)
# We also save the genre, although I ended up not using it.

X_text = df['message']
X_genre = df['genre']
Y = df.drop(['id','message','original','genre'], axis=1)

### Take a look at the data if necessary:

In [4]:
X_text.head()

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

In [5]:
Y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Write a tokenization function to process your text data

In [5]:
def tokenize(text,stopwords=None):
    """
    Function performs basic tokenization:
    1. Conversion to lowercase
    2. Removal of special characters
    3. Tokenization using NLTK
    4. Removal of stopwords
    
    Args:
    text (str): text to be tokenized
    
    Out:
    words (list): a list of tokens
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9]'," ",text)
    words = nltk.word_tokenize(text)
    if stopwords:
        words = [w for w in words if w not in stopwords]
    
    return words
    

In [6]:
# Test that it works
stopwords = nl_stopwords.words('english')
tokenize(X_text.iloc[0], stopwords)

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pass', 'haiti']

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [6]:
# The child-alone category is 0 for all observations. 
# We should remove it and then just predict it to be 0 independent of text.

#Check for redundant variables in the class list - this is how we find child-alone
Y.sum()

related                   20252
request                    4464
offer                       118
aid_related               10841
medical_help               2081
medical_products           1311
search_and_rescue           724
security                    471
military                    859
child_alone                   0
water                      1669
food                       2917
shelter                    2308
clothing                    404
money                       603
missing_people              298
refugees                    874
death                      1192
other_aid                  3441
infrastructure_related     1705
transport                  1199
buildings                  1331
electricity                 532
tools                       159
hospitals                   283
shops                       120
aid_centers                 309
other_infrastructure       1151
weather_related            7286
floods                     2149
storm                      2440
fire    

In [7]:
Y = Y.drop(['child_alone'], axis=1)

In [8]:
# Resetting indeces is necessary to avoid
# ValueError: WRITEBACKIFCOPY base is read-only
X_text.reset_index()
Y.reset_index()

Unnamed: 0,index,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26175,26175,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26176,26176,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26177,26177,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26178,26178,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Form the sets
X_train, X_test, Y_train, Y_test = train_test_split(X_text, Y, test_size=0.2, random_state=42)

In [10]:
# Check dimensions
print(X_train.shape)
print(Y_train.shape)

(20944,)
(20944, 35)


In [129]:
# We'll be using XGBoost as the backbone of the MultiOutputClassifier
# It shows better results taht Random Forest and scikit-learn basic gradient boosting
pipeline_basic = Pipeline([
    ('vec',CountVectorizer(tokenizer=lambda x: tokenize(x, stopwords))),
    ('tfidf',TfidfTransformer()),
    
    #('clf',MultiOutputClassifier(estimator=RandomForestClassifier(random_state=42)))
    #('clf',MultiOutputClassifier(estimator=GradientBoostingClassifier(random_state=42)))
    ('clf',MultiOutputClassifier(estimator=xgb.XGBClassifier(random_state=42,n_estimators=100,subsample=1)))
    
])

In [130]:
pipeline_basic.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...le_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
           n_jobs=None))])

In [131]:
pipeline_basic.steps[2][1].estimator

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [138]:
Y_pred = pipeline_basic.predict(X_test)

In [139]:
# Note: classification report isn't very helpful
# we'll just output the f1-score and use that to judge model quality.
# If we had more time to put into this project,
# AUC would also be a great candidate to base model selection on
for ind, col in enumerate(list(Y_test.columns)):
    #print(ind)
    y_test = list(Y_test.iloc[:,ind])
    y_pred = list(Y_pred[:,ind])
    #print(y_test)
    #print(y_pred)
    #break
    print(col)
    try:
        print('F1-score is {}'.format(f1_score(y_test, y_pred)))
    except:
        print('F1-score is {}'.format(f1_score(y_test, y_pred, average='weighted')))
    #print(classification_report(y_test,y_pred))

0
related
F1-score is 0.7049406548565185
1
request
F1-score is 0.5802382620882972
2
offer
F1-score is 0.0
3
aid_related
F1-score is 0.6330275229357798
4
medical_help
F1-score is 0.27016885553470915
5
medical_products
F1-score is 0.4089635854341736
6
search_and_rescue
F1-score is 0.16666666666666669
7
security
F1-score is 0.018867924528301886
8
military
F1-score is 0.2880658436213992
9
water
F1-score is 0.717041800643087
10
food
F1-score is 0.7845884413309983
11
shelter
F1-score is 0.6219839142091154
12
clothing
F1-score is 0.5161290322580645
13
money
F1-score is 0.35802469135802467
14
missing_people
F1-score is 0.31746031746031744
15
refugees
F1-score is 0.2834008097165992
16
death
F1-score is 0.5351351351351351
17
other_aid
F1-score is 0.12403100775193798
18
infrastructure_related
F1-score is 0.07124681933842239
19
transport
F1-score is 0.3257328990228013
20
buildings
F1-score is 0.3958333333333334
21
electricity
F1-score is 0.3333333333333333
22
tools
F1-score is 0.0
23
hospitals
F1-

### 6. Improve your model
Use grid search to find better parameters. 

In [67]:
# Set up a simple parameter grid. Even this will require training 6 models
parameters = {
    'clf__estimator__n_estimators': [100, 200],
    'clf__estimator__subsample': [0.8, 1],
    'clf__estimator__max_depth': [2, 3, 4]
}

# See what we wind up with
cv_basic = GridSearchCV(pipeline_basic, param_grid=parameters, cv=3, n_jobs=6)

In [69]:
cv_basic.fit(X_train, Y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...le_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
           n_jobs=None))]),
       fit_params=None, iid='warn', n_jobs=6,
       param_grid={'clf__estimator__n_estimators': [100, 200], 'clf__estimator__subsample': [0.8, 1], 'clf__estimator__max_depth': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [70]:
# Find the model choice we settled on
cv_basic.best_estimator_

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_..._pos_weight=1, seed=None, silent=None,
       subsample=0.8, verbosity=1),
           n_jobs=None))])

In [71]:
# Find the estimator parameters we settled on
cv_basic.best_estimator_.steps[2][1].estimator

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=0.8, verbosity=1)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [72]:
# Test the results
Y_cv = cv_basic.predict(X_test)

In [151]:
# Again we focus on the F1-scores. Classification report
# is uncommented only for submission as it isn't very useful.
cv_basic_fscores = {}
for ind, col in enumerate(list(Y_test.columns)):
    y_test = list(Y_test.iloc[:,ind])
    y_cv = list(Y_cv[:,ind])
    #print(y_test)
    #print(y_pred)
    #break
    print(col)
    try:
        print('F1-score is {}'.format(f1_score(y_test, y_cv)))
        cv_basic_fscores[col] = f1_score(y_test, y_cv)
    except:
        pass
    #print(classification_report(y_test,y_pred))

related
request
F1-score is 0.6345646437994723
offer
F1-score is 0.0
aid_related
F1-score is 0.6767151767151767
medical_help
F1-score is 0.3395585738539898
medical_products
F1-score is 0.4690721649484536
search_and_rescue
F1-score is 0.23958333333333334
security
F1-score is 0.0900900900900901
military
F1-score is 0.4229390681003584
water
F1-score is 0.7320872274143303
food
F1-score is 0.7829937998228522
shelter
F1-score is 0.6674937965260546
clothing
F1-score is 0.5079365079365079
money
F1-score is 0.42458100558659223
missing_people
F1-score is 0.3636363636363636
refugees
F1-score is 0.3308270676691729
death
F1-score is 0.5824742268041238
other_aid
F1-score is 0.2028639618138425
infrastructure_related
F1-score is 0.14285714285714285
transport
F1-score is 0.36923076923076925
buildings
F1-score is 0.4666666666666667
electricity
F1-score is 0.40789473684210525
tools
F1-score is 0.06896551724137931
hospitals
F1-score is 0.14084507042253522
shops
F1-score is 0.0
aid_centers
F1-score is 0.11

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

### Add more regularization
The first things we can do is add column subsampling for different trees. This tends to make sure that the model gets all it can from the less significant features.

In [75]:
# Advanced pipeline
# add colsample_bytree
pipeline_advanced = Pipeline([
    ('vec',CountVectorizer(tokenizer=lambda x: tokenize(x, stopwords))),
    ('tfidf',TfidfTransformer()),
    ('clf',MultiOutputClassifier(estimator=xgb.XGBClassifier(
        random_state=42,n_estimators=200,subsample=0.8,max_depth=4,learning_rate=0.1,colsample_bytree=0.4)))
])

In [76]:
pipeline_advanced.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_..._pos_weight=1, seed=None,
       silent=None, subsample=0.8, verbosity=1),
           n_jobs=None))])

In [77]:
#Check results
Y_pred = pipeline_advanced.predict(X_test)
advanced_fscores = {}
for ind, col in enumerate(list(Y_test.columns)):
    y_test = list(Y_test.iloc[:,ind])
    y_pred = list(Y_pred[:,ind])
    y_score = list(Y_score[ind][:,1])
    #print(y_test)
    #print(y_pred)
    #break
    print(col)
    try:
        advanced_fscores[col] = f1_score(y_test, y_pred)
    except:
        advanced_fscores[col] = f1_score(y_test, y_pred, average='weighted')
    print('F1-score is {}'.format(advanced2_fscores[col]))    
       
    #print(classification_report(y_test,y_pred))

related
F1-score is 0.751148734275228
request
F1-score is 0.6221928665785997
offer
F1-score is 0.0
aid_related
F1-score is 0.6758656599843791
medical_help
F1-score is 0.34529914529914535
medical_products
F1-score is 0.4578947368421052
search_and_rescue
F1-score is 0.20105820105820105
security
F1-score is 0.07207207207207207
military
F1-score is 0.38686131386861317
water
F1-score is 0.725
food
F1-score is 0.7915194346289752
shelter
F1-score is 0.6741293532338307
clothing
F1-score is 0.53125
money
F1-score is 0.4285714285714286
missing_people
F1-score is 0.2903225806451613
refugees
F1-score is 0.32452830188679244
death
F1-score is 0.5744125326370757
other_aid
F1-score is 0.18465227817745805
infrastructure_related
F1-score is 0.1108433734939759
transport
F1-score is 0.3682539682539683
buildings
F1-score is 0.47596153846153855
electricity
F1-score is 0.42105263157894735
tools
F1-score is 0.0
hospitals
F1-score is 0.028985507246376815
shops
F1-score is 0.0
aid_centers
F1-score is 0.12048192

  'precision', 'predicted', average, warn_for)


### Conclusion
The F1-scores are better pretty much across the board. From now on, we'll continue to use column subsampling. But there is more to be done if we want to improve F1-scores.

## The importance of balance
Our classes are unbalanced across the board. The ratio is different everywhere but it's, in general, something like this:

In [78]:
Y_train['water'].value_counts()

0    19631
1     1313
Name: water, dtype: int64

What we need is to give more weight to the positives. Currently, our model has excellent precision and bad recall. But we can give up a bit of precision (have responders discard some messages that do get to them) than it is to miss messages in the noise.
I decided to assign the positive observations weight=3. Frankly, this parameter should be properly tuned, but this is outside the scope of this project.

In [24]:
# Advanced pipeline
# add colsample_bytree
pipeline_advanced2 = Pipeline([
    ('vec',CountVectorizer(tokenizer=lambda x: tokenize(x, stopwords))),
    ('tfidf',TfidfTransformer()),
    ('clf',MultiOutputClassifier(estimator=xgb.XGBClassifier(
        random_state=42,n_estimators=200,subsample=0.8,max_depth=4,
        learning_rate=0.1,colsample_bytree=0.4,scale_pos_weight=3)))
])

In [25]:
pipeline_advanced2.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_..._pos_weight=3, seed=None,
       silent=None, subsample=0.8, verbosity=1),
           n_jobs=None))])

In [27]:
#Check results
Y_pred = pipeline_advanced2.predict(X_test)
advanced2_fscores = {}
for ind, col in enumerate(list(Y_test.columns)):
    y_test = list(Y_test.iloc[:,ind])
    y_pred = list(Y_pred[:,ind])
    #print(y_test)
    #print(y_pred)
    #break
    print(col)
    try:
        advanced2_fscores[col] = f1_score(y_test, y_pred)
    except:
        advanced2_fscores[col] = f1_score(y_test, y_pred, average='weighted')
    print('F1-score is {}'.format(advanced2_fscores[col]))    
       
    #print(classification_report(y_test,y_pred))

related
F1-score is 0.751148734275228
request
F1-score is 0.6720430107526881
offer
F1-score is 0.0625
aid_related
F1-score is 0.7156549520766773
medical_help
F1-score is 0.4718792866941015
medical_products
F1-score is 0.52
search_and_rescue
F1-score is 0.3113207547169811
security
F1-score is 0.1
military
F1-score is 0.5031847133757962
water
F1-score is 0.7714285714285714
food
F1-score is 0.79232693911593
shelter
F1-score is 0.7085590465872155
clothing
F1-score is 0.5401459854014599
money
F1-score is 0.4642857142857142
missing_people
F1-score is 0.3768115942028985
refugees
F1-score is 0.419672131147541
death
F1-score is 0.641255605381166
other_aid
F1-score is 0.4251357641582622
infrastructure_related
F1-score is 0.22633744855967078
transport
F1-score is 0.42896935933147634
buildings
F1-score is 0.5526838966202783
electricity
F1-score is 0.5196078431372549
tools
F1-score is 0.06451612903225806
hospitals
F1-score is 0.1951219512195122
shops
F1-score is 0.05405405405405405
aid_centers
F1-s

Again, we have improved across most classes. There is one last improvement I'd like to make before moving on from this project.
Hope you find it fun to analyze.

## Word2Vec incorporation - outline
My strategy was to try to move from pure bag-of-words to something that also takes into account the meaning of words and sentence length/punctuation.
To measure emotional distress I calculate sentence length statistics for tweets and also the number of various punctuation marks.
To associate meaning with tokens, I use GoogleNews pretrained Word2Vec model. Then I clusterized all the words in the training corpus using KMeans. I then calculate three most prominent clusters within each tweet (if two clusters have the same number of members in the tweet, then the cluster that has lower global frequency is given priority).

# Warning to the reviewer
If you are reviewing this work, then I recommend that you don't run the following section, as it requires you to download the `GoogleNews-vectors-negative300.bin`, which is larger than 3GB. I'm quite proud of the idea and that it worked, but reading the source code should suffice.
If you'd like to test the model, just skip down and load the pickled model, see `pipeline_advanced4 = ...`
You can then test it.
If you do want to download Google's word2Vec, then it's available [here](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)

In [10]:
#Load in word2vec
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./.word2vec/GoogleNews-vectors-negative300.bin',binary=True)

In [11]:
#The goal is to create a mapping that maps a word to a word cluster
#We'll later use the three most prominent cluster numbers as factors
#and try to improve the model this way

#First we need to get all the words into one array
all_words = []
i=0
for sent in X_text:
    all_words += tokenize(sent)

In [12]:
#Now we map words to vectors
unique_words = set(all_words)
unique_words.update('supercollider')
unique_words = list(unique_words)

In [13]:
#Allow words to not be found
def get_vector_func(w2v, w_placeholder='supercollider'):
    """
    The function sets the Word2Vec model
    for the inner get_vector function
    and returns it.
    
    Args:
    w2v (Word2VecKeyedVectors) - a Word2Vec model
    w_placeholder (str) - the word that we'll replace
        missing words with. Doesn't matter what it is
        as long as it's rare and has nothing to do
        with natural disasters
    
    Out:
    try_get_vector (func) - a function that allows
    words tobe missing from the vocabulary
    """
    def try_get_vector(word):
        """
        This inner function implements exception handling
        for Word2VecKeyedVectors.get_vector
        """
        try:
            vect = w2v.get_vector(word)
        except:
            # Doesn't matter what we use for words that aren't found
            # as long as it's rare and has nothing to do with
            # natural disasters
            vect = w2v.get_vector(w_placeholder)
        return vect
    return try_get_vector

In [14]:
try_get_vector = get_vector_func(w2v_model)
words_mappings = {word:vect for word,vect in zip(unique_words, map(try_get_vector,unique_words))}

In [15]:
#We begin by building a couple feature engineering transformers
class SentLenExtractor(BaseEstimator, TransformerMixin):
    """
    Class extracts average sentence length and standard deviation
    of the sentence length from a document.
    I don't expect this to improve the model. This transformer and
    the next were exercises leading to w2vClusters
    """
    
    def __init__(self):
        """
        sent_lengths (list): sentence lengths
        mess_col (str): name of the message column
        """
        self.sent_lengths = None
    
    def calc_sent_lengths(self, text):
        sentence_list = nltk.sent_tokenize(text)
        if sentence_list:
            sent_lengths = [len(s) for s in sentence_list]
        else:
            sent_lengths = [0]
        return sent_lengths
    
    def len_mean(self, text):
        if self.sent_lengths is None:
            self.calc_sent_lengths(text)
        return np.mean(self.sent_lengths)
    
    def len_std(self, text):
        if self.sent_lengths is None:
            self.calc_sent_lengths(text)
        return np.std(self.sent_lengths)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        SL = X.apply(self.calc_sent_lengths)
        X_mean = SL.apply(np.mean).rename('sent_len_mean')
        X_std = SL.apply(np.std).rename('sent_len_std')
        X_len = pd.concat([X_mean, X_std], axis=1)
        return X_len

class PunktCounter(BaseEstimator, TransformerMixin):
    """
    Class calculates the number of punctuation characters in text
    """
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_quest = X.apply(lambda x: x.count('?')).rename('quest_cnt')
        X_comma = X.apply(lambda x: x.count(',')).rename('comma_cnt')
        X_exclam = X.apply(lambda x: x.count('!')).rename('excl_cnt')
        X_punct = pd.concat([X_quest, X_comma, X_exclam], axis=1)
        return X_punct

# TO ADD: feature engineering: word2vec and clusterization
class w2vClusters(BaseEstimator, TransformerMixin):
    
    def __init__(self, n_clusters, words_mappings, 
                 random_state=42, n_jobs=1, tokenize=tokenize):
        """
        Args:
        n_clusters (int) - number of clusters to use for KMeans
        words_mappings (dict) - a dict mapping words to vectors
        random_state (float) - initialization random state for KMeans
        n_jobs (int) - number of parallel jobs for KMeans
        tokenize (func) - sentence tokenizing function
        """
        self.n_clusters = n_clusters
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.words_mappings = words_mappings
        self.tokenize = tokenize
        self.cl_model = KMeans(n_clusters=n_clusters, random_state=random_state, n_jobs=n_jobs)
    
    def ws_to_vs(self, words):
        def w_to_v(word):
            try:
                vec = self.words_mappings[word]
            except:
                # This is meant as a harmless joke
                # I very well know the dangers of hard-coding
                # stuff like this deep into the implementation
                vec = self.words_mappings['supercollider']
            return vec
        vecs = list(map(w_to_v,words))
        return vecs
    
    def fit(self, X, y=None):
        """
        Fits the kmeans model that is used to assign clusters
        to words
        """
        X_token = X.apply(self.tokenize)
        words = list(itertools.chain.from_iterable(X_token))
        vecs = self.ws_to_vs(words)
        self.cl_model.fit(vecs)        

        # Tweets are short, so we need a conflict resolution mechanism for
        # when we'll have just one word in a second or third most
        # frequent cluster
        #
        # We'll be prioritizing the less frequent clusters, so we need to
        # calculate the frequencies
        clusters = self.cl_model.predict(vecs)
        self.cl_counts = pd.Series(clusters).value_counts()
        
        return self
    
    def transform(self, X):
        """
        Attributes words to KMeans clusters and outputs three
        clusters with the highest frequencies in a tweet. In
        case of conflicts takes the globally less frequent tweets.

        Args:
        X - tweet series
        """
        # Clean and extract words
        def count_clusters(tweet):
            words = self.tokenize(tweet)
            
            if not words:
                return [-1000000, -1000000, -1000000]
            # Check if we got any words
            vecs = self.ws_to_vs(words)
        
            # Get clusters
            clusters = self.cl_model.predict(vecs)

            # Count words in each cluster and sort
            x_cl_counts = pd.Series(clusters).value_counts().sort_values(ascending=False)
            #Get three most prominent clusters
            prev_count = 0
            cls = []
            curr_idces = []
            for index, item in x_cl_counts.iteritems():
                if item < prev_count:
                    # Among the most frequent local clusters pick
                    # the least frequent global clusters
                    gl_cl_counts = self.cl_counts.loc[curr_idces].sort_values(ascending=True)
                    to_add = min(3-len(cls),len(gl_cl_counts))
                    cls += list(gl_cl_counts.iloc[:to_add].index)
                    if len(cls) >= 3:
                        break
                    curr_idces = []
                curr_idces.append(index)
                prev_count = item

            # If we didn't get three clusters, add the final ones
            if len(cls) < 3:
                gl_cl_counts = self.cl_counts.loc[curr_idces].sort_values(ascending=True)
                to_add = min(3-len(cls),len(gl_cl_counts))
                cls += list(gl_cl_counts.iloc[:to_add].index)

            # If still not enough, pad with -1000000
            if len(cls) < 3:
                cls += [-1000000] * (3 - len(cls))
            return cls
        X_clusters = X.apply(count_clusters)
        # From https://stackoverflow.com/questions/35491274/pandas-split-column-of-lists-into-multiple-columns
        X_clusters = pd.DataFrame(X_clusters.values.tolist(), index=X_clusters.index, 
                                  columns=['cluster1','cluster2','cluster3'])
        return X_clusters
    
class Debug(BaseEstimator, TransformerMixin):

    def transform(self, X):
        print(X.shape)
        print(X)
        # what other output you want
        return X

    def fit(self, X, y=None, **fit_params):
        return self

In [15]:
# Testing transformers
pc = SentLenExtractor()
pc.fit(X_train.iloc[:1000])
res = pc.transform(X_train[:1000])
res.iloc[0]

sent_len_mean    24.5
sent_len_std     12.5
Name: 4291, dtype: float64

In [None]:
# Checking the Word2Vec transformer
w2v_cl = w2vClusters(n_clusters=30, n_jobs=3, words_mappings=words_mappings, tokenize=tokenize)
w2v_cl.fit(X_train.iloc[:1000])

In [None]:
res = w2v_cl.transform(X_train[:1000])
res

In [18]:
def tkn(x):
    return tokenize(x, stopwords)
    

In [19]:
# This pipeline incorporates the w2vClusters transformer to incorporate some
# word meanings into the model
s_len = Pipeline([
    ('sl_extract',SentLenExtractor()),
    ('sl_scale',MinMaxScaler())    
])

s_punct = Pipeline([
    ('sp_extract',PunktCounter()),
    ('sp_scale',MinMaxScaler())   
    
])



# Advanced pipeline
pipeline_advanced3 = Pipeline([
    #('debug1',Debug()),
    ('feat', FeatureUnion(
        [('BoW',Pipeline(
            [('vec',CountVectorizer(tokenizer=tkn)),
            ('tfidf',TfidfTransformer())])),
        #('sentlen',s_len),
        #('punkt',s_punct),
        ('cl_freqs',Pipeline([
            # I tried different n_clusters here. Between 20 and 30 works best
            # Too little, and everything ends up in one cluster
            # Too much, and every word in a tweet is in a different cluster.
            ('w2v',w2vClusters(n_clusters=27, n_jobs=3, words_mappings=words_mappings, tokenize=tokenize)),
            ('one_hot',OneHotEncoder(categories='auto',handle_unknown='ignore'))]))
        ])),
    #('debug2',Debug()),
    ('clf',MultiOutputClassifier(estimator=xgb.XGBClassifier(
        random_state=42,n_estimators=200,subsample=0.8,max_depth=4,
        learning_rate=0.1,colsample_bytree=0.4,scale_pos_weight=3)))
])

In [21]:
pipeline_advanced3.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('feat',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('BoW',
                                                 Pipeline(memory=None,
                                                          steps=[('vec',
                                                                  CountVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.int64'>,
                                                                                  encoding='utf-8',
                                                                                  input='content',
                                                                                  lowercase=True,
  

In [47]:
# Testing the efficiency
Y_pred = pipeline_advanced3.predict(X_test)
advanced3_fscores = {}

for ind, col in enumerate(list(Y_test.columns)):
    y_test = list(Y_test.iloc[:,ind])
    y_pred = list(Y_pred[:,ind])
    #print(y_test)
    #print(y_pred)
    #break
    print(col)
    try:
        advanced3_fscores[col] = f1_score(y_test, y_pred)
    except:
        advanced3_fscores[col] = f1_score(y_test, y_pred, average='weighted')
    print('F1-score is {}'.format(advanced3_fscores[col]))  
    
    #print(classification_report(y_test,y_pred))

related
F1-score is 0.7599912211808767
request
F1-score is 0.6801530891197376
offer
F1-score is 0.0625
aid_related
F1-score is 0.7193378480060195
medical_help
F1-score is 0.47282608695652173
medical_products
F1-score is 0.5178571428571428
search_and_rescue
F1-score is 0.30985915492957744
security
F1-score is 0.11199999999999999
military
F1-score is 0.5
water
F1-score is 0.7554980595084086
food
F1-score is 0.7906588824020016
shelter
F1-score is 0.7035830618892508
clothing
F1-score is 0.5217391304347827
money
F1-score is 0.4711111111111111
missing_people
F1-score is 0.4057971014492754
refugees
F1-score is 0.4158415841584159
death
F1-score is 0.6486486486486487
other_aid
F1-score is 0.4208885424785659
infrastructure_related
F1-score is 0.2505050505050505
transport
F1-score is 0.40443213296398894
buildings
F1-score is 0.5386138613861386
electricity
F1-score is 0.5148514851485149
tools
F1-score is 0.06451612903225806
hospitals
F1-score is 0.20930232558139533
shops
F1-score is 0.051282051282

In [28]:
cumsum = 0
for col, item in advanced2_fscores.items():
    print(col)
    delta = advanced3_fscores[col]-item
    print("F1-score with word2Vec - F1-score with column subsample: {}".format(delta))
    cumsum += delta
print("Cumulative F1-score difference between word2Vec-including model and the next-best model is {}".format(cumsum))

related
F1-score with word2Vec - F1-score with column subsample: 0.008842486905648705
request
F1-score with word2Vec - F1-score with column subsample: 0.008110078367049467
offer
F1-score with word2Vec - F1-score with column subsample: 0.0
aid_related
F1-score with word2Vec - F1-score with column subsample: 0.0036828959293422336
medical_help
F1-score with word2Vec - F1-score with column subsample: 0.0009468002624202221
medical_products
F1-score with word2Vec - F1-score with column subsample: -0.002142857142857224
search_and_rescue
F1-score with word2Vec - F1-score with column subsample: -0.001461599787403678
security
F1-score with word2Vec - F1-score with column subsample: 0.011999999999999983
military
F1-score with word2Vec - F1-score with column subsample: -0.0031847133757961776
water
F1-score with word2Vec - F1-score with column subsample: -0.015930511920162704
food
F1-score with word2Vec - F1-score with column subsample: -0.0016680567139284452
shelter
F1-score with word2Vec - F1-sco

### Notes on further work
There is a lot more than can be done. I got the word2Vec-including model to the point where it has a positive impact (and doesn't react too strongly to a change in the number of clusters), but there is room to choose different clusterization algorithms, and engineer different features from them.
There is room for good old-fashioned hyper-parameter optimization. We could have started with widely different classifiers and then polled them using log-reg or done something similar.
Testing these ideas is outside the scope of this project, but maybe I'll return to them when I work on my final project, if I pick a natural language processing problem.

### 9. Export your model as a pickle file

In [22]:
joblib.dump(pipeline_advanced3,'pipeline_advanced3.pkl')


['pipeline_advanced3.pkl']

In [23]:
pipeline_advanced4 = joblib.load('pipeline_advanced3.pkl')

In [24]:
pipeline_advanced4.predict(X_test)

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 1]], dtype=int64)

In [25]:
sklearn.__version__

'0.21.2'

In [38]:
labels = Y_test.columns
labels

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'water', 'food', 'shelter', 'clothing', 'money', 'missing_people',
       'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport',
       'buildings', 'electricity', 'tools', 'hospitals', 'shops',
       'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
       'storm', 'fire', 'earthquake', 'cold', 'other_weather',
       'direct_report'],
      dtype='object')

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [None]:
import sys

import pandas as pd
import numpy as np
from datetime import datetime
import itertools
import re
import nltk
import xgboost as xgb
from sqlalchemy import create_engine
from nltk.corpus import stopwords as nl_stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from nltk.stem import WordNetLemmatizer
  
nltk.download('punkt')
nltk.download('stopwords')

import gensim

import dill
import pickle

#I've rearranged the functions a bit to tailor them to my implementation

def load_data(database_filepath):
    """
    Load data and split it into target and explanotary variables
    
    Args:
    database_filepath(str): path to the database
    
    Out:
    X_text(Series): a Series of messages
    """
    engine = create_engine('sqlite:///database/disaster_response.db')
    df = pd.read_sql_table('categorized_messages', con=engine)
    X_text = df['message']
    X_genre = df['genre']
    Y = df.drop(['id','message','original','genre'], axis=1)
    Y = Y.drop(['child_alone'], axis=1)
    X_text.reset_index()
    Y.reset_index()
    return X_text, Y

def tokenize(text,stopwords=None):
    """
    Function performs basic tokenization:
    1. Conversion to lowercase
    2. Removal of special characters
    3. Tokenization using NLTK
    4. Removal of stopwords
    
    Args:
    text (str): text to be tokenized
    
    Out:
    words (list): a list of tokens
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9]'," ",text)
    words = nltk.word_tokenize(text)
    if stopwords:
        words = [w for w in words if w not in stopwords]
    
    return words

def get_vector_func(w2v, w_placeholder='supercollider'):
    """
    The function sets the Word2Vec model
    for the inner get_vector function
    and returns it.

    Args:
    w2v (Word2VecKeyedVectors) - a Word2Vec model
    w_placeholder (str) - the word that we'll replace
        missing words with. Doesn't matter what it is
        as long as it's rare and has nothing to do
        with natural disasters

    Out:
    try_get_vector (func) - a function that allows
    words tobe missing from the vocabulary
    """
    def try_get_vector(word):
        """
        This inner function implements exception handling
        for Word2VecKeyedVectors.get_vector
        """
        try:
            vect = w2v.get_vector(word)
        except:
            # Doesn't matter what we use for words that aren't found
            # as long as it's rare and has nothing to do with
            # natural disasters
            vect = w2v.get_vector(w_placeholder)
        return vect
    return try_get_vector

#We begin by building a couple feature engineering transformers
class SentLenExtractor(BaseEstimator, TransformerMixin):
    """
    Class extracts average sentence length and standard deviation
    of the sentence length from a document.
    I don't expect this to improve the model. This transformer and
    the next were exercises leading to w2vClusters
    """
    
    def __init__(self):
        """
        sent_lengths (list): sentence lengths
        mess_col (str): name of the message column
        """
        self.sent_lengths = None
    
    def calc_sent_lengths(self, text):
        sentence_list = nltk.sent_tokenize(text)
        if sentence_list:
            sent_lengths = [len(s) for s in sentence_list]
        else:
            sent_lengths = [0]
        return sent_lengths
    
    def len_mean(self, text):
        if self.sent_lengths is None:
            self.calc_sent_lengths(text)
        return np.mean(self.sent_lengths)
    
    def len_std(self, text):
        if self.sent_lengths is None:
            self.calc_sent_lengths(text)
        return np.std(self.sent_lengths)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        SL = X.apply(self.calc_sent_lengths)
        X_mean = SL.apply(np.mean).rename('sent_len_mean')
        X_std = SL.apply(np.std).rename('sent_len_std')
        X_len = pd.concat([X_mean, X_std], axis=1)
        return X_len

class PunktCounter(BaseEstimator, TransformerMixin):
    """
    Class calculates the number of punctuation characters in text
    """
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_quest = X.apply(lambda x: x.count('?')).rename('quest_cnt')
        X_comma = X.apply(lambda x: x.count(',')).rename('comma_cnt')
        X_exclam = X.apply(lambda x: x.count('!')).rename('excl_cnt')
        X_punct = pd.concat([X_quest, X_comma, X_exclam], axis=1)
        return X_punct

# TO ADD: feature engineering: word2vec and clusterization
class w2vClusters(BaseEstimator, TransformerMixin):
    
    def __init__(self, n_clusters, words_mappings, 
                 random_state=42, n_jobs=1, tokenize=tokenize):
        """
        Args:
        n_clusters (int) - number of clusters to use for KMeans
        words_mappings (dict) - a dict mapping words to vectors
        random_state (float) - initialization random state for KMeans
        n_jobs (int) - number of parallel jobs for KMeans
        tokenize (func) - sentence tokenizing function
        """
        self.n_clusters = n_clusters
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.words_mappings = words_mappings
        self.tokenize = tokenize
        self.cl_model = KMeans(n_clusters=n_clusters, random_state=random_state, n_jobs=n_jobs)
    
    def ws_to_vs(self, words):
        def w_to_v(word):
            try:
                vec = self.words_mappings[word]
            except:
                # This is meant as a harmless joke
                # I very well know the dangers of hard-coding
                # stuff like this deep into the implementation
                vec = self.words_mappings['supercollider']
            return vec
        vecs = list(map(w_to_v,words))
        return vecs
    
    def fit(self, X, y=None):
        """
        Fits the kmeans model that is used to assign clusters
        to words
        """
        X_token = X.apply(self.tokenize)
        words = list(itertools.chain.from_iterable(X_token))
        vecs = self.ws_to_vs(words)
        self.cl_model.fit(vecs)        

        # Tweets are short, so we need a conflict resolution mechanism for
        # when we'll have just one word in a second or third most
        # frequent cluster
        #
        # We'll be prioritizing the less frequent clusters, so we need to
        # calculate the frequencies
        clusters = self.cl_model.predict(vecs)
        self.cl_counts = pd.Series(clusters).value_counts()
        
        return self
    
    def transform(self, X):
        """
        Attributes words to KMeans clusters and outputs three
        clusters with the highest frequencies in a tweet. In
        case of conflicts takes the globally less frequent tweets.

        Args:
        X - tweet series
        """
        # Clean and extract words
        def count_clusters(tweet):
            words = self.tokenize(tweet)
            
            if not words:
                return [-1000000, -1000000, -1000000]
            # Check if we got any words
            vecs = self.ws_to_vs(words)
        
            # Get clusters
            clusters = self.cl_model.predict(vecs)

            # Count words in each cluster and sort
            x_cl_counts = pd.Series(clusters).value_counts().sort_values(ascending=False)
            #Get three most prominent clusters
            prev_count = 0
            cls = []
            curr_idces = []
            for index, item in x_cl_counts.iteritems():
                if item < prev_count:
                    # Among the most frequent local clusters pick
                    # the least frequent global clusters
                    gl_cl_counts = self.cl_counts.loc[curr_idces].sort_values(ascending=True)
                    to_add = min(3-len(cls),len(gl_cl_counts))
                    cls += list(gl_cl_counts.iloc[:to_add].index)
                    if len(cls) >= 3:
                        break
                    curr_idces = []
                curr_idces.append(index)
                prev_count = item

            # If we didn't get three clusters, add the final ones
            if len(cls) < 3:
                gl_cl_counts = self.cl_counts.loc[curr_idces].sort_values(ascending=True)
                to_add = min(3-len(cls),len(gl_cl_counts))
                cls += list(gl_cl_counts.iloc[:to_add].index)

            # If still not enough, pad with -1000000
            if len(cls) < 3:
                cls += [-1000000] * (3 - len(cls))
            return cls
        X_clusters = X.apply(count_clusters)
        # From https://stackoverflow.com/questions/35491274/pandas-split-column-of-lists-into-multiple-columns
        X_clusters = pd.DataFrame(X_clusters.values.tolist(), index=X_clusters.index, 
                                  columns=['cluster1','cluster2','cluster3'])
        return X_clusters
    
class Debug(BaseEstimator, TransformerMixin):

    def transform(self, X):
        print(X.shape)
        print(X)
        # what other output you want
        return X

    def fit(self, X, y=None, **fit_params):
        return self

def build_model(words_mappings,tokenize,stopwords):
    # This pipeline incorporates the w2vClusters transformer to incorporate some
    # word meanings into the model
    s_len = Pipeline([
        ('sl_extract',SentLenExtractor()),
        ('sl_scale',MinMaxScaler())    
    ])

    s_punct = Pipeline([
        ('sp_extract',PunktCounter()),
        ('sp_scale',MinMaxScaler())   

    ])

    # Advanced pipeline
    pipeline_advanced3 = Pipeline([
        #('debug1',Debug()),
        ('feat', FeatureUnion(
            [('BoW',Pipeline(
                [('vec',CountVectorizer(tokenizer=lambda x: tokenize(x, stopwords))),
                ('tfidf',TfidfTransformer())])),
            #('sentlen',s_len),
            #('punkt',s_punct),
            ('cl_freqs',Pipeline([
                # I tried different n_clusters here. Between 20 and 30 works best
                # Too little, and everything ends up in one cluster
                # Too much, and every word in a tweet is in a different cluster.
                ('w2v',w2vClusters(n_clusters=27, n_jobs=3, words_mappings=words_mappings, tokenize=tokenize)),
                ('one_hot',OneHotEncoder(categories='auto',handle_unknown='ignore'))]))
            ])),
        #('debug2',Debug()),
        ('clf',MultiOutputClassifier(estimator=xgb.XGBClassifier(
            random_state=42,n_estimators=200,subsample=0.8,max_depth=4,
            learning_rate=0.1,colsample_bytree=0.4,scale_pos_weight=3)))
    ])
    pipeline_advanced3.fit()
    
    return pipeline_advanced3


def evaluate_model(model, X_test, Y_test, category_names):
    # Testing the efficiency
    Y_pred = model.predict(X_test)
    model_fscores = {}

    for ind, col in enumerate(list(Y_test.columns)):
        y_test = list(Y_test.iloc[:,ind])
        y_pred = list(Y_pred[:,ind])
        print(col)
        try:
            model_fscores[col] = f1_score(y_test, y_pred)
        except:
            model_fscores[col] = f1_score(y_test, y_pred, average='weighted')
        print('F1-score is {}'.format(advanced3_fscores[col]))  

        #print(classification_report(y_test,y_pred))


def save_model(model, model_filepath):
    joblib.dump(model,model_filepath)

def main():
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X_text, Y = load_data(database_filepath)
        X_train, X_test, Y_train, Y_test = train_test_split(X_text, Y, test_size=0.2, random_state=42)
        
        stopwords = nl_stopwords.words('english')
        #Make preparations for the word2Vec part
        try:
            #Load in word2vec
            w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./.word2vec/GoogleNews-vectors-negative300.bin',binary=True)
        except:
            raise ValueError('Word2Vec missing at ./.word2vec/GoogleNews-vectors-negative300.bin')
        
        #The goal is to create a mapping that maps a word to a word cluster
        #We'll later use the three most prominent cluster numbers as factors
        #and try to improve the model this way

        #First we need to get all the words into one array
        all_words = []
        i=0
        for sent in X_text:
            all_words += tokenize(sent)
            
        #Now we map words to vectors
        unique_words = set(all_words)
        unique_words.update('supercollider')
        unique_words = list(unique_words)
        
        #Allow words to not be found
        try_get_vector = get_vector_func(w2v_model)
        words_mappings = {word:vect for word,vect in zip(unique_words, map(try_get_vector,unique_words))}
        
        print('Building model...')
        model = build_model(words_mappings,tokenize,stopwords)
        
        print('Training model...')
        model.fit(X_train, Y_train)
        
        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


if __name__ == '__main__':
    main()