# ML Pipeline Preparation


# 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database 
- Define feature and target variables X and Y

In [1]:
import nltk
nltk.download(['punkt','wordnet','averaged_perceptron_tagger','stopwords'])

# import libraries
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import fbeta_score,classification_report,make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib
import re
import pandas as pd
import sqlite3
import numpy as np


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ziaeeamir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  from numpy.core.umath_tests import inner1d


In [2]:
# Load dataset from database.
# The connection is closed in `finally` so the SQLite file handle is released
# even when the table lookup or the query raises.
db = sqlite3.connect('../data/Database.db')
try:
    cursor = db.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    # Only the first table listed in sqlite_master is read.
    tables = cursor.fetchall()[0][0]
    # Quote the identifier (doubling embedded quotes) so table names containing
    # spaces or SQL keywords do not break the statement.
    df = pd.read_sql_query('SELECT * FROM "{}"'.format(tables.replace('"', '""')), db)
finally:
    db.close()


In [3]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Input feature: the message text.
X = df.loc[:, 'message']
# Targets: every column from position 4 onward, plus their names as a plain list.
category_names = df.columns[4:].tolist()
y = df.iloc[:, 4:]

In [5]:
print(X.iloc[1],'\n',y.iloc[1])

Is the Hurricane over or is it not over 
 related                   1
request                   0
offer                     0
aid_related               1
medical_help              0
medical_products          0
search_and_rescue         0
security                  0
military                  0
child_alone               0
water                     0
food                      0
shelter                   0
clothing                  0
money                     0
missing_people            0
refugees                  0
death                     0
other_aid                 1
infrastructure_related    0
transport                 0
buildings                 0
electricity               0
tools                     0
hospitals                 0
shops                     0
aid_centers               0
other_infrastructure      0
weather_related           1
floods                    0
storm                     1
fire                      0
earthquake                0
cold                      0
other_

# 2. The tokenization function to process the text data

In [6]:
def tokenize(text):
    """
    Split a text into normalized word tokens.

    Every non-alphanumeric character is replaced by a space, the text is
    word-tokenized, and each token is lemmatized as a noun, then as a verb,
    and finally stemmed with the Porter stemmer. Stop words are not removed.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    clean_tokens : list of str
        The processed tokens, in their original order.
    """
    # Remove punctuation (anything that is not a letter or a digit)
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

    # Tokenize text
    tokens = word_tokenize(text)

    # Build the lemmatizer and the stemmer once per call instead of
    # constructing a new stemmer for every single token.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    clean_tokens = []
    for tok in tokens:
        # Noun lemmatization first; its output feeds the verb lemmatization.
        clean_tok = lemmatizer.lemmatize(tok, pos='n').strip()
        clean_tok = lemmatizer.lemmatize(clean_tok, pos='v')
        # Stemming is applied last, on the lemmatized token.
        clean_tokens.append(stemmer.stem(clean_tok))

    return clean_tokens

In [7]:
tokenize(X.iloc[1])

['Is', 'the', 'hurrican', 'over', 'or', 'be', 'it', 'not', 'over']

In [8]:
X.iloc[1]

'Is the Hurricane over or is it not over'

### 2.1 The custom transformer that extracts the starting verb

In [9]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer producing one boolean per text: does any sentence open with a verb?"""

    def __init__(self):
        pass

    def starting_verb(self, text):
        """
        Return True when at least one sentence of `text` has a first token
        tagged 'VB' or 'VBP'; return False otherwise.
        """
        for sentence in sent_tokenize(text):
            # Tag the tokens produced by `tokenize` with their part of speech
            tagged = pos_tag(tokenize(sentence))

            # Sentences that yield no tokens cannot start with a verb
            if not tagged:
                continue

            _, leading_tag = tagged[0]
            if leading_tag in ('VB', 'VBP'):
                return True

        return False

    def fit(self, x, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Apply `starting_verb` to every entry of X and return a one-column DataFrame."""
        flags = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(flags)
        

### 2.2 The custom transformer that converts all text to lowercase

In [10]:
class CaseNormalizer(BaseEstimator, TransformerMixin):
    """Transformer that lower-cases every text entry it is given."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a NumPy array holding the lower-cased form of each entry of X."""
        as_series = pd.Series(X)
        lowered = as_series.apply(lambda message: message.lower())
        return lowered.values

# 3. The machine learning pipeline
This machine learning pipeline takes the `message` column as input and outputs classification results for the 36 categories in the dataset. The [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) is helpful for predicting multiple target variables.

In [11]:
def get_pipeline(clf=None):
    """
    Build the text-classification pipeline.

    The feature stage is a FeatureUnion of
      * a text pipeline: CaseNormalizer -> CountVectorizer(tokenizer=tokenize)
        -> TfidfTransformer, and
      * the StartingVerbExtractor flag.
    The features are fed to `clf` wrapped in a MultiOutputClassifier.

    Parameters
    ----------
    clf : estimator, optional
        Base classifier to wrap. When omitted, a fresh RandomForestClassifier()
        is created on each call. A default instance in the signature would be
        created once at definition time and shared by every call.

    Returns
    -------
    pipeline : sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    if clf is None:
        clf = RandomForestClassifier()

    text_pipeline = Pipeline([
        ('lowercase', CaseNormalizer()),
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    features = FeatureUnion([
        ('text-pipeline', text_pipeline),
        ('starting-verb', StartingVerbExtractor()),
    ])

    pipeline = Pipeline([
        ('features', features),
        ('clf', MultiOutputClassifier(clf)),
    ])
    return pipeline

In [12]:
# Build the pipeline with its default classifier and list the parameter names
# it exposes (names of this form are what a grid-search parameter grid refers to).
pipeline = get_pipeline()
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'features', 'clf', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__text-pipeline', 'features__starting-verb', 'features__text-pipeline__memory', 'features__text-pipeline__steps', 'features__text-pipeline__lowercase', 'features__text-pipeline__vect', 'features__text-pipeline__tfidf', 'features__text-pipeline__vect__analyzer', 'features__text-pipeline__vect__binary', 'features__text-pipeline__vect__decode_error', 'features__text-pipeline__vect__dtype', 'features__text-pipeline__vect__encoding', 'features__text-pipeline__vect__input', 'features__text-pipeline__vect__lowercase', 'features__text-pipeline__vect__max_df', 'features__text-pipeline__vect__max_features', 'features__text-pipeline__vect__min_df', 'features__text-pipeline__vect__ngram_range', 'features__text-pipeline__vect__preprocessor', 'features__text-pipeline__vect__stop_words', 'features__text-pipeline__vect__strip_accents', 'features__text-pipeline__v

# 4. To train pipeline
- Split data into train and test sets
- Train pipeline

In [13]:
# Fix the seed so the split, and every metric computed from it, is the same
# on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print('The training set shape {} and the grount truth shape {}'.format(X_train.shape, y_train.shape))
print('The testing set shape {} and the grount truth shape {}'.format(X_test.shape, y_test.shape))

The training set shape (19662,) and the grount truth shape (19662, 36)
The testing set shape (6554,) and the grount truth shape (6554, 36)


In [14]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text-pipeline', Pipeline(memory=None,
     steps=[('lowercase', CaseNormalizer()), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='conten...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))])

# 5. To test the model
Report the F-beta score for the whole model, and the precision and recall for each output category of the dataset, by iterating through the columns and calling sklearn's `classification_report` on each.

In [15]:
def get_fbeta_score(y_true, y_pred):

    """
    Compute the mean per-column F_beta score (beta=1, average='weighted').

    Each output column is scored separately and the column scores are then
    averaged. Columns whose score is exactly 1 are left out of the average,
    unless every column scored 1, in which case the result is 1.0.

    The score is computed from the `y_true` argument, never from a notebook
    global, so the function is also correct as a cross-validation scorer where
    `y_true` is the ground truth of the current validation fold.

    Parameters
    ----------
    y_true : Pandas Dataframe or 2-D array
        y true, one column per output category
    y_pred : Pandas Dataframe or 2-D array
        y predicted, same shape as y_true

    Returns
    -------
    fbeta_score : float
    """
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Score each output column against the matching column of y_true.
    # `beta` is passed by keyword: newer scikit-learn rejects it positionally.
    score_list = [
        fbeta_score(y_true[:, index], y_pred[:, index], beta=1, average='weighted')
        for index in range(y_true.shape[1])
    ]

    fb_score_numpy = np.asarray(score_list)
    # Drop columns with a perfect score before averaging.
    # NOTE(review): presumably meant to exclude degenerate single-class columns - confirm.
    imperfect_scores = fb_score_numpy[fb_score_numpy < 1]
    if imperfect_scores.size == 0:
        # Every column was perfect: average the unfiltered scores instead of
        # taking the mean of an empty array (which would be NaN).
        return np.mean(fb_score_numpy)

    fb_score = np.mean(imperfect_scores)

    return fb_score

In [16]:
# Predict all categories for the held-out test messages
y_pred = pipeline.predict(X_test)
# Element-wise match rate: mean per category column, then mean over the columns
overall_accuracy = (y_pred == y_test).mean().mean()
# Aggregate F-beta score across the categories
fb_score = get_fbeta_score(y_test, y_pred)

  'precision', 'predicted', average, warn_for)


In [17]:
print('Average overall accuracy {0:.2f}% \n'.format(overall_accuracy*100))
print('Fbeta score {0:.2f}%\n'.format(fb_score*100))

Average overall accuracy 94.17% 

Fbeta score 92.55%



In [18]:
# iterating through the columns and calling sklearn's classification_report on each.
# Label the prediction array with the target column names, then print one
# classification report per category.
y_pred_pd = pd.DataFrame(data=y_pred, columns=y_test.columns)
for category in y_pred_pd.columns:
    print('------------------------------------------------------\n')
    print('FEATURE: {}\n'.format(category))
    category_report = classification_report(y_test[category], y_pred_pd[category])
    print(category_report)

  'precision', 'predicted', average, warn_for)


------------------------------------------------------

FEATURE: related

             precision    recall  f1-score   support

          0       0.62      0.38      0.47      1533
          1       0.82      0.93      0.87      4969
          2       0.43      0.29      0.34        52

avg / total       0.77      0.79      0.77      6554

------------------------------------------------------

FEATURE: request

             precision    recall  f1-score   support

          0       0.89      0.98      0.93      5428
          1       0.84      0.40      0.54      1126

avg / total       0.88      0.88      0.87      6554

------------------------------------------------------

FEATURE: offer

             precision    recall  f1-score   support

          0       0.99      1.00      1.00      6518
          1       0.00      0.00      0.00        36

avg / total       0.99      0.99      0.99      6554

------------------------------------------------------

FEATURE: aid_related

    

# 6. To improve the model
Use grid search to find better parameters. 

In [19]:
def build_model(clf=None):
    """
    Grid-search the classification pipeline and return the fitted search object.

    The search is scored with `get_fbeta_score` and fitted on the notebook-level
    `X_train` / `y_train`.

    Parameters
    ----------
    clf : estimator, optional
        Base classifier handed to `get_pipeline`. When omitted, a fresh
        RandomForestClassifier() is created on each call. A default instance in
        the signature would be created once at definition time and shared by
        every call.

    Returns
    -------
    cv : GridSearchCV
        The fitted grid-search object.
    """
    if clf is None:
        clf = RandomForestClassifier()

    pipeline = get_pipeline(clf)

    # specify parameters for grid search
    parameters = {
        'clf__estimator__criterion': ['gini', 'entropy'],
        # Additional candidates, currently disabled:
        # 'clf__estimator__min_samples_split': [2, 4],
        # NOTE(review): the next entry was keyed as `criterion`, but its values
        # look like `max_features` options - confirm before enabling.
        # 'clf__estimator__max_features': ['log2', 'auto', 'sqrt', None],
        # 'features__text-pipeline__tfidf__use_idf': [True, False],
        # 'clf__estimator__max_depth': [None, 25, 50, 100, 150, 200],
    }

    # Wrap the custom metric so the grid search maximizes it
    make_score = make_scorer(get_fbeta_score, greater_is_better=True)

    # create grid search object
    cv = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring=make_score)
    cv.fit(X_train, y_train)

    return cv

In [20]:
model = build_model()

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# 7. To test the model

To show the accuracy, precision, and recall of the tuned model.  


In [21]:
# Predict all categories for the held-out test messages with the grid-searched model
y_pred = model.predict(X_test)
# Element-wise match rate: mean per category column, then mean over the columns
overall_accuracy = (y_pred == y_test).mean().mean()
# Aggregate F-beta score across the categories
fb_score = get_fbeta_score(y_test, y_pred)

  'precision', 'predicted', average, warn_for)


In [22]:
print('Average overall accuracy {0:.2f}% \n'.format(overall_accuracy*100))
print('Fbeta score {0:.2f}%\n'.format(fb_score*100))

Average overall accuracy 94.05% 

Fbeta score 92.26%



# 9. To Export the model as a pickle file

In [24]:
filename = 'classifier1.joblib'
# Use a context manager so the file handle is flushed and closed once the
# model has been written; a bare open() would leave it open.
with open('../models/' + filename, 'wb') as model_file:
    joblib.dump(model, model_file)