# ML Pipeline Preparation

## IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# DOWNLOAD NECESSARY NLTK DATA
# 'punkt' AND 'wordnet' SERVE word_tokenize AND WordNetLemmatizer;
# 'stopwords' IS READ BY stopwords.words AND 'averaged_perceptron_tagger'
# BY nltk.pos_tag, BOTH OF WHICH THE TOKENIZER BELOW RELIES ON.
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'])
from nltk.corpus import stopwords

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adrien\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adrien\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## LOADING DATABASE

In [2]:
# READ THE WHOLE DisasterMessages TABLE FROM THE LOCAL SQLITE FILE INTO A DATAFRAME

engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql(sql='SELECT * FROM DisasterMessages', con=engine)

In [3]:
# REPORT EVERY COLUMN OF DF THAT DOES NOT HOLD EXACTLY TWO DISTINCT VALUES.
# CATEGORY COLUMNS ARE EXPECTED TO CONTAIN ONLY 0 AND 1.
non_binary_columns = [name for name in df.columns if df[name].nunique() != 2]
for name in non_binary_columns:
    print("feature '{}' has more or less than 2 values.".format(name))

feature 'id' has more or less than 2 values.
feature 'message' has more or less than 2 values.
feature 'original' has more or less than 2 values.
feature 'genre' has more or less than 2 values.
feature 'related' has more or less than 2 values.
feature 'child_alone' has more or less than 2 values.


X takes 'message'.

y takes everything else except 'id', 'message', 'original', 'genre'.

'related' and 'child_alone' are category columns as well, but neither holds exactly two values, so their values need a closer look.

In [4]:
# HOW OFTEN EACH VALUE OCCURS IN THE 'related' COLUMN
df.related.value_counts()

1    19906
0     6122
2      188
Name: related, dtype: int64

Rows where 'related' has the value 2 will be dropped.

In [5]:
# HOW OFTEN EACH VALUE OCCURS IN THE 'child_alone' COLUMN
df.child_alone.value_counts()

0    26216
Name: child_alone, dtype: int64

'child_alone' adds nothing to our model, since 0 is its only value.

In [6]:
# CATEGORY (LABEL) COLUMNS ARE EVERYTHING AFTER THE FIRST FOUR COLUMNS
categories_disaster = df.columns[4:].tolist()

# KEEP ROWS WHOSE 'related' VALUE IS NOT 2 AND WHOSE LABEL COLUMNS DO NOT SUM TO ZERO
related_is_valid = df['related'] != 2
labels_not_all_zero = df[categories_disaster].sum(axis=1) != 0
df = df[related_is_valid & labels_not_all_zero]

# MODEL INPUT: THE MESSAGE TEXT
X = df['message']

# MODEL TARGETS: EVERY COLUMN EXCEPT THE FOUR DESCRIPTIVE ONES
y = df.drop(columns=['id', 'message', 'original', 'genre'])

## TOKENIZATION OF MESSAGES

The messages are written in both English and French, so stop words from both languages are removed during tokenization.

In [7]:
def get_wordnet_pos(word):
    """
    Return the WordNet part-of-speech constant to pass to lemmatize().

    The word is tagged on its own with nltk.pos_tag and the upper-cased first
    letter of the tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }

    # pos_tag returns a list of (word, tag) pairs; only one word was passed in
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN

In [8]:
# PATTERNS FOR URLS, E-MAIL ADDRESSES AND IP ADDRESSES.
# RAW STRINGS AVOID INVALID-ESCAPE WARNINGS; COMPILED ONCE BECAUSE tokenize
# IS CALLED FOR EVERY MESSAGE.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# '-' IS ESCAPED INSIDE THE CLASSES SO IT CANNOT FORM AN ACCIDENTAL RANGE,
# AND THE DOT BEFORE THE TOP-LEVEL DOMAIN IS A LITERAL DOT.
_EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9+_.\-]+@[0-9a-zA-Z][0-9a-zA-Z.\-]*\.[a-zA-Z]+')
_IP_PATTERN = re.compile(r'(?:\d{1,3})\.(?:\d{1,3})\.(?:\d{1,3})\.(?:\d{1,3})')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z]')

# FILLED ON FIRST USE BY _get_stopwords()
_STOPWORDS = None


def _get_stopwords():
    """Return the English + French stop words as a set, loading them only once."""
    global _STOPWORDS
    if _STOPWORDS is None:
        _STOPWORDS = frozenset(stopwords.words('english') + stopwords.words('french'))
    return _STOPWORDS


def tokenize(text):
    """
    Turn a raw message into a list of lemmatized tokens.

    Steps:
      1. lower-case the text;
      2. blank out URLs, e-mail addresses and IP addresses (this must happen
         BEFORE punctuation is stripped, otherwise the matched strings no
         longer exist in the text and nothing is removed);
      3. replace every character that is not an ASCII letter with a space;
      4. split into words, lemmatize each word using its part of speech,
         and drop English and French stop words.

    Args:
      text: message to tokenize (str)
    Returns:
      list of cleaned tokens (str)
    """
    text = text.lower()

    # REMOVING URLS FIRST (THEY MAY CONTAIN '@'), THEN MAILS, THEN IPS.
    # A TEXT MAY CONTAIN SEVERAL OF EACH; sub() REPLACES ALL OCCURRENCES.
    for pattern in (_URL_PATTERN, _EMAIL_PATTERN, _IP_PATTERN):
        text = pattern.sub(' ', text)

    # NORMALIZING TEXT: PUNCTUATION AND NUMBERS BECOME SPACES.
    # NOTE(review): accented letters are stripped here as well, which splits
    # French words such as "séisme" - confirm whether that is intended.
    text = _NON_LETTER_PATTERN.sub(' ', text)

    lemmatizer = WordNetLemmatizer()
    stop_words = _get_stopwords()

    clean_tokens = []
    for tok in word_tokenize(text):
        clean_tok = lemmatizer.lemmatize(tok, get_wordnet_pos(tok)).strip()

        # REMOVING STOP WORDS
        if clean_tok not in stop_words:
            clean_tokens.append(clean_tok)

    return clean_tokens

In [9]:
# SHOW THE FIRST TEN MESSAGES TOGETHER WITH THEIR TOKENIZED FORM
for message in X.head(10):
    message_tokens = tokenize(message)
    print(message)
    print(message_tokens, '\n')

Weather update - a cold front from Cuba that could pass over Haiti
['weather', 'update', 'cold', 'front', 'cuba', 'could', 'haiti'] 

Is the Hurricane over or is it not over
['hurricane'] 

Looking for someone but no name
['look', 'someone', 'name'] 

UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.
['report', 'leogane', 'destroyed', 'hospital', 'st', 'croix', 'function', 'need', 'supply', 'desperately'] 

says: west side of Haiti, rest of the country today and tonight
['say', 'west', 'side', 'haiti', 'rest', 'country', 'today', 'tonight'] 

Storm at sacred heart of jesus
['storm', 'sacred', 'heart', 'jesus'] 

Please, we need tents and water. We are in Silo, Thank you!
['please', 'need', 'tent', 'water', 'silo', 'thank'] 

I am in Croix-des-Bouquets. We have health issues. They ( workers ) are in Santo 15. ( an area in Croix-des-Bouquets )
['croix', 'bouquet', 'health', 'issue', 'worker', 'santo', 'area', 'croix', 'bouquet'] 

There'

## PIPELINE

In [10]:
# DEFINING MACHINE LEARNING PIPELINE
# BAG OF WORDS -> TF-IDF -> ONE RANDOM FOREST PER CATEGORY.
# random_state IS FIXED SO THAT RE-RUNNING THE CELL GIVES THE SAME FORESTS.

pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=RandomForestClassifier(random_state=42)))
    ])


# SPLITTING DATA; THE SEED MAKES THE TRAIN/TEST SPLIT (AND THE REPORTED SCORES) REPRODUCIBLE
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# TRAINING CLASSIFIER
pipeline.fit(X_train, y_train)

# PREDICTING ON TEST DATA
y_pred = pipeline.predict(X_test)

# DISPLAYING RESULTS: ONE REPORT PER CATEGORY, COLUMN i OF y_pred MATCHES y.columns[i]
for i, category in enumerate(y.columns):
    print("Feature: {}\n".format(category))
    print(classification_report(y_test[category], y_pred[:, i]))



Feature: related

              precision    recall  f1-score   support

           0       0.64      0.45      0.53      1551
           1       0.84      0.92      0.88      4956

    accuracy                           0.81      6507
   macro avg       0.74      0.68      0.70      6507
weighted avg       0.79      0.81      0.80      6507

Feature: request

              precision    recall  f1-score   support

           0       0.89      0.98      0.93      5390
           1       0.80      0.45      0.57      1117

    accuracy                           0.89      6507
   macro avg       0.85      0.71      0.75      6507
weighted avg       0.88      0.89      0.87      6507

Feature: offer

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6486
           1       0.00      0.00      0.00        21

    accuracy                           1.00      6507
   macro avg       0.50      0.50      0.50      6507
weighted avg       0.9

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6198
           1       0.78      0.27      0.40       309

    accuracy                           0.96      6507
   macro avg       0.87      0.63      0.69      6507
weighted avg       0.96      0.96      0.95      6507

Feature: other_aid

              precision    recall  f1-score   support

           0       0.88      0.99      0.93      5681
           1       0.55      0.07      0.12       826

    accuracy                           0.87      6507
   macro avg       0.72      0.53      0.52      6507
weighted avg       0.84      0.87      0.83      6507

Feature: infrastructure_related

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      6051
           1       0.18      0.01      0.02       456

    accuracy                           0.93      6507
   macro avg       0.56      0.50      0.49      6507
weighted avg       0.

In [78]:
# ONE COMBINED REPORT WITH A ROW PER CATEGORY
overall_report = classification_report(
    y_test,
    y_pred,
    target_names=categories_disaster,
)
print(overall_report)

                        precision    recall  f1-score   support

               related       0.85      0.92      0.88      5003
               request       0.79      0.46      0.58      1121
                 offer       0.00      0.00      0.00        25
           aid_related       0.75      0.61      0.67      2714
          medical_help       0.65      0.09      0.16       550
      medical_products       0.61      0.06      0.11       341
     search_and_rescue       0.50      0.02      0.03       166
              security       0.25      0.01      0.02       107
              military       0.64      0.07      0.12       209
                 water       0.90      0.38      0.54       432
                  food       0.84      0.53      0.65       720
               shelter       0.80      0.33      0.47       601
              clothing       0.55      0.06      0.12        93
                 money       0.45      0.03      0.06       150
        missing_people       0.00      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## IMPROVING MODEL

In [13]:
# APPLYING GRIDSEARCH TO OPTIMIZE MODEL

# DEFINING MACHINE LEARNING PIPELINE
# random_state IS FIXED SO THAT CANDIDATES DIFFER ONLY BY THE SEARCHED PARAMETERS

pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=RandomForestClassifier(random_state=42)))
    ])

# PARAMETER GRID; THE COMMENTED ENTRIES ARE FURTHER OPTIONS LEFT OUT TO KEEP THE SEARCH SHORT
parameters = {
        'clf__estimator__n_estimators': [50,100],
        #'clf__estimator__min_samples_split': [2, 3],
        #'clf__estimator__max_depth': [8, 9],
}

# SPLITTING DATA WITH A FIXED SEED SO THE SPLIT IS REPRODUCIBLE ACROSS RUNS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

cv = GridSearchCV(pipeline, param_grid=parameters, verbose=2, cv=2, n_jobs=1)

# TRAINING CLASSIFIER
cv.fit(X_train, y_train)

# PREDICTING ON TEST DATA (USES THE BEST ESTIMATOR FOUND BY THE SEARCH)
y_pred = cv.predict(X_test)

# DISPLAYING RESULTS: ONE REPORT PER CATEGORY, COLUMN i OF y_pred MATCHES y.columns[i]
for i, category in enumerate(y.columns):
    print("Feature: {}\n".format(category))
    print(classification_report(y_test[category], y_pred[:, i]))

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] clf__estimator__n_estimators=50 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................. clf__estimator__n_estimators=50, total= 7.2min
[CV] clf__estimator__n_estimators=50 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.2min remaining:    0.0s


KeyboardInterrupt: 

In [24]:
#FUNCTION TO GET IMPORTANCE OF FEATURES

def get_feature_importance(model, category_names, database_filepath):
    '''Collect the most important words per category and store them in the database.

    For every category, the feature importances of the matching per-category
    estimator are read and only the words whose importance is strictly greater
    than half of that category's maximum importance are kept. The result is
    written to the 'Words' table (replacing it if it exists) and the first
    rows are printed.

    Args:
      model: fitted search object exposing best_estimator_ (e.g. GridSearchCV);
             the best estimator must be a Pipeline with a 'vect' step and a
             multi-output classifier as its final step
      category_names: list of category names, in the same order as the
                      columns of Y used for training
      database_filepath: path of the SQLite database file to write to
    Returns:
      None
    '''
    # TAKE THE BEST ESTIMATOR FROM MODEL (FROM GRIDSEARCHCV)
    best_pipeline = model.best_estimator_

    # LIST VOCABULARY; get_feature_names() DOES NOT EXIST ON NEWER SCIKIT-LEARN
    vectorizer = best_pipeline.named_steps['vect']
    if hasattr(vectorizer, 'get_feature_names_out'):
        x_name = vectorizer.get_feature_names_out()
    else:
        x_name = vectorizer.get_feature_names()

    # THE CLASSIFIER IS THE FINAL STEP OF THE PIPELINE ('clf' IN THIS NOTEBOOK);
    # TAKING THE LAST STEP AVOIDS DEPENDING ON THE STEP NAME
    multi_output_clf = best_pipeline.steps[-1][1]

    # ONE (CATEGORY, IMPORTANCE, WORD) RECORD PER RETAINED FEATURE
    records = []

    # GET FEATURE IMPORTANCES FROM THE LEARNING MODEL AND FOR A SPECIFIC CATEGORY
    for j, col in enumerate(category_names):
        x_imp = multi_output_clf.estimators_[j].feature_importances_

        # NOTHING TO RANK WHEN THE VOCABULARY IS EMPTY
        if len(x_imp) == 0:
            continue

        # LIMIT FOR WEIGHT OF FEATURES SET TO HALF OF THE MAXIMUM WEIGHT
        value_max = x_imp.max() / 2.0

        # KEEP ONLY FEATURES ABOVE THE LIMIT - NO POINT TO DISPLAY ALL FEATURES
        for i, value in enumerate(x_imp):
            if value > value_max:
                records.append((col, float(value), str(x_name[i])))

    # PREPARING DATAFRAME DIRECTLY FROM THE RECORDS SO THAT IMPORTANCES STAY NUMERIC
    df_imp = pd.DataFrame(records, columns=['category_name', 'importance_value', 'important_word'])

    # IMPORTANCE VALUE SHOULD BE A FLOAT
    df_imp['importance_value'] = pd.to_numeric(df_imp['importance_value'], downcast='float')

    # CREATING SQL ENGINE
    engine = create_engine('sqlite:///' + database_filepath)

    # SAVING DATAFRAME INTO A TABLE, THEN READING IT BACK THROUGH THE SAME ENGINE
    df_imp.to_sql('Words', engine, if_exists='replace', index=False)
    df_imp = pd.read_sql("SELECT * FROM Words", engine)

    print('Sample feature importance...')
    print(df_imp.head())

KeyboardInterrupt: 