### ML Pipeline

In [10]:
# import libraries
from sqlalchemy import create_engine
import nltk
nltk.download(['punkt', 'wordnet'])

import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\divyam07\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\divyam07\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
# load data from database
engine = create_engine('sqlite:///ETL_Preparation.db')
df = pd.read_sql_table('ETL_Preparation', engine)
X = df.message
y = df[df.columns[4:]]
category_names = y.columns
category_names

Index(['('related',)', '('request',)', '('offer',)', '('aid_related',)',
       '('medical_help',)', '('medical_products',)', '('search_and_rescue',)',
       '('security',)', '('military',)', '('child_alone',)', '('water',)',
       '('food',)', '('shelter',)', '('clothing',)', '('money',)',
       '('missing_people',)', '('refugees',)', '('death',)', '('other_aid',)',
       '('infrastructure_related',)', '('transport',)', '('buildings',)',
       '('electricity',)', '('tools',)', '('hospitals',)', '('shops',)',
       '('aid_centers',)', '('other_infrastructure',)', '('weather_related',)',
       '('floods',)', '('storm',)', '('fire',)', '('earthquake',)',
       '('cold',)', '('other_weather',)', '('direct_report',)'],
      dtype='object')

In [13]:
category_names=category_names.str.replace('(',"").str.replace(')',"").str.replace("'","").str.replace(",","")

In [14]:
#  tokenization function to process your text data
def tokenize(text):
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [15]:
# Build a machine learning pipeline
# This machine pipeline should take in the message column as input and output classification results on the other 36 categories in the dataset.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

In [16]:
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                 MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                  

In [17]:
y_pred = pipeline.predict(X_test)

for i in range(36):
    print(y_test.columns[i], ':')
    print(classification_report(y_test.iloc[:,i], y_pred[:,i]), '...................................................')

('related',) :
              precision    recall  f1-score   support

           0       0.62      0.34      0.44      1873
           1       0.81      0.93      0.87      5934
           2       0.50      0.14      0.22        58

    accuracy                           0.79      7865
   macro avg       0.64      0.47      0.51      7865
weighted avg       0.76      0.79      0.76      7865
 ...................................................
('request',) :
              precision    recall  f1-score   support

           0       0.88      0.99      0.93      6533
           1       0.84      0.35      0.50      1332

    accuracy                           0.88      7865
   macro avg       0.86      0.67      0.71      7865
weighted avg       0.87      0.88      0.86      7865
 ...................................................
('offer',) :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7829
           1       0.00      0.00   

 ...................................................
('shops',) :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7837
           1       0.00      0.00      0.00        28

    accuracy                           1.00      7865
   macro avg       0.50      0.50      0.50      7865
weighted avg       0.99      1.00      0.99      7865
 ...................................................
('aid_centers',) :
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      7762
           1       0.00      0.00      0.00       103

    accuracy                           0.99      7865
   macro avg       0.49      0.50      0.50      7865
weighted avg       0.97      0.99      0.98      7865
 ...................................................
('other_infrastructure',) :
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      7524
           1       0

In [19]:
# Export your model as a pickle file
with open('classifier.pkl', 'wb') as file:
    pickle.dump(pipeline, file)