In [1]:
# importing necessary libraries

import pandas as pd
import numpy as np
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
sentence = """At eight o'clock on Thursday morning Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)
tokens

['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [4]:
#  load data fromm databse
engine=create_engine('sqlite:///InsertDatabaseName.db')
df=pd.read_sql_table("disaster_messages",con=engine)

In [5]:
df

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26210,30261,The training demonstrated how to enhance micro...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26211,30262,A suitable candidate has been selected and OCH...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26212,30263,"Proshika, operating in Cox's Bazar municipalit...",,news,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26213,30264,"Some 2,000 women protesting against the conduc...",,news,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
X=df['message']
y=df.iloc[:,4:]

In [7]:
X

0        Weather update - a cold front from Cuba that c...
1                  Is the Hurricane over or is it not over
2                          Looking for someone but no name
3        UN reports Leogane 80-90 destroyed. Only Hospi...
4        says: west side of Haiti, rest of the country ...
                               ...                        
26210    The training demonstrated how to enhance micro...
26211    A suitable candidate has been selected and OCH...
26212    Proshika, operating in Cox's Bazar municipalit...
26213    Some 2,000 women protesting against the conduc...
26214    A radical shift in thinking came about as a re...
Name: message, Length: 26215, dtype: object

In [8]:
y

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26212,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26213,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Function to process the text data

In [9]:
def tokenize(text):
    """
    Function to tokenize the text
    """
    tokens=word_tokenize(text)
    lemmatizer=WordNetLemmatizer()
    
    new_tokens=[]
    for tok in tokens:
        new_tok=lemmatizer.lemmatize(tok).lower().strip()
        new_tokens.append(new_tok)
    return new_tokens

### Building a ML pipeline

In [10]:
pipeline=Pipeline([
    ('vect',CountVectorizer(tokenizer=tokenize)),
    ('tfidf',TfidfTransformer()),
    ('clf',MultiOutputClassifier(RandomForestClassifier()))
])

### Train pipeline

In [11]:
X_train,X_test,y_train,y_test=train_test_split(X,y)

In [12]:
pipeline.fit(X_train,y_train)



### Testing the model

In [13]:
y_pred=pipeline.predict(X_test)

In [14]:
def test_model(y_test,y_pred):
    """
    Function to iternate through columns and call classification report on each
    """
    for index, column in enumerate(y_test):
        print(column, classification_report(y_test[column],y_pred[:,index]))

In [15]:
test_model(y_test,y_pred)

related               precision    recall  f1-score   support

           0       0.75      0.28      0.40      1534
           1       0.81      0.97      0.89      5020

    accuracy                           0.81      6554
   macro avg       0.78      0.62      0.65      6554
weighted avg       0.80      0.81      0.77      6554

request               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5395
           1       0.90      0.41      0.56      1159

    accuracy                           0.89      6554
   macro avg       0.89      0.70      0.75      6554
weighted avg       0.89      0.89      0.87      6554

offer               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6528
           1       0.00      0.00      0.00        26

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      0.99      655

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


water               precision    recall  f1-score   support

           0       0.95      1.00      0.98      6136
           1       0.89      0.29      0.44       418

    accuracy                           0.95      6554
   macro avg       0.92      0.64      0.71      6554
weighted avg       0.95      0.95      0.94      6554

food               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5850
           1       0.89      0.40      0.55       704

    accuracy                           0.93      6554
   macro avg       0.91      0.70      0.76      6554
weighted avg       0.93      0.93      0.92      6554

shelter               precision    recall  f1-score   support

           0       0.93      1.00      0.96      5942
           1       0.88      0.23      0.37       612

    accuracy                           0.93      6554
   macro avg       0.90      0.62      0.66      6554
weighted avg       0.92      0.93      0.91      6554



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Improving the model

In [16]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x7fedbd475510>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x7fedbd475510>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf_

In [17]:
parameters={
    'clf__estimator__n_estimators':[50,100]
}

In [18]:
cv=GridSearchCV(pipeline,param_grid=parameters)

In [19]:
cv

In [20]:
cv.fit(X_train,y_train)



In [24]:
gen=df.groupby('genre').count()
gen

Unnamed: 0_level_0,id,message,original,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
direct,10766,10766,10170,10766,10766,10766,10766,10766,10766,10766,...,10766,10766,10766,10766,10766,10766,10766,10766,10766,10766
news,13054,13054,0,13054,13054,13054,13054,13054,13054,13054,...,13054,13054,13054,13054,13054,13054,13054,13054,13054,13054
social,2395,2395,0,2395,2395,2395,2395,2395,2395,2395,...,2395,2395,2395,2395,2395,2395,2395,2395,2395,2395
