# Main

In [1]:
import pandas as pd
import numpy as np
import re
import sklearn
import sqlalchemy
from sqlalchemy import create_engine
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split,GridSearchCV
import xgboost
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download(['punkt','stopwords'])
import pickle
from collections import Counter
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Input CSV locations and the SQLite file used to persist the cleaned data.
# These are passed to load_data(), save_data() and load_from_db() in later cells.
messages_filepath = 'disaster_messages.csv'
categories_filepath = 'disaster_categories.csv'
database_filepath = 'DisasterResponse.db'

In [3]:
def load_data(messages_filepath, categories_filepath):
    """Read the two CSV files and join them on their shared ``id`` column.

    Parameters
    ----------
    messages_filepath : path of the messages CSV.
    categories_filepath : path of the categories CSV.

    Returns
    -------
    pandas.DataFrame
        Inner join of both files on ``id`` (pandas' default merge type).
    """
    message_frame = pd.read_csv(messages_filepath)
    category_frame = pd.read_csv(categories_filepath)
    return message_frame.merge(category_frame, on='id')

In [4]:
def clean_data(df):
    """Expand the packed ``categories`` column into one integer column per label.

    Each cell of ``df['categories']`` is a ``;``-separated list of
    ``<name>-<value>`` items (e.g. ``related-1;request-0``). The label names
    are taken from the first row and the values are converted to integers.

    The input frame is NOT modified, so the cell calling this function can be
    re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``categories`` column in the format above.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``categories``, followed by one integer column per
        label, with fully duplicated rows removed.

    NOTE(review): the evaluation output in this notebook shows a class ``2``
    for ``related``, so the raw values are not strictly 0/1. Confirm whether
    such rows should be remapped or dropped before training.
    """
    # One column per "<name>-<value>" item.
    categories = df['categories'].str.split(';', expand=True)

    # Column names come from the first row; split on the LAST '-' so the
    # parsing does not depend on the value being exactly one character long.
    first_row = categories.iloc[0]
    categories.columns = first_row.str.rsplit('-', n=1).str[0].tolist()

    # Keep only the value after the last '-' and convert it to an integer.
    categories = categories.apply(
        lambda column: column.str.rsplit('-', n=1).str[-1].astype('int')
    )

    # Build a new frame instead of dropping from the caller's frame in place.
    cleaned = pd.concat([df.drop(columns='categories'), categories], axis=1)
    return cleaned.drop_duplicates()

In [5]:
def save_data(df, database_filepath):
    """Write ``df`` to the ``DisasterResponse`` table of a SQLite database.

    An existing table with that name is replaced; the frame's index is not
    written.

    Parameters
    ----------
    df : pandas.DataFrame to persist.
    database_filepath : path of the SQLite database file.
    """
    connection_url = f"sqlite:///{database_filepath}"
    db_engine = create_engine(connection_url)
    df.to_sql(name="DisasterResponse", con=db_engine, if_exists="replace", index=False)

    print(f"Data successfully saved to {database_filepath}")

In [6]:
def load_from_db(database_filepath):
    """Load the ``DisasterResponse`` table and split it into features and labels.

    Parameters
    ----------
    database_filepath : path of the SQLite database file.

    Returns
    -------
    tuple
        ``(X, y, category_names)`` where ``X`` is the ``message`` column,
        ``y`` is every column except ``id``, ``message``, ``original`` and
        ``genre``, and ``category_names`` is the column index of ``y``.
    """
    db_engine = create_engine(f"sqlite:///{database_filepath}")
    table = pd.read_sql_table("DisasterResponse", db_engine)

    non_label_columns = ['id', 'message', 'original', 'genre']
    labels = table.drop(columns=non_label_columns)

    return table['message'], labels, labels.columns

In [7]:
df = load_data(messages_filepath, categories_filepath)

In [8]:
df.head(10)

Unnamed: 0,id,message,original,genre,categories
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,related-1;request-0;offer-0;aid_related-0;medi...
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,related-1;request-0;offer-0;aid_related-1;medi...
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,related-1;request-0;offer-0;aid_related-0;medi...
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,related-1;request-0;offer-0;aid_related-0;medi...
5,14,Information about the National Palace-,Informtion au nivaux palais nationl,direct,related-0;request-0;offer-0;aid_related-0;medi...
6,15,Storm at sacred heart of jesus,Cyclone Coeur sacr de jesus,direct,related-1;request-0;offer-0;aid_related-0;medi...
7,16,"Please, we need tents and water. We are in Sil...",Tanpri nou bezwen tant avek dlo nou zon silo m...,direct,related-1;request-1;offer-0;aid_related-1;medi...
8,17,"I would like to receive the messages, thank you",Mwen ta renmen jouin messag yo. Merci,direct,related-0;request-0;offer-0;aid_related-0;medi...
9,18,I am in Croix-des-Bouquets. We have health iss...,"Nou kwadebouke, nou gen pwoblem sant m yo nan ...",direct,related-1;request-1;offer-0;aid_related-1;medi...


In [9]:
clean_df = clean_data(df)

In [10]:
clean_df.head(10)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,14,Information about the National Palace-,Informtion au nivaux palais nationl,direct,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,15,Storm at sacred heart of jesus,Cyclone Coeur sacr de jesus,direct,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
7,16,"Please, we need tents and water. We are in Sil...",Tanpri nou bezwen tant avek dlo nou zon silo m...,direct,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
8,17,"I would like to receive the messages, thank you",Mwen ta renmen jouin messag yo. Merci,direct,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,18,I am in Croix-des-Bouquets. We have health iss...,"Nou kwadebouke, nou gen pwoblem sant m yo nan ...",direct,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1


In [11]:
#Saving clean data to database for future usage
save_data(clean_df, database_filepath)

Data successfully saved to DisasterResponse.db


In [12]:
#Importing saved clean data
X,y,category_columns = load_from_db(database_filepath)

In [13]:
X

0        Weather update - a cold front from Cuba that c...
1                  Is the Hurricane over or is it not over
2                          Looking for someone but no name
3        UN reports Leogane 80-90 destroyed. Only Hospi...
4        says: west side of Haiti, rest of the country ...
                               ...                        
26211    The training demonstrated how to enhance micro...
26212    A suitable candidate has been selected and OCH...
26213    Proshika, operating in Cox's Bazar municipalit...
26214    Some 2,000 women protesting against the conduc...
26215    A radical shift in thinking came about as a re...
Name: message, Length: 26216, dtype: object

In [14]:
y

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26213,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26214,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
category_columns

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [16]:
#Tokenization function for CountVectorizer
def series_tokenizer(pd_series):
    """Turn one message into a list of stemmed, stop-word-free tokens.

    Despite its name, ``pd_series`` is a single string: the function calls
    ``.lower()`` on it directly.

    Steps: lowercase, replace every non-letter with a space, tokenize with
    NLTK's ``word_tokenize``, drop English stop words, stem the rest with the
    module-level ``stemmer``.

    Parameters
    ----------
    pd_series : str
        The raw message text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # Load the stop words once and keep them as a frozenset on the function:
    # the original re-evaluated stopwords.words('english') for every token and
    # tested membership against a list.
    stop_words = getattr(series_tokenizer, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        series_tokenizer._stop_words = stop_words

    tokens = word_tokenize(re.sub(r'[^a-zA-Z]', ' ', pd_series.lower()))
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

In [17]:
#Normalization function for Word Counter
def word_normalize(text):
    """Normalize every message in ``text`` to a space-joined string of stems.

    Each message is lowercased, non-letters are replaced by spaces, the result
    is tokenized with ``word_tokenize``, English stop words are removed and the
    remaining tokens are stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : iterable of str
        The messages to normalize (e.g. a pandas Series of strings).

    Returns
    -------
    list of str
        One normalized string per input message, in input order.
    """
    # Fetch the stop words once per call; the original re-evaluated
    # stopwords.words('english') for every single token.
    stop_words = frozenset(stopwords.words('english'))

    normalized = []
    for message in text:
        letters_only = re.sub(r'[^a-zA-Z]', " ", message.lower())
        stems = [
            stemmer.stem(token)
            for token in word_tokenize(letters_only)
            if token not in stop_words
        ]
        normalized.append(" ".join(stems))

    return normalized

In [18]:
# Module-level text-processing objects.
# `stemmer` is read as a global by series_tokenizer() and word_normalize(),
# so this cell must run before either of them is called.
# `tfidf` and `vect` are standalone instances; build_model() constructs its own.
tfidf = TfidfTransformer()
stemmer = PorterStemmer()
vect = CountVectorizer(tokenizer = series_tokenizer)

In [19]:
#Analyzing frequency of words to create categories as a new feature
def word_counter(text, top_n=None):
    """Print the stemmed words of ``text`` with their counts, most frequent first.

    Parameters
    ----------
    text : iterable of str
        Messages to analyse; they are normalized with ``word_normalize``.
    top_n : int or None, optional
        Print only the ``top_n`` most frequent words. The default ``None``
        prints every word, which is the original behaviour.

    Returns
    -------
    None
        The result is printed as ``word: count`` lines.
    """
    joined_text = ' '.join(word_normalize(text))
    word_counts = Counter(joined_text.split())

    # most_common() sorts by count in descending order.
    for word, count in word_counts.most_common(top_n):
        print(f'{word}: {count}')

In [20]:
word_counter(X)

water: 3046
peopl: 3017
food: 2904
help: 2858
need: 2755
pleas: 2065
earthquak: 1923
area: 1667
like: 1635
us: 1498
would: 1491
flood: 1439
said: 1351
http: 1258
countri: 1251
thank: 1175
know: 1119
govern: 1116
also: 1114
hous: 1063
rain: 1060
haiti: 1044
inform: 1043
one: 1022
work: 1000
live: 995
find: 971
year: 958
sandi: 931
provid: 928
affect: 886
tent: 885
go: 882
get: 870
includ: 869
aid: 864
famili: 854
region: 852
nation: 847
suppli: 844
good: 840
relief: 813
commun: 790
day: 785
two: 774
health: 753
report: 729
distribut: 709
messag: 697
well: 697
mani: 693
villag: 692
school: 691
give: 681
use: 667
children: 667
caus: 663
state: 657
come: 656
want: 648
emerg: 648
support: 648
co: 644
assist: 636
power: 636
sever: 635
time: 635
shelter: 632
damag: 631
new: 631
million: 628
provinc: 627
home: 624
sinc: 623
local: 616
disast: 611
make: 606
intern: 604
citi: 594
continu: 589
unit: 583
victim: 582
district: 582
road: 574
hit: 562
storm: 559
take: 558
send: 554
receiv: 551
last: 

Keywords for a new category feature are chosen from the word counts above, based on both usage frequency and relevance to disaster types (see `category_list` in the `Category` class).

In [21]:
class Category(BaseEstimator, TransformerMixin):
    """Transformer producing one 0/1 keyword-indicator column per message.

    For every keyword in ``category_list`` the output column is 1 when the
    keyword occurs in the normalized message (see ``word_normalize``) and 0
    otherwise. Matching is a plain substring test on the normalized text, so
    a keyword also matches when it appears inside a longer stem.
    """

    def __init__(self):
        # Stemmed keywords used as indicator features.
        self.category_list = ['water','food','earthquak','flood','rain','tent','aid','storm','diseas','hurrican'
                           ,'medic','river','tsunami','drought','cyclon','fire','wind','snow','ebola','malaria'
                           ,'mosquito','hurricanesandi']
        # Empty frame with one column per keyword. define_category() no longer
        # needs it; the attribute is kept so existing attribute access still works.
        self.df = pd.DataFrame({name:[] for name in self.category_list})

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def define_category(self, text):
        """Build the keyword-indicator frame for ``text``.

        Parameters
        ----------
        text : iterable of str
            Raw messages.

        Returns
        -------
        pandas.DataFrame
            Integer frame with one row per message and one column per entry
            of ``category_list`` (in that order), holding 0 or 1.
        """
        normalized = pd.Series(word_normalize(text), dtype=object)

        # Vectorised substring test per keyword. The original wrote each cell
        # through chained indexing (frame[col].iloc[row] = ...), which pandas
        # warns about and which does not update the frame under Copy-on-Write.
        flags = {
            category: normalized.str.contains(category, regex=False).astype('int')
            for category in self.category_list
        }
        return pd.DataFrame(flags, columns=self.category_list, index=normalized.index)

    def transform(self, X):
        """Return the keyword indicators for ``X`` as a SciPy CSR matrix."""
        # Sparse output so it can be combined with the text pipeline's results.
        return csr_matrix(self.define_category(X))


In [22]:
def build_model():
    """Assemble the classification pipeline wrapped in a grid search.

    Features are the union of a bag-of-words/TF-IDF text pipeline and the
    keyword indicators from ``Category``. The classifier is an
    ``XGBClassifier`` wrapped in ``MultiOutputClassifier``.

    Returns
    -------
    GridSearchCV
        Unfitted search over learning rate, number of estimators, tree depth
        and base score of the XGBoost estimator.
    """
    text_features = Pipeline([
        ('vect', CountVectorizer(tokenizer=series_tokenizer)),
        ('tfidf', TfidfTransformer()),
    ])

    combined_features = FeatureUnion([
        ('text_pipeline', text_features),
        # New column processing class for category
        ('category', Category()),
    ])

    full_pipeline = Pipeline([
        ('features', combined_features),
        ('clf', MultiOutputClassifier(XGBClassifier())),
    ])

    search_space = {
        'clf__estimator__learning_rate': [0.01, 0.1],
        'clf__estimator__n_estimators': [100, 250],
        'clf__estimator__max_depth': [3, 5],
        'clf__estimator__base_score': [0.5, 0.75],
    }

    return GridSearchCV(full_pipeline, search_space)

In [23]:
def evaluate_model(y_train, X_pred,y_test, y_pred):
    """Print per-label classification reports and accuracies for train and test.

    Parameters
    ----------
    y_train : pandas.DataFrame
        True training labels, one column per label.
    X_pred : 2-D array
        Predictions for the training set (despite the name), with columns in
        the same order as ``y_train``.
    y_test : pandas.DataFrame
        True test labels, one column per label.
    y_pred : 2-D array
        Predictions for the test set, with columns in the same order as
        ``y_test``.

    Output order: all training classification reports, all training
    accuracies, then the same two groups for the test set.
    """

    def _score_columns(truth_frame, predictions):
        # Score each label column against the prediction column at the same position.
        reports = {}
        accuracies = {}
        for position, label in enumerate(truth_frame.columns):
            actual = truth_frame.iloc[:, position]
            predicted = predictions[:, position]
            reports[label] = classification_report(actual, predicted, zero_division=1)
            accuracies[label] = accuracy_score(actual, predicted)
        return reports, accuracies

    for truth_frame, predictions in ((y_train, X_pred), (y_test, y_pred)):
        reports, accuracies = _score_columns(truth_frame, predictions)

        for label, report_text in reports.items():
            print(f"Classification Report for {label}:\n{report_text}\n")

        for label, accuracy in accuracies.items():
            print(f"Accuracy Report for {label}:\n{accuracy}\n")

In [24]:
def save_model(model, model_filepath='model.pkl'):
    """Pickle ``model`` to ``model_filepath``.

    Parameters
    ----------
    model : object
        Any picklable object (here, the fitted grid search).
    model_filepath : str, optional
        Destination file. Defaults to ``'model.pkl'`` in the current working
        directory, which was previously the only possible location.
    """
    with open(model_filepath, 'wb') as model_file:
        pickle.dump(model, model_file)

In [25]:
def main():
    """Train, evaluate and save the classifier.

    Loads the cleaned data from ``database_filepath``, holds out 25% as a test
    set, fits the grid search from ``build_model``, prints the fit time and
    best parameters, prints per-label metrics for both splits and pickles the
    fitted search object with ``save_model``.
    """
    X, y, _ = load_from_db(database_filepath)
    # Fixed seed so the split, and therefore the reported scores, is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    print('Building model...')
    model = build_model()

    start_time = time.time()

    print('Training model...')
    model.fit(X_train, y_train)

    elapsed_seconds = time.time() - start_time
    fit_time_minutes, _ = divmod(elapsed_seconds, 60)
    fit_time_hours, fit_time_minutes = divmod(fit_time_minutes, 60)
    print(f'Fit time: {int(fit_time_hours)} hours and {int(fit_time_minutes)} minutes')

    best_params = model.best_params_
    print(f"Best parameters: {best_params}")

    # GridSearchCV refits the best candidate by default and predict() uses that
    # refit estimator, so the former set_params() call on the unfitted
    # template (model.estimator) had no effect and was removed.
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    print('Evaluating model...')
    evaluate_model(y_train, train_predictions, y_test, test_predictions)

    print('Saving model...')
    save_model(model)

    print('Trained model saved!')

In [26]:
main()

Building model...
Training model...
Fit time: 9 hours and 7 minutes
Best parameters: {'clf__estimator__base_score': 0.5, 'clf__estimator__learning_rate': 0.1, 'clf__estimator__max_depth': 5, 'clf__estimator__n_estimators': 250}
Evaluating model...
Classification Report for related:
              precision    recall  f1-score   support

           0       0.84      0.43      0.57      4654
           1       0.84      0.97      0.90     14867
           2       0.99      0.47      0.63       141

    accuracy                           0.84     19662
   macro avg       0.89      0.63      0.70     19662
weighted avg       0.84      0.84      0.82     19662


Classification Report for request:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     16320
           1       0.91      0.61      0.73      3342

    accuracy                           0.92     19662
   macro avg       0.92      0.80      0.84     19662
weighted avg       0.92     