# Main

In [2]:
import pandas as pd
import numpy as np
import re
import sklearn
import sqlalchemy
from sqlalchemy import create_engine
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split,GridSearchCV
#import xgboost
#from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from IPython.display import display
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
messages_filepath = 'C:/DisasterResponse/disaster_messages.csv'
categories_filepath = 'C:/DisasterResponse/disaster_categories.csv'
database_filepath = 'C:/DisasterResponse/DisasterResponse.db'

In [16]:
def load_data(messages_filepath, categories_filepath):
    
    messages = pd.read_csv(messages_filepath)
    categories = pd.read_csv(categories_filepath)
    df = pd.merge(messages,categories,on = 'id')
    
    return df

In [17]:
def clean_data(df):
    
    #Splitting values in categories column
    categories = df['categories'].str.split(';',expand=True)
    
    #Taking the first row to get column names
    row = categories.iloc[0]  
    category_colnames = row.apply(lambda x: x[:-2])
    categories.columns = category_colnames
    
    for column in categories:
        # setting each value to be the last character of the string
        categories[column] = categories[column].str[-1]
    
        # convert column from string to numeric
        categories[column] = categories[column].astype('int')
        
    #Dropping original categories column and concatting new columns to df
    df.drop('categories', inplace=True, axis=1)
    df = pd.concat([df,categories], axis = 1)
    df.drop_duplicates(inplace=True)
    
    return df

In [18]:
def save_data(df, database_filepath):

    engine = create_engine(f"sqlite:///{database_filepath}")
    df.to_sql("DisasterResponse", engine, if_exists="replace", index=False)
    
    print(f"Data successfully saved to {database_filepath}")

In [19]:
def load_from_db(database_filepath):

    engine = create_engine(f"sqlite:///{database_filepath}")
    df = pd.read_sql_table("DisasterResponse",engine)
    
    X = df['message']
    y = df.drop(['id','message','original','genre'], axis=1)
    category_names = y.columns

    return X,y, category_names

In [20]:
df = load_data(messages_filepath, categories_filepath)

In [21]:
df.head(10)

Unnamed: 0,id,message,original,genre,categories
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,related-1;request-0;offer-0;aid_related-0;medi...
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,related-1;request-0;offer-0;aid_related-1;medi...
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,related-1;request-0;offer-0;aid_related-0;medi...
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,related-1;request-0;offer-0;aid_related-0;medi...
5,14,Information about the National Palace-,Informtion au nivaux palais nationl,direct,related-0;request-0;offer-0;aid_related-0;medi...
6,15,Storm at sacred heart of jesus,Cyclone Coeur sacr de jesus,direct,related-1;request-0;offer-0;aid_related-0;medi...
7,16,"Please, we need tents and water. We are in Sil...",Tanpri nou bezwen tant avek dlo nou zon silo m...,direct,related-1;request-1;offer-0;aid_related-1;medi...
8,17,"I would like to receive the messages, thank you",Mwen ta renmen jouin messag yo. Merci,direct,related-0;request-0;offer-0;aid_related-0;medi...
9,18,I am in Croix-des-Bouquets. We have health iss...,"Nou kwadebouke, nou gen pwoblem sant m yo nan ...",direct,related-1;request-1;offer-0;aid_related-1;medi...


In [22]:
clean_df = clean_data(df)

In [23]:
clean_df.head(10)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,14,Information about the National Palace-,Informtion au nivaux palais nationl,direct,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,15,Storm at sacred heart of jesus,Cyclone Coeur sacr de jesus,direct,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
7,16,"Please, we need tents and water. We are in Sil...",Tanpri nou bezwen tant avek dlo nou zon silo m...,direct,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
8,17,"I would like to receive the messages, thank you",Mwen ta renmen jouin messag yo. Merci,direct,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,18,I am in Croix-des-Bouquets. We have health iss...,"Nou kwadebouke, nou gen pwoblem sant m yo nan ...",direct,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1


In [24]:
#Saving clean data to database for future usage
save_data(clean_df, database_filepath)

Data successfully saved to C:/DisasterResponse/DisasterResponse.db


In [25]:
#Importing saved clean data
X,y,category_columns = load_from_db(database_filepath)

In [26]:
X

0        Weather update - a cold front from Cuba that c...
1                  Is the Hurricane over or is it not over
2                          Looking for someone but no name
3        UN reports Leogane 80-90 destroyed. Only Hospi...
4        says: west side of Haiti, rest of the country ...
                               ...                        
26211    The training demonstrated how to enhance micro...
26212    A suitable candidate has been selected and OCH...
26213    Proshika, operating in Cox's Bazar municipalit...
26214    Some 2,000 women protesting against the conduc...
26215    A radical shift in thinking came about as a re...
Name: message, Length: 26216, dtype: object

In [27]:
y

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26213,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26214,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
category_columns

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [29]:
#Tokenization function for CountVectorizer
def series_tokenizer(pd_series): 
    wt = word_tokenize(re.sub(r'[^a-zA-Z]',' ',pd_series.lower()))
    stemmed_series = [stemmer.stem(i) for i in wt if i not in stopwords.words('english')]
    return stemmed_series

In [30]:
vect = CountVectorizer(tokenizer = series_tokenizer)

In [36]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown = 'ignore')

In [31]:
#Normalization function for Word Counter
def word_normalize(text):
    reg = [re.sub(r'[^a-zA-Z]', " ", z.lower()) for z in text]
    token = [word_tokenize(i) for i in reg]
    stem = [[stemmer.stem(i) for i in x if i not in stopwords.words('english')] for x in token]
    final = [" ".join(i) for i in stem]
    
    return final

In [32]:
tfidf = TfidfTransformer()
stemmer = PorterStemmer()

In [42]:
class Category(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.category_list = []
    
    def word_counter(self, text):
        sentences = word_normalize(text)
        joined_text = ' '.join(sentences)
        tokenized_words = joined_text.split()
        word_counts = Counter(tokenized_words)
        
        sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
        for word, count in sorted_word_counts:
            print(f'{word}: {count}')

    def define_category(self, text):
        for category in self.category_list:
            if category in text:
                return category
        return 'other'
        
    def fit(self, X, y=None):
        self.category_list = ['water','food','earthquak','flood','rain','tent','aid','storm','diseas','hurrican'
                           ,'medic','river','tsunami','drought','cyclon','fire','wind','snow','ebola','malaria'
                           ,'mosquito','hurricanesandi']
        return self
    
    def transform(self, X):
        X = X.apply(series_tokenizer)
        categories = X.apply(self.define_category)
        df = pd.DataFrame({'category':categories})
        #final = encoder.fit_transform(categories)
        return df

In [34]:

def build_model2():
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=series_tokenizer)),
                ('tfidf', TfidfTransformer())
            ])),
            ('category', Category())  # Use your Category transformer here
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    
    param_grid = {
        'clf__estimator__n_estimators': [100, 300]
# #         ,
# #         'clf__estimator__criterion': ['gini', 'entropy'],
# #         'clf__estimator__max_depth': [10, 30],
# #         'clf__estimator__min_samples_split': [2, 7],
    }
    
    cv = GridSearchCV(pipeline, param_grid)
    return cv


# Function Test

In [37]:
X1 = X[:250]

In [38]:
y1 = y[:260]

In [39]:
X1

0      Weather update - a cold front from Cuba that c...
1                Is the Hurricane over or is it not over
2                        Looking for someone but no name
3      UN reports Leogane 80-90 destroyed. Only Hospi...
4      says: west side of Haiti, rest of the country ...
                             ...                        
245               survivors at lalue near olympic market
246    Us, the people of Matye 3, in the Grand rivye ...
247    I want to make a call, I can't, there is a cro...
248    People don't have no food neither, they would ...
249    their is someone that left their home and neve...
Name: message, Length: 250, dtype: object

In [40]:
y1

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
256,1,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
257,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
258,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
category = Category()

In [44]:
ct = category.fit_transform(X1)

In [45]:
ct

Unnamed: 0,category
0,other
1,hurrican
2,other
3,other
4,other
...,...
245,other
246,other
247,other
248,food


In [46]:
encoded = enc.fit_transform(ct)

In [47]:
encoded

<250x10 sparse matrix of type '<class 'numpy.float64'>'
	with 250 stored elements in Compressed Sparse Row format>

In [48]:
pd.DataFrame(encoded.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
245,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
246,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
247,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
248,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
