In [18]:
import pickle
import re
import sys
import time

import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sqlalchemy import create_engine
from xgboost import XGBClassifier

# Make sure every NLTK resource needed by tokenize() is installed. Each
# resource is probed on its own: checking only "tokenizers/punkt" would skip
# the downloads whenever punkt is present, even if a corpus is still missing.
_NLTK_RESOURCES = (
    ("tokenizers/punkt", "punkt"),        # used by word_tokenize
    ("corpora/stopwords", "stopwords"),   # used by stopwords.words("english")
    ("corpora/wordnet", "wordnet"),       # used by WordNetLemmatizer
)
for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# Seed passed to train_test_split so the train/test partition is reproducible.
random_state = 122

def load_data(database_filepath):
    """Read the disaster-message table and split it into features and labels.

    Parameters
    ----------
    database_filepath : str
        Name of the SQLite file; it is resolved relative to ``../data/``.

    Returns
    -------
    tuple
        ``(X, Y, category_names)`` where ``X`` is the ``message`` column,
        ``Y`` is the array of values of every column other than ``id``,
        ``message``, ``original`` and ``genre``, and ``category_names`` holds
        the labels of those remaining columns.
    """
    connection_url = f"sqlite:///../data/{database_filepath}"
    engine = create_engine(connection_url)
    messages = pd.read_sql_table("disaster_messages", engine)

    # Everything that is not message metadata is treated as a target column.
    metadata_columns = ["id", "message", "original", "genre"]
    targets = messages.drop(columns=metadata_columns)

    return messages["message"], targets.values, targets.columns


# Compiled once. A raw string keeps the escaped parentheses from being parsed
# as (invalid) string escape sequences.
_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
_STOP_WORDS_BY_LANGUAGE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a set, reading the corpus only once."""
    if "english" not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_BY_LANGUAGE["english"]


def tokenize(text):
    """Turn a raw message into a list of normalized word tokens.

    Steps: every URL is replaced by the literal ``urlplaceholder``, the text
    is lower-cased and split with ``word_tokenize``, tokens that are not
    purely alphanumeric or that are English stop words are dropped, and the
    remaining tokens are lemmatized and stripped of surrounding whitespace.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    stop_words = _english_stop_words()
    lemmatizer = WordNetLemmatizer()

    # Substitute each match in place. Replacing the matched strings one by one
    # with str.replace would mangle a URL that has an earlier URL as a prefix.
    text = _URL_PATTERN.sub("urlplaceholder", text)

    return [
        lemmatizer.lemmatize(word).strip()
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]


def evaluate_model(model, X_test, Y_test, category_names):
    """Print a classification report and the accuracy for every category.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Test messages handed to ``model.predict``.
    Y_test : array-like of shape (n_samples, n_categories)
        True labels; may be a NumPy array or a DataFrame.
    category_names : sequence
        One label per column of ``Y_test``.

    Returns
    -------
    None
        Results are only printed.
    """
    print("Evaluating the model on test data ....")

    # predict() receives the features only; the labels are used for scoring.
    Y_pred = np.asarray(model.predict(X_test))
    # np.asarray accepts both an ndarray and a DataFrame, so column slicing
    # below works for either input.
    Y_true = np.asarray(Y_test)

    # Report the metrics one category (column) at a time.
    for i, category in enumerate(category_names):
        y_true, y_pred = Y_true[:, i], Y_pred[:, i]
        print("Category:", category, "\n", classification_report(y_true, y_pred))
        print("Accuracy of %25s: %.2f" % (category, accuracy_score(y_true, y_pred)))

def build_model():
    """Assemble the text-classification pipeline wrapped in a grid search.

    The pipeline has two stages: a feature extractor (token counts produced
    with ``tokenize`` followed by a TF-IDF transform) and an ``XGBClassifier``
    wrapped in ``MultiOutputClassifier``.

    Returns
    -------
    GridSearchCV
        The unfitted search object over the pipeline and the grid below.
    """
    print(f"Using XGBclassifier ...")
    multi_output_classifier = MultiOutputClassifier(XGBClassifier())

    text_features = Pipeline([
        ("count_vect", CountVectorizer(tokenizer=tokenize)),
        ("tf_idf_transf", TfidfTransformer()),
    ])
    full_pipeline = Pipeline([
        ("feature_extractor", text_features),
        ("classifier", multi_output_classifier),
    ])

    print(f"Searching for best parameters...")

    # Keys address nested steps as "<step>__<sub-step>__<parameter>".
    search_space = {
        "feature_extractor__count_vect__ngram_range": ((1, 1), (1, 2)),
        "feature_extractor__count_vect__max_df": (0.5, 0.75, 1.0),
        "feature_extractor__tf_idf_transf__use_idf": (True, False),
        "classifier__estimator__learning_rate": [0.01, 0.001],
        "classifier__estimator__max_depth": [6, 8, 9],
    }

    return GridSearchCV(full_pipeline, param_grid=search_space)

def save_model(model, model_filepath):
    """Serialize ``model`` to ``model_filepath`` with the standard ``pickle`` module.

    Parameters
    ----------
    model : object
        Any picklable object, typically the fitted estimator.
    model_filepath : str or path-like
        Destination file; it is created or overwritten.

    Notes
    -----
    Reload the file with ``pickle.load``. Only unpickle files from a trusted
    source, because unpickling can execute arbitrary code.
    """
    print("Saving the model ... ")
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way through.
    with open(model_filepath, "wb") as model_file:
        pickle.dump(model, model_file)



## Running the model 

In [6]:
# Name of the SQLite database (load_data looks for it under ../data/) and the
# path the trained model is written to by save_model.
database_filepath, model_filepath = "disaster_response_detector.db", "XGB_model_V2.joblib.dat"


print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data(database_filepath)
# Hold out 20% of the samples for evaluation; the split is seeded with the
# module-level random_state.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=random_state)



Loading data...
    DATABASE: disaster_response_detector.db


In [None]:
# build_model returns the unfitted grid-search object.
print('Building model...')
model = build_model()

# Fit on the training split and time it. perf_counter returns seconds, so the
# difference is divided by 60 and the printed figure is in minutes.
print('Training model...')
st = time.perf_counter()
model.fit(X_train, Y_train)
end = time.perf_counter()
print(f" Total training time : {(end-st)/60 :.2f}")



Building model...
Using XGBclassifier ...
Searching for best parameters...
Training model...


In [None]:
# Print the per-category metrics for the held-out test split.
print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)

# Persist the fitted model object to model_filepath.
print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

print('Trained model saved!')