# ETL Pipelines

In [47]:
import time
import pandas as pd
import numpy as np
import random
from sqlalchemy import create_engine

In [48]:
dis_cat_file = 'Data/disaster_categories.csv'
dis_mes_file = 'Data/disaster_messages.csv'

In [49]:
def load_clean_data(file1, file2):
    dis_cat_df_raw = pd.read_csv(file1)
    dis_mes_df_raw = pd.read_csv(file2)
    
    
    #display('catgories information')
    #display(dis_cat_df_raw.info())
    #display(dis_cat_df_raw.head())

    #display('message information')
    #display(dis_mes_df_raw.info())
    #display(dis_mes_df_raw.head())
    
    df_merge_raw = dis_cat_df_raw.merge(dis_mes_df_raw, on='id')
    #display('merge data')
    #display(df_merge_raw.head())
    
    one_hot_cat = df_merge_raw.categories.str.split(';', expand=True)
    
    
    cat_columns = list(map(lambda x: x[:-2], one_hot_cat.iloc[:].values[0]))
    one_hot_cat.columns = cat_columns
    for column in cat_columns:
        one_hot_cat[column] = one_hot_cat[column].apply(lambda x:x[-1])
        
    df_merge_raw.drop(['categories'], axis=1, inplace=True)
    df_one_hot_ok = pd.concat([df_merge_raw, one_hot_cat], axis=1)
    #display('DataFrame has been loaded!')
    #display(df_one_hot_ok.info())
    #display(df_one_hot_ok.head())
    
    df_one_hot_ok = df_one_hot_ok.drop_duplicates() # only one step for cleaning data (drop duplicates)
    return df_one_hot_ok


def save_data(df, database_filename):
   
    engine = create_engine('sqlite:///'+database_filename)
    df.to_sql('dataset', engine, index=False)

In [50]:
def main():
    
    # if len(sys.argv) == 4:
    messages_filepath, categories_filepath, database_filepath = dis_cat_file, dis_mes_file, 'Data/CleanedDiseaseResponse.db'
    
    try:


        print('Loading and Cleaning data')
        for i in range(random.randrange(1,10,1)):
            time.sleep(0.4)
            print('.', end='')
            time.sleep(0.4)
            print('.', end='')
            time.sleep(0.4)
            print('.', end='')
        df_loaded = load_clean_data(categories_filepath, messages_filepath)
        print('\n')
        print('Data has been loaded and cleaned!!!!')
        print(df_loaded.info())
        print(df_loaded.head())

    except:
        raise ValueError("Load Data Error!")

    try:
        print('Saving data')
        for i in range(random.randrange(1,10,1)):
            time.sleep(0.4)
            print('.', end='')
            time.sleep(0.4)
            print('.', end='')
            time.sleep(0.4)
            print('.', end='')
        save_data(df_loaded, database_filepath)
        print('\n')
        print('Data has been Saved!!!!')
    except:
        raise ValueError("Save Data Error!")

# Machine Learning Pipelines

In [28]:
import sys
import pandas as pd
import numpy as np
import nltk


from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

from sqlalchemy import create_engine

import pickle

In [33]:
def load_data(database_filepath):
    
    engine = create_engine('sqlite:///'+database_filepath)
    df = pd.read_sql_table('dataset', con=engine)

    
    df = df.dropna() 

    X = df.loc[:, 'message']
    Y = df.iloc[:, 4:]
    categories = list(Y)

    return X.values, Y.values, categories

In [34]:

def tokenize(text):
    
    tokens = word_tokenize(text)
    wl = WordNetLemmatizer()
   
    tokens = [wl.lemmatize(t).lower().strip() for t in tokens]

    return tokens

def build_model(parameters={}):
    
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer = tokenize)),
                            ('tfidf', TfidfTransformer()),
                            ('classifier', MultiOutputClassifier(RandomForestClassifier(**parameters)))])
    return pipeline

def optimal_params(model, X_train, Y_train):
    
    parameters = {
        'classifier__estimator__n_estimators': [50, 100, 150],
        'classifier__estimator__max_features': ['sqrt',],
        'classifier__estimator__criterion': ['entropy', 'gini']
    }

    cv = GridSearchCV(model, param_grid = parameters, verbose=1)
    cv.fit(X_train, Y_train)

    return cv.best_params_

def evaluate_model(model, X_test, Y_test, category_names):
    
    predictions = model.predict(X_test)
    
   
    print("Accuracy scores for each category\n")
    print("*-" * 30)

    for i in range(36):
        print("Category:", category_names[i],"\n", classification_report(Y_test[:, i], predictions[:, i]))
        
def save_model(model, model_filepath):
    
    pickle.dump(model, open(model_filepath, "wb"))
    
def main():
    

    database_filepath, model_filepath = 'Data/CleanedDiseaseResponse.db', 'models/classifier.pkl'
    print('Loading data...\n    DATABASE: {}'.format(database_filepath))
    X, Y, category_names = load_data(database_filepath)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    print('Building model...')
    model = build_model()

    print('Grid search started')
    optimal_parameters = optimal_params(model, X_train, Y_train)
    random_forest_params = {
                'n_estimators': optimal_parameters['classifier__estimator__n_estimators'],
                'max_features': optimal_parameters['classifier__estimator__max_features'],
                'criterion': optimal_parameters['classifier__estimator__criterion'],
            }

    print("Optimal parameters")
    print(random_forest_params)

    print('Building random forest model with optimal parameters')
    model = build_model(random_forest_params)

    print('Training model...')
    model.fit(X_train, Y_train)

    print('Evaluating model...')
    evaluate_model(model, X_test, Y_test, category_names)

    print('Saving model...\n    MODEL: {}'.format(model_filepath))
    save_model(model, model_filepath)

    print('Trained model saved!')

main()

Loading data...
    DATABASE: Data/CleanedDiseaseResponse.db
Building model...
Grid search started
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Optimal parameters
{'n_estimators': 150, 'max_features': 'sqrt', 'criterion': 'gini'}
Building random forest model with optimal parameters
Training model...
Evaluating model...
Accuracy scores for each category

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
Category: related 
               precision    recall  f1-score   support

           0       0.72      0.57      0.64       651
           1       0.80      0.90      0.85      1355
           2       0.70      0.25      0.37        28

    accuracy                           0.78      2034
   macro avg       0.74      0.57      0.62      2034
weighted avg       0.78      0.78      0.77      2034

Category: request 
               precision    recall  f1-score   support

           0       0.83      0.93      0.88      1280
           1       0.86      0.68     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: medical_products 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1946
           1       1.00      0.02      0.04        88

    accuracy                           0.96      2034
   macro avg       0.98      0.51      0.51      2034
weighted avg       0.96      0.96      0.94      2034

Category: search_and_rescue 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      2000
           1       0.00      0.00      0.00        34

    accuracy                           0.98      2034
   macro avg       0.49      0.50      0.50      2034
weighted avg       0.97      0.98      0.97      2034

Category: security 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      2011
           1       0.00      0.00      0.00        23

    accuracy                           0.99      2034
   macro avg       0.49      0.50      0.50     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: water 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1867
           1       0.96      0.51      0.66       167

    accuracy                           0.96      2034
   macro avg       0.96      0.75      0.82      2034
weighted avg       0.96      0.96      0.95      2034

Category: food 
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      1719
           1       0.94      0.59      0.73       315

    accuracy                           0.93      2034
   macro avg       0.93      0.79      0.84      2034
weighted avg       0.93      0.93      0.92      2034

Category: shelter 
               precision    recall  f1-score   support

           0       0.92      1.00      0.96      1805
           1       0.90      0.33      0.49       229

    accuracy                           0.92      2034
   macro avg       0.91      0.66      0.72      2034
weighted avg       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: refugees 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1997
           1       0.00      0.00      0.00        37

    accuracy                           0.98      2034
   macro avg       0.49      0.50      0.50      2034
weighted avg       0.96      0.98      0.97      2034

Category: death 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1985
           1       1.00      0.02      0.04        49

    accuracy                           0.98      2034
   macro avg       0.99      0.51      0.51      2034
weighted avg       0.98      0.98      0.97      2034

Category: other_aid 
               precision    recall  f1-score   support

           0       0.85      1.00      0.92      1720
           1       0.40      0.01      0.01       314

    accuracy                           0.85      2034
   macro avg       0.62      0.50      0.46      2034
weighted avg 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: electricity 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      2019
           1       0.00      0.00      0.00        15

    accuracy                           0.99      2034
   macro avg       0.50      0.50      0.50      2034
weighted avg       0.99      0.99      0.99      2034

Category: tools 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2026
           1       0.00      0.00      0.00         8

    accuracy                           1.00      2034
   macro avg       0.50      0.50      0.50      2034
weighted avg       0.99      1.00      0.99      2034

Category: hospitals 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      2017
           1       0.00      0.00      0.00        17

    accuracy                           0.99      2034
   macro avg       0.50      0.50      0.50      2034
weighted a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: other_infrastructure 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1996
           1       0.00      0.00      0.00        38

    accuracy                           0.98      2034
   macro avg       0.49      0.50      0.50      2034
weighted avg       0.96      0.98      0.97      2034

Category: weather_related 
               precision    recall  f1-score   support

           0       0.92      0.99      0.95      1744
           1       0.88      0.50      0.64       290

    accuracy                           0.92      2034
   macro avg       0.90      0.75      0.80      2034
weighted avg       0.92      0.92      0.91      2034

Category: floods 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1980
           1       1.00      0.20      0.34        54

    accuracy                           0.98      2034
   macro avg       0.99      0.60      0.66     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category: cold 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      2020
           1       1.00      0.07      0.13        14

    accuracy                           0.99      2034
   macro avg       1.00      0.54      0.57      2034
weighted avg       0.99      0.99      0.99      2034

Category: other_weather 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1983
           1       1.00      0.02      0.04        51

    accuracy                           0.98      2034
   macro avg       0.99      0.51      0.51      2034
weighted avg       0.98      0.98      0.96      2034

Category: direct_report 
               precision    recall  f1-score   support

           0       0.82      0.93      0.87      1304
           1       0.84      0.64      0.73       730

    accuracy                           0.83      2034
   macro avg       0.83      0.79      0.80      2034
weigh