# ETL Pipelines

In [47]:
import time
import pandas as pd
import numpy as np
import random
from sqlalchemy import create_engine

In [48]:
# Input CSV paths for the ETL step (relative to the working directory).
dis_cat_file = 'Data/disaster_categories.csv'  # categories CSV
dis_mes_file = 'Data/disaster_messages.csv'  # messages CSV

In [49]:
def load_clean_data(file1, file2):
    """Load two CSVs, merge them on ``id`` and expand the category column.

    Args:
        file1: Path to the first CSV (read as the categories frame).
        file2: Path to the second CSV (read as the messages frame).
            After the merge the combined frame must contain an ``id`` key and
            a ``categories`` column of ``;``-separated tokens whose last
            character is a digit flag (the name is the token minus its last
            two characters).

    Returns:
        pandas.DataFrame: the merged frame with ``categories`` replaced by one
        integer column per category and exact duplicate rows removed.
    """
    dis_cat_df_raw = pd.read_csv(file1)
    dis_mes_df_raw = pd.read_csv(file2)

    df_merge_raw = dis_cat_df_raw.merge(dis_mes_df_raw, on='id')

    # One column per ';'-separated token.
    one_hot_cat = df_merge_raw['categories'].str.split(';', expand=True)

    # Column names come from the first row: drop the trailing "-<digit>".
    cat_columns = one_hot_cat.iloc[0].str[:-2].tolist()
    one_hot_cat.columns = cat_columns

    for column in cat_columns:
        # Keep only the trailing flag and store it as a number, not as text.
        one_hot_cat[column] = one_hot_cat[column].str[-1].astype(int)

    # Build a new frame instead of mutating the merged one in place.
    df_one_hot_ok = pd.concat(
        [df_merge_raw.drop(columns=['categories']), one_hot_cat], axis=1
    )

    # Only cleaning step: remove exact duplicate rows.
    return df_one_hot_ok.drop_duplicates()


def save_data(df, database_filename):
    """Write ``df`` to the ``dataset`` table of a SQLite database file.

    Args:
        df: DataFrame to persist (written without its index).
        database_filename: Path of the SQLite file; appended to ``sqlite:///``.

    An existing ``dataset`` table is replaced, so the step can be re-run
    without failing on "table already exists".
    """
    engine = create_engine('sqlite:///' + database_filename)
    try:
        df.to_sql('dataset', engine, index=False, if_exists='replace')
    finally:
        # Release the connection pool even if the write fails.
        engine.dispose()

In [50]:
def main():
    """Run the ETL step: load and clean the CSVs, then store them in SQLite.

    Reads the module-level ``dis_cat_file`` and ``dis_mes_file`` paths and
    writes the cleaned frame to ``Data/CleanedDiseaseResponse.db``.

    Raises:
        ValueError: ``"Load Data Error!"`` or ``"Save Data Error!"``, chained
            to the exception that actually caused the failure.
    """
    # load_clean_data takes the categories file first, then the messages file.
    categories_filepath, messages_filepath = dis_cat_file, dis_mes_file
    database_filepath = 'Data/CleanedDiseaseResponse.db'

    try:
        print('Loading and Cleaning data')
        df_loaded = load_clean_data(categories_filepath, messages_filepath)
        print('\n')
        print('Data has been loaded and cleaned!!!!')
        df_loaded.info()  # info() prints on its own and returns None
        print(df_loaded.head())
    except Exception as exc:
        # Keep the original message but preserve the root cause in the chain.
        raise ValueError("Load Data Error!") from exc

    try:
        print('Saving data')
        save_data(df_loaded, database_filepath)
        print('\n')
        print('Data has been Saved!!!!')
    except Exception as exc:
        raise ValueError("Save Data Error!") from exc

# Machine Learning Pipelines

In [2]:
import sys
import pandas as pd
import numpy as np
import nltk


from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

from sqlalchemy import create_engine

import pickle

[nltk_data] Downloading package omw-1.4 to /Users/oo/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
[nltk_data] Downloading package punkt to /Users/oo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/oo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/oo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def load_data(database_filepath):
    """Read the ``dataset`` table from SQLite and split it into X / Y.

    Args:
        database_filepath: Path of the SQLite file; appended to ``sqlite:///``.

    Returns:
        tuple: ``(X, Y, categories, df)`` where ``X`` is the ``message``
        column as an array, ``Y`` is the array of every column from position
        4 onwards, ``categories`` lists those column names, and ``df`` is the
        frame after dropping rows with missing values.
    """
    connection_url = 'sqlite:///' + database_filepath
    engine = create_engine(connection_url)

    # NOTE(review): dropna() removes a row if ANY column is missing, not only
    # the message/label columns -- confirm this is the intended filter.
    df = pd.read_sql_table('dataset', con=engine).dropna()

    messages = df.loc[:, 'message']
    labels = df.iloc[:, 4:]  # label columns are selected by position
    categories = list(labels.columns)

    return messages.values, labels.values, categories, df

In [4]:
# SQLite file to read; this is the same path the ETL main() writes to.
database_filepath = 'Data/CleanedDiseaseResponse.db'
# load_data returns (message values, label values, label column names, frame).
X, y, cat, df = load_data(database_filepath)

In [5]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that could pass over Haiti,Un front froid se retrouve sur Cuba ce matin. Il pourrait traverser Haiti demain. Des averses de pluie isolee sont encore prevues sur notre region ce soi,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ak timoun yo. Mesi se john jean depi Monben kwochi.",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.,UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country today and tonight",facade ouest d Haiti et le reste du pays aujourd hui et ce soir,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12349,14671,Ghambat: Khairpur: 100s of villages submerged in water:,GHAMBAT: KHAIRPUR: 100S GAVON MEIN PAANI MEY JALMAGNA HO.,direct,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
12350,14673,Ghambat: Sindh: Volunteers needed : @ http://bit.ly/daU6ID,Ghambat: Sindh: Razakaroon ki zaroorat hai : @ http://bit.ly/daU6ID,direct,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
12351,14675,"shikarpur: Volunteers needed, food and medical help needed","shikarpur mein swanyasevak ki avyashaktha he.khana,dhavainyon ki zakth zaroorath hein.",direct,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
12352,14676,"Dadu: Needs water, food, medicines. boats","JO KO KHANA, PAANI, DAWAYIAN, NAOV CHAHIYE.",direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1


In [6]:

def tokenize(text):
    """Split ``text`` into normalised, lemmatized word tokens.

    Args:
        text: Raw message string.

    Returns:
        list[str]: tokens that were lower-cased and stripped, then lemmatized.
    """
    wl = WordNetLemmatizer()

    # Normalise case BEFORE lemmatizing: the lemmatizer's dictionary lookup is
    # done on the token as given, so "Dogs" would otherwise stay "dogs".
    return [wl.lemmatize(t.lower().strip()) for t in word_tokenize(text)]

def build_model(parameters=None):
    """Build the text-classification pipeline.

    Args:
        parameters: Optional dict of keyword arguments forwarded to
            ``RandomForestClassifier``. ``None`` (the default) or an empty
            dict means the classifier's own defaults.

    Returns:
        Pipeline: ``CountVectorizer`` (using ``tokenize``) ->
        ``TfidfTransformer`` -> ``MultiOutputClassifier`` wrapping a
        ``RandomForestClassifier``.
    """
    # None instead of a mutable ``{}`` default; copy so the caller's dict is
    # never shared with the estimator construction.
    forest_kwargs = dict(parameters) if parameters else {}

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultiOutputClassifier(RandomForestClassifier(**forest_kwargs))),
    ])
    return pipeline

def optimal_params(model, X_train, Y_train):
    """Grid-search the forest hyper-parameters and return the best combination.

    Args:
        model: Estimator exposing ``classifier__estimator__*`` parameters
            (the pipeline produced by ``build_model``).
        X_train: Training inputs passed to ``GridSearchCV.fit``.
        Y_train: Training targets passed to ``GridSearchCV.fit``.

    Returns:
        dict: ``best_params_`` of the fitted grid search.
    """
    search_space = dict(
        classifier__estimator__n_estimators=[50, 100, 150],
        classifier__estimator__max_features=['sqrt'],
        classifier__estimator__criterion=['entropy', 'gini'],
    )

    grid = GridSearchCV(model, param_grid=search_space, verbose=1)
    grid.fit(X_train, Y_train)
    return grid.best_params_

def evaluate_model(model, X_test, Y_test, category_names):
    """Print a classification report for every output category.

    Args:
        model: Fitted estimator whose ``predict`` returns a 2-D array with one
            column per category.
        X_test: Inputs passed to ``model.predict``.
        Y_test: 2-D array of true labels, one column per category.
        category_names: Category names, in the same order as the columns.
    """
    predictions = model.predict(X_test)

    print("Accuracy scores for each category\n")
    print("*-" * 30)

    # Iterate over the categories actually supplied rather than a fixed count.
    for i, name in enumerate(category_names):
        print("Category:", name, "\n", classification_report(Y_test[:, i], predictions[:, i]))
        
def save_model(model, model_filepath):
    """Pickle ``model`` to ``model_filepath``.

    Args:
        model: Object to serialise.
        model_filepath: Destination path; the file is overwritten.
    """
    # Context manager guarantees the handle is flushed and closed.
    with open(model_filepath, "wb") as model_file:
        pickle.dump(model, model_file)
    
def main():
    """Train, evaluate and pickle the classifier using command-line paths.

    Expects exactly two arguments after the script name: the SQLite database
    path and the output pickle path. With any other argument count a usage
    message is printed and nothing is trained.

    Steps: load data, 80/20 train/test split, grid-search the forest
    hyper-parameters, rebuild the pipeline with the best ones, fit, print the
    per-category reports and save the model.
    """
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        # load_data also returns the frame itself; only the first three
        # values are needed here, so the rest are discarded.
        X, Y, category_names, *_ = load_data(database_filepath)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

        print('Building model...')
        model = build_model()

        print('Grid search started')
        optimal_parameters = optimal_params(model, X_train, Y_train)
        # Strip the pipeline prefixes so the values can be passed straight to
        # the random forest constructor.
        random_forest_params = {
                    'n_estimators': optimal_parameters['classifier__estimator__n_estimators'],
                    'max_features': optimal_parameters['classifier__estimator__max_features'],
                    'criterion': optimal_parameters['classifier__estimator__criterion'],
                }

        print("Optimal parameters")
        print(random_forest_params)

        print('Building random forest model with optimal parameters')
        model = build_model(random_forest_params)

        print('Training model...')
        model.fit(X_train, Y_train)

        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')