In [444]:
import pandas as pd
import re
from sqlalchemy import create_engine
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

### Import libraries and load data from database.

In [445]:
# Create the SQLAlchemy engine for the SQLite database file data/DisasterResponse.db.
# The path in the URL is relative, so it is resolved against the current working directory.
engine = create_engine('sqlite:///data/DisasterResponse.db')

In [446]:
# Read the whole 'messages' table into a DataFrame using the engine created for the SQLite file.
df = pd.read_sql_table('messages', engine)

### Data Preparation

'@' mentions. Even though an @ mention adds some information to the message, this information doesn't add value when building the classification model.

In [208]:
# Strip "@user" mentions from every message (each value is coerced to str first)
# and store the result in a new 'message_clean' column.
df['message_clean'] = df['message'].map(
    lambda raw_msg: re.sub(r'@[A-Za-z0-9_]+', '', str(raw_msg))
)

Dealing with URL links

In [209]:
# Replace well-formed http:// and https:// links with the literal token 'urlplaceholder'.
# The pattern is a raw string so that regex escapes such as \( and \) are not parsed as
# (invalid) Python string escapes; the resulting pattern text is unchanged.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
df['message_clean'] = df.message_clean.apply(lambda x: re.sub(url_regex,'urlplaceholder',str(x)))

In [210]:
# A lot of URLs are written with a space instead of "://", e.g. "http bit.ly".
# Replace those with the same placeholder. Raw string so that \. is a regex escape
# rather than an invalid Python string escape; the pattern text is unchanged.
utl_regex_2 = r'http [a-zA-Z]+\.[a-zA-Z]+'
df['message_clean'] = df.message_clean.apply(lambda x: re.sub(utl_regex_2,'urlplaceholder',str(x)))

In [276]:
# Other formats: links with spaces around the colon, e.g. "http : //t.co/ihW64e8Z".
# Raw string so that \: and \. are regex escapes rather than invalid Python string escapes.
# The host part is [a-zA-Z]+ (one or more letters): with a single-letter host the pattern
# matched "t.co" but could never match hosts such as "bit.ly", even though ".ly" is listed.
utl_regex_3 = r'http \: //[a-zA-Z]+\.(co|com|pt|ly)/[A-Za-z0-9_]+'

In [277]:
# Apply the "http : //host/path" pattern to the already-cleaned messages,
# replacing each match with the literal token 'urlplaceholder'.
df['message_clean'] = df['message_clean'].map(
    lambda cleaned: re.sub(utl_regex_3, 'urlplaceholder', str(cleaned))
)

Hashtags can provide useful information, so only the ``#`` character is removed and the tag text is kept.

In [318]:
# Replace every '#' with a space so the hashtag's word stays in the message.
df['message_clean'] = df['message_clean'].map(
    lambda cleaned: re.sub('#', ' ', str(cleaned))
)

In [315]:
# Spot-check the '#' replacement on a single sample message.
# NOTE(review): `test` was not defined in any visible cell, so this cell failed on a fresh
# kernel. The sample below is reconstructed from the recorded output, assuming the only '#'
# was the one in front of "SandyHelp" — confirm against the original message.
test = "New Yorkers : Come volunteer w/ us on LES at 46 Hestor 10 AM . Delivering meds to elderly still in highrises being flashlight . #SandyHelp' "
re.sub('#', ' ', test)

"New Yorkers : Come volunteer w/ us on LES at 46 Hestor 10 AM . Delivering meds to elderly still in highrises being flashlight .  SandyHelp' "

In [333]:
# Notebook-level helpers read by tokenize(): the NLTK English stop-word list and a
# WordNet lemmatizer instance, created once here rather than on every call.
# NOTE(review): no nltk.download(...) call is visible in this notebook — presumably the
# required NLTK corpora are already installed; confirm for a fresh environment.
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

In [334]:
def tokenize(text):
    """Normalize a raw message and split it into lemmatized, stop-word-free tokens.

    Steps, in order: remove ``@mentions``; replace URLs (three textual variants)
    with the literal token ``urlplaceholder``; replace ``#`` with a space;
    lowercase; turn every character that is not an ASCII letter or digit into
    a space; tokenize with ``word_tokenize``; lemmatize each token; drop the
    tokens found in ``stop_words``.

    Reads the notebook-level ``lemmatizer`` and ``stop_words`` objects.

    Args:
        text (str): The message to tokenize.

    Returns:
        list of str: The cleaned tokens, in their order of appearance.
    """
    # Every pattern is defined locally as a raw string. The function therefore no longer
    # depends on a notebook-level `utl_regex_2` having been created by an earlier cell, and
    # regex escapes such as \( or \. are not parsed as invalid Python string escapes.
    mention_regex = r'@[A-Za-z0-9_]+'
    # Well-formed links: http://... or https://...
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Links written with a space instead of "://", e.g. "http bit.ly".
    url_regex_2 = r'http [a-zA-Z]+\.[a-zA-Z]+'
    # Links with spaces around the colon, e.g. "http : //t.co/ihW64e8Z". The host is one or
    # more letters; a single-letter host would miss links such as "http : //bit.ly/abc".
    url_regex_3 = r'http \: //[a-zA-Z]+\.(co|com|pt|ly)/[A-Za-z0-9_]+'

    # '@' mentions are dropped entirely: they are not considered useful for the classifier.
    text = re.sub(mention_regex, '', text)

    # Collapse every URL variant into the same placeholder token.
    for pattern in (url_regex, url_regex_2, url_regex_3):
        text = re.sub(pattern, 'urlplaceholder', text)

    # Hashtags can carry useful information, so only the '#' itself is removed.
    text = re.sub('#', ' ', text)

    text = text.lower()
    # Punctuation removal: anything that is not an ASCII letter or digit becomes a space.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

    # Lemmatize first, then filter, so stop words are matched on the lemmatized form.
    tokens = [lemmatizer.lemmatize(tok) for tok in word_tokenize(text)]
    return [tok for tok in tokens if tok not in stop_words]
    

In [390]:
# Names of the target (category) columns in `df`. The order matters: the evaluation
# loops pair each name with the prediction column at the same position.
output_columns = ['related', 'request', 'offer', 'aid_related',
       'medical_help', 'medical_products', 'search_and_rescue', 'security',
       'military', 'child_alone', 'water', 'food', 'shelter', 'clothing',
       'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report']

In [392]:
# Features: the raw 'message' text (not 'message_clean'); tokenize() is applied later
# by the CountVectorizer step of the pipeline.
X = df['message']
# Targets: one column per category listed in output_columns.
# NOTE(review): the recorded classification report for 'related' lists a third label (2),
# so that column does not appear to be strictly binary — confirm whether this is intended.
y = df[output_columns]

In [398]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [396]:
def build_model(random_state=42):
    """Build the (unfitted) multi-output text-classification pipeline.

    The pipeline has three steps:
      * ``vect``  - ``CountVectorizer`` using the notebook's ``tokenize`` function,
      * ``tfidf`` - ``TfidfTransformer``,
      * ``clf``   - ``MultiOutputClassifier`` wrapping a ``RandomForestClassifier``.

    Args:
        random_state: Seed forwarded to the ``RandomForestClassifier`` so that
            repeated fits give the same model. Pass ``None`` for an unseeded forest.

    Returns:
        sklearn.pipeline.Pipeline: The unfitted pipeline.
    """
    # Seed the forest: without it, every fit produces a different model and the
    # reported metrics cannot be reproduced.
    forest = RandomForestClassifier(random_state=random_state)

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(forest))
    ])

    return pipeline

In [400]:
# Instantiate the unfitted pipeline returned by build_model().
pipe = build_model()

In [401]:
# Fit the whole pipeline on the training split.
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

In [403]:
# Predict on the held-out split. The evaluation loop indexes the result as
# y_pred[:, col_idx], i.e. one column per entry of output_columns.
y_pred = pipe.predict(X_test)

In [441]:
# Per-category evaluation on the test split: one classification report per target column.
for col_idx, col in enumerate(output_columns):
    print(f'{col} accuracy \n')
    # zero_division=0 reports 0.00 for a class that is never predicted (the value already
    # shown in the recorded output) without emitting an UndefinedMetricWarning each time.
    print(classification_report(y_test[col], y_pred[:, col_idx], zero_division=0))

related accuracy 

              precision    recall  f1-score   support

           0       0.73      0.40      0.52      1892
           1       0.83      0.95      0.89      5899
           2       0.88      0.12      0.21        58

    accuracy                           0.81      7849
   macro avg       0.81      0.49      0.54      7849
weighted avg       0.80      0.81      0.79      7849

request accuracy 

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      6522
           1       0.82      0.49      0.61      1327

    accuracy                           0.89      7849
   macro avg       0.86      0.73      0.77      7849
weighted avg       0.89      0.89      0.88      7849

offer accuracy 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7818
           1       0.00      0.00      0.00        31

    accuracy                           1.00      7849
   macro avg       

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      7457
           1       0.82      0.12      0.21       392

    accuracy                           0.95      7849
   macro avg       0.89      0.56      0.59      7849
weighted avg       0.95      0.95      0.94      7849

electricity accuracy 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      7694
           1       1.00      0.03      0.05       155

    accuracy                           0.98      7849
   macro avg       0.99      0.51      0.52      7849
weighted avg       0.98      0.98      0.97      7849

tools accuracy 

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7805
           1       0.00      0.00      0.00        44

    accuracy                           0.99      7849
   macro avg       0.50      0.50      0.50      7849
weighted avg       0.99      0.99 

In [439]:
# Predict on the training split itself, to compare training accuracy with the test results.
y_pred_train = pipe.predict(X_train)

In [440]:
# Per-category accuracy on the training split.
for col_idx, col in enumerate(output_columns):
    print(f'{col} accuracy \n')
    # Arguments follow the accuracy_score(y_true, y_pred) signature, consistent with the
    # test-set evaluation loop; accuracy is symmetric, so the printed values are unchanged.
    print(accuracy_score(y_train[col], y_pred_train[:, col_idx]))

related accuracy 

0.998416425490089
request accuracy 

0.9992901217714192
offer accuracy 

0.9998907879648338
aid_related accuracy 

0.9989078796483373
medical_help accuracy 

0.9997815759296674
medical_products accuracy 

0.9998361819472505
search_and_rescue accuracy 

0.9998361819472505
security accuracy 

0.9996723638945012
military accuracy 

0.9998907879648338
child_alone accuracy 

1.0
water accuracy 

1.0
food accuracy 

0.9998907879648338
shelter accuracy 

0.9998361819472505
clothing accuracy 

0.9998361819472505
money accuracy 

0.9999453939824169
missing_people accuracy 

0.9998361819472505
refugees accuracy 

0.9998361819472505
death accuracy 

0.9998361819472505
other_aid accuracy 

0.9990170916835035
infrastructure_related accuracy 

0.9997815759296674
transport accuracy 

0.9996723638945012
buildings accuracy 

1.0
electricity accuracy 

0.9998907879648338
tools accuracy 

0.9999453939824169
hospitals accuracy 

0.9999453939824169
shops accuracy 

0.9998907879648338
aid

In [None]:
# Single report covering all categories at once. `my_tags` was not defined in any visible
# cell, so the category names come from output_columns instead.
# NOTE(review): the recorded per-column report shows a label 2 in 'related'; presumably
# classification_report accepts a 2-D target only when every column is binary — confirm,
# and clean that column upstream if this call is rejected.
print(classification_report(y_test, y_pred, target_names=output_columns))