In [None]:
pip install tidytext textblob

In [1]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 200)
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from Help_Funs import count_chars, count_words, count_capital_chars, count_capital_words, count_sent, count_unique_words, count_stopwords, count_hashtags 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

## Defining files names
file_key_1 = 'NLP-Disaster-Tweets/train.csv'
file_key_2 = 'NLP-Disaster-Tweets/test.csv'
file_key_3 = 'NLP-Disaster-Tweets/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
sample = pd.read_csv(file_content_stream_3)

# Basic Exploration

In [10]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [23]:
train['text'][40]

'Check these out: http://t.co/rOI2NSmEJJ http://t.co/3Tj8ZjiN21 http://t.co/YDUiXEfIpE http://t.co/LxTjc87KLS #nsfw'

In [24]:
train.loc[40]

keyword                                                          ablaze
text                  Check these out: http://t.co/rOI2NSmEJJ http:/...
target                                                                0
char_count                                                          114
word_count                                                            8
sent_count                                                            1
capital_char_count                                                   23
capital_word_count                                                    0
stopword_count                                                        2
unique_word_count                                                     8
avg_wordlength                                                    14.25
avg_sentlength                                                      8.0
unique_vs_words                                                     1.0
stopwords_vs_words                                              

In [3]:
train['target'].value_counts() / train.shape[0]

0    0.57034
1    0.42966
Name: target, dtype: float64

In [4]:
train['keyword'].value_counts()

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [5]:
test['keyword'].value_counts()

deluged               23
demolished            22
rubble                22
first%20responders    21
seismic               21
                      ..
threat                 5
fatalities             5
forest%20fire          5
inundation             4
epicentre              1
Name: keyword, Length: 221, dtype: int64

In [6]:
np.isin(test['keyword'].unique(), train['keyword'].unique())

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [7]:
test['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [8]:
train['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [None]:
train['location'] = np.where(train['location'] == 'United States', 'USA', train['location'])
train['location'] = np.where(train['location'] == 'US', 'USA', train['location'])

In [9]:
train['location'].value_counts()

USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: location, Length: 3341, dtype: int64

In [15]:
train.shape

(7613, 14)

In [28]:
train['location'].unique().tolist()

[nan,
 'Birmingham',
 'Est. September 2012 - Bristol',
 'AFRICA',
 'Philadelphia, PA',
 'London, UK',
 'Pretoria',
 'World Wide!!',
 'Paranaque City',
 'Live On Webcam',
 'milky way',
 'GREENSBORO,NORTH CAROLINA',
 'England.',
 'Sheffield Township, Ohio',
 'India',
 'Barbados',
 'Anaheim',
 'Abuja',
 'USA',
 'South Africa',
 'Sao Paulo, Brazil',
 'hollywoodland ',
 'Edmonton, Alberta - Treaty 6',
 'Inang Pamantasan',
 'Twitter Lockout in progress',
 'Concord, CA',
 'Calgary, AB',
 'San Francisco',
 'CLVLND',
 'Nashville, TN',
 'Santa Clara, CA',
 'UK',
 'St. Louis, MO',
 'Walker County, Alabama',
 'Australia',
 'North Carolina',
 'Norf Carolina',
 'San Mateo County, CA',
 'Njoro, Kenya',
 "Your Sister's Bedroom",
 'Arlington, TX',
 'South Bloomfield, OH',
 'New Hanover County, NC',
 'Maldives',
 'Manchester, NH',
 'Wilmington, NC',
 'global',
 'Alberta | Sask. | Montana',
 'Charlotte',
 'Baton Rouge, LA',
 'Hagerstown, MD',
 'Gloucestershire , UK',
 'Nairobi, Kenya',
 'Instagram - @hey

In [26]:
pd.crosstab(train['location'], train['target'])

target,0,1
location,Unnamed: 1_level_1,Unnamed: 2_level_1
,0,1
Glasgow,1,0
"Melbourne, Australia",1,0
News,0,1
å_,1,0
...,...,...
å_: ?? ÌÑ ? : ?,1,0
å_å_Los Mina Cityã¢,1,0
å¡å¡Midwest Û¢Û¢,1,0
åÊ(?Û¢`?Û¢å«)??,1,0


In [27]:
test['location'].value_counts()

New York                  38
USA                       37
Worldwide                 16
United States             15
London                    13
                          ..
Medford, NJ                1
Quezon City                1
LanÌ¼s                     1
USA,Washington,Seattle     1
Brussels, Belgium          1
Name: location, Length: 1602, dtype: int64

In [28]:
np.isin(test['location'].unique(), train['location'].unique())

array([False,  True, False, ..., False,  True, False])

# Basic Feature Engineering 

In [13]:
train = train[['keyword', 'text', 'target']]
test = test[['id', 'keyword', 'text']]

train['char_count'] = train['text'].apply(lambda x: count_chars(x))
train['word_count'] = train['text'].apply(lambda x: count_words(x))
train['sent_count'] = train['text'].apply(lambda x: count_sent(x))
train['capital_char_count'] = train['text'].apply(lambda x: count_capital_chars(x))
train['capital_word_count'] = train['text'].apply(lambda x: count_capital_words(x))
# train['quoted_word_count'] = train['text'].apply(lambda x: count_words_in_quotes(x))
train['stopword_count'] = train['text'].apply(lambda x: count_stopwords(x))
train['unique_word_count'] = train['text'].apply(lambda x: count_unique_words(x))
                                                 
test['char_count'] = test['text'].apply(lambda x: count_chars(x))
test['word_count'] = test['text'].apply(lambda x: count_words(x))
test['sent_count'] = test['text'].apply(lambda x: count_sent(x))
test['capital_char_count'] = test['text'].apply(lambda x: count_capital_chars(x))
test['capital_word_count'] = test['text'].apply(lambda x: count_capital_words(x))
# test['quoted_word_count'] = test['text'].apply(lambda x: count_words_in_quotes(x))
test['stopword_count'] = test['text'].apply(lambda x: count_stopwords(x))
test['unique_word_count'] = test['text'].apply(lambda x: count_unique_words(x))
                                                 
## Average word length
train['avg_wordlength'] = train['char_count'] / train['word_count']
test['avg_wordlength'] = test['char_count'] / test['word_count']

## Average sentence lenght
train['avg_sentlength'] = train['word_count'] / train['sent_count']
test['avg_sentlength'] = test['word_count'] / test['sent_count']

## Unique words vs count words
train['unique_vs_words'] = train['unique_word_count'] / train['word_count']
test['unique_vs_words'] = test['unique_word_count'] / test['word_count']

## stopwords vs count words
train['stopwords_vs_words'] = train['stopword_count'] / train['word_count']
test['stopwords_vs_words'] = test['stopword_count'] / test['word_count']

# Baseline Model: Logistic Regression

In [18]:
## Defining input and target variables 
X = train.drop(columns = ['keyword', 'text', 'target'], axis = 1)
Y = train['target']

## Defining the hyper-parameter grid
logistic_param_grid = {'penalty': ['l1', 'l2', 'elasticnet'],
                       'C': [0.001, 0.01, 0.1, 1, 10, 100],
                       'solver': ['liblinear', 'sag', 'saga']}

## Performing grid search with 5 folds
logistic_grid_search = GridSearchCV(LogisticRegression(), logistic_param_grid, cv = 5, scoring = 'f1', n_jobs = -1, verbose = 3).fit(X, Y)

## Printing the best hyper-parameter combination
print(logistic_grid_search.best_params_)

## Printing the best score
print(logistic_grid_search.best_score_)

## Extraciting the best model 
logit_md = logistic_grid_search.best_estimator_
print(logit_md)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


120 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l

{'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.5307221931561943
LogisticRegression(C=100, penalty='l1', solver='liblinear')


In [20]:
def precision_recall_cutoff(Y_test, Y_pred):
    
    ## Computing the precision recall curve
    precision, recall, thresholds = precision_recall_curve(Y_test, Y_pred)
    
    ## Creating the precision-recall data-frame
    precision_recall = pd.DataFrame({'precision': precision[:-1], 'recall': recall[:-1], 'cutoff': thresholds})
    
    ## Finding the optimal cutoff (closest to precision = 1, recall = 1)
    precision_recall['1_minus_precision'] = 1 - precision_recall['precision']
    precision_recall['1_minus_recall'] = 1 - precision_recall['recall']
    precision_recall['Distance_to_perfect_model'] = np.sqrt(precision_recall['1_minus_precision']**2 + precision_recall['1_minus_recall']**2)
    
    ## Sorting based on Distance 
    precision_recall = precision_recall.sort_values(by = 'Distance_to_perfect_model').reset_index(drop = True)
        
    return precision_recall['cutoff'][0]


logit_pred = logit_md.predict_proba(X)[:, 1]
opt_cutoff = precision_recall_cutoff(Y, logit_pred)
print('The optimal cutoff is', opt_cutoff)

## Predicting on test 
logit_test_pred = logit_md.predict_proba(test.drop(columns = ['id', 'keyword', 'text'], axis = 1))[:, 1]

## Changing likelihoods to labels
logit_test_label = np.where(logit_test_pred < opt_cutoff, 0, 1)

## Creating data-frame for submission
data_out = pd.DataFrame({'id': test['id'], 'target': logit_test_label})
data_out.to_csv('Logistic_submission_1.csv', index = False)

data_out['target'].value_counts() / data_out.shape[0]

The optimal cutoff is 0.3354156955232394


1    0.675759
0    0.324241
Name: target, dtype: float64

# Baseline Model: Random Forest

In [42]:
## Defining input and target variables 
X = train.drop(columns = ['keyword', 'text', 'target'], axis = 1)
Y = train['target']

RF_param_grid = {'n_estimators': [100, 300, 500],
                 'max_features': [3, 4, 5],
                 'max_depth': [3, 5, 7],
                 'min_samples_split': [5, 7, 9],
                 'min_samples_leaf': [5, 7, 9]}
        
## Running leave-one-out cross validation 
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 5, scoring = 'f1', n_jobs = -1, verbose = 3).fit(X, Y)

## Printing the best hyper-parameter combination
print(RF_grid_search.best_params_)

## Extraciting the best model 
RF_md = RF_grid_search.best_estimator_

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV 2/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=5, n_estimators=300;, score=0.551 total time=   1.9s
[CV 1/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=100;, score=0.497 total time=   0.5s
[CV 4/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=100;, score=0.597 total time=   0.5s
[CV 2/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=300;, score=0.549 total time=   1.5s
[CV 4/5] END max_depth=3, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=500;, score=0.591 total time=   3.1s
[CV 1/5] END max_depth=3, max_features=3, min_samples_leaf=7, min_samples_split=5, n_estimators=100;, score=0.501 total time=   0.5s
[CV 3/5] END max_depth=3, max_features=3, min_samples_leaf=7, min_samples_split=5, n_estimators=100;, score=0.575 total time=   0.5s
[CV 1

In [45]:
## Printing the best hyper-parameter combination
print(RF_grid_search.best_params_)

## Printing best F1-score
print(RF_grid_search.best_score_)

{'max_depth': 7, 'max_features': 5, 'min_samples_leaf': 7, 'min_samples_split': 9, 'n_estimators': 100}
0.5777967006001864


In [50]:
## Extraciting the best model 
RF_md = RF_grid_search.best_estimator_

## Predicting on train to estimate cutoff based on precision-recall curve
RF_pred = RF_md.predict_proba(X)[:, 1]
RF_pred

array([0.31321896, 0.18545234, 0.41631674, ..., 0.49013988, 0.46691624,
       0.64951801])

# Playing with seed 

In [117]:
results_train = []
results_test = []

for i in range(0, 100):
    
    ## Building the model 
    RF = RandomForestClassifier(n_estimators = 100, max_depth = 7, max_features = 5, min_samples_leaf = 7, min_samples_split = 9, random_state = i).fit(X, Y)
    
    ## Predicting 
    results_train.append(pd.Series(RF.predict_proba(X)[:, 1]))
    results_test.append(pd.Series(RF.predict_proba(test.drop(columns = ['id', 'keyword', 'text'], axis = 1))[:, 1]))    

In [118]:
results_train = pd.concat(results_train, axis = 1)
RF_pred = results_train.apply('mean', axis = 1)

results_test = pd.concat(results_test, axis = 1)
RF_test_pred = results_test.apply('mean', axis = 1)

In [88]:
# RF_md = RandomForestClassifier(n_estimators = 100, max_depth = 7, max_features = 5, min_samples_leaf = 7, min_samples_split = 9, random_state = 100).fit(X, Y)

# RF_pred = RF_md.predict_proba(X)[:, 1]

In [119]:
import pandas as pd 
import numpy as np
from sklearn.metrics import precision_recall_curve

'''
The precision_recall_cutoff function takes 2 arguments:
Y_test: actual labels from the test data-frame
Y_pred: predicted likelihoods from a model 
This function estimated the optimal cutoff value based 
on the precision_recall_curve function and returns the 
predicted labels based onn the optimal cutoff value from 
the precision_recall_curve function.
'''

def precision_recall_cutoff(Y_test, Y_pred):
    
    ## Computing the precision recall curve
    precision, recall, thresholds = precision_recall_curve(Y_test, Y_pred)
    
    ## Creating the precision-recall data-frame
    precision_recall = pd.DataFrame({'precision': precision[:-1], 'recall': recall[:-1], 'cutoff': thresholds})
    
    ## Finding the optimal cutoff (closest to precision = 1, recall = 1)
    precision_recall['1_minus_precision'] = 1 - precision_recall['precision']
    precision_recall['1_minus_recall'] = 1 - precision_recall['recall']
    precision_recall['Distance_to_perfect_model'] = np.sqrt(precision_recall['1_minus_precision']**2 + precision_recall['1_minus_recall']**2)
    
    ## Sorting based on Distance 
    precision_recall = precision_recall.sort_values(by = 'Distance_to_perfect_model').reset_index(drop = True)
        
    return precision_recall['cutoff'][0]

In [120]:
opt_cutoff = precision_recall_cutoff(Y, RF_pred)
opt_cutoff

0.3527323101436399

In [123]:
## Predicting on test 
# RF_test_pred = RF_md.predict_proba(test.drop(columns = ['id', 'keyword', 'text'], axis = 1))[:, 1]

## Changing likelihoods to labels
RF_test_label = np.where(RF_test_pred < opt_cutoff, 0, 1)

## Creating data-frame for submission
data_out = pd.DataFrame({'id': test['id'], 'target': RF_test_label})
data_out.to_csv('RF_submission_4.csv', index = False)

# data_out['target'].value_counts() / data_out.shape[0] (best proportion)
# 1    0.54643
# 0    0.45357

In [57]:
data_out['target'].value_counts() / data_out.shape[0]

1    0.54643
0    0.45357
Name: target, dtype: float64

In [62]:
data_out['target'].value_counts() / data_out.shape[0]

1    0.577383
0    0.422617
Name: target, dtype: float64

In [67]:
data_out['target'].value_counts() / data_out.shape[0]

1    0.631627
0    0.368373
Name: target, dtype: float64

In [122]:
data_out['target'].value_counts() / data_out.shape[0]

1    0.628563
0    0.371437
Name: target, dtype: float64