In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

In [55]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [56]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


# Preprocessing

## Tokenization

In [57]:
from nltk.tokenize import word_tokenize

In [58]:
def tokenize(df):
    df['tokenized_text'] =  df['text'].apply(word_tokenize)

In [59]:
tokenize(train)
tokenize(test)

In [60]:
train['tokenized_text']

0       [Our, Deeds, are, the, Reason, of, this, #, ea...
1        [Forest, fire, near, La, Ronge, Sask, ., Canada]
2       [All, residents, asked, to, 'shelter, in, plac...
3       [13,000, people, receive, #, wildfires, evacua...
4       [Just, got, sent, this, photo, from, Ruby, #, ...
                              ...                        
7608    [Two, giant, cranes, holding, a, bridge, colla...
7609    [@, aria_ahrary, @, TheTawniest, The, out, of,...
7610    [M1.94, [, 01:04, UTC, ], ?, 5km, S, of, Volca...
7611    [Police, investigating, after, an, e-bike, col...
7612    [The, Latest, :, More, Homes, Razed, by, North...
Name: tokenized_text, Length: 7613, dtype: object

## Removing punctuation 

In [61]:
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [62]:
def remove_punctuation(df):
    df['only_text'] = df['tokenized_text'].apply(
        lambda row: [word for word in row if word not in punctuation]
    )

In [63]:
remove_punctuation(train)
remove_punctuation(test)

In [64]:
train['only_text'] 

0       [Our, Deeds, are, the, Reason, of, this, earth...
1           [Forest, fire, near, La, Ronge, Sask, Canada]
2       [All, residents, asked, to, 'shelter, in, plac...
3       [13,000, people, receive, wildfires, evacuatio...
4       [Just, got, sent, this, photo, from, Ruby, Ala...
                              ...                        
7608    [Two, giant, cranes, holding, a, bridge, colla...
7609    [aria_ahrary, TheTawniest, The, out, of, contr...
7610    [M1.94, 01:04, UTC, 5km, S, of, Volcano, Hawai...
7611    [Police, investigating, after, an, e-bike, col...
7612    [The, Latest, More, Homes, Razed, by, Northern...
Name: only_text, Length: 7613, dtype: object

## Removing stopwords

In [65]:
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

In [66]:
def remove_stopwords(df):
    df['cleaned_text'] = df['only_text'].apply(
        lambda row: [word for word in row if word not in stop_words]
    ) 

In [67]:
remove_stopwords(train)
remove_stopwords(test)

In [68]:
train['cleaned_text']

0       [Our, Deeds, Reason, earthquake, May, ALLAH, F...
1           [Forest, fire, near, La, Ronge, Sask, Canada]
2       [All, residents, asked, 'shelter, place, notif...
3       [13,000, people, receive, wildfires, evacuatio...
4       [Just, got, sent, photo, Ruby, Alaska, smoke, ...
                              ...                        
7608    [Two, giant, cranes, holding, bridge, collapse...
7609    [aria_ahrary, TheTawniest, The, control, wild,...
7610    [M1.94, 01:04, UTC, 5km, S, Volcano, Hawaii, h...
7611    [Police, investigating, e-bike, collided, car,...
7612    [The, Latest, More, Homes, Razed, Northern, Ca...
Name: cleaned_text, Length: 7613, dtype: object

# Extracting more information from the tweets

In [69]:
train

Unnamed: 0,id,keyword,location,text,target,tokenized_text,only_text,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, #, ea...","[Our, Deeds, are, the, Reason, of, this, earth...","[Our, Deeds, Reason, earthquake, May, ALLAH, F..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask, ., Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, 'shelter, in, plac...","[All, residents, asked, to, 'shelter, in, plac...","[All, residents, asked, 'shelter, place, notif..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #, wildfires, evacua...","[13,000, people, receive, wildfires, evacuatio...","[13,000, people, receive, wildfires, evacuatio..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, #, ...","[Just, got, sent, this, photo, from, Ruby, Ala...","[Just, got, sent, photo, Ruby, Alaska, smoke, ..."
...,...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,"[Two, giant, cranes, holding, a, bridge, colla...","[Two, giant, cranes, holding, a, bridge, colla...","[Two, giant, cranes, holding, bridge, collapse..."
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,"[@, aria_ahrary, @, TheTawniest, The, out, of,...","[aria_ahrary, TheTawniest, The, out, of, contr...","[aria_ahrary, TheTawniest, The, control, wild,..."
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[M1.94, [, 01:04, UTC, ], ?, 5km, S, of, Volca...","[M1.94, 01:04, UTC, 5km, S, of, Volcano, Hawai...","[M1.94, 01:04, UTC, 5km, S, Volcano, Hawaii, h..."
7611,10872,,,Police investigating after an e-bike collided ...,1,"[Police, investigating, after, an, e-bike, col...","[Police, investigating, after, an, e-bike, col...","[Police, investigating, e-bike, collided, car,..."


In [70]:
def count_hashtags(df):
    df['#_amount'] = df['tokenized_text'].apply(lambda row: len([word for word in row if word=='#']))

In [71]:
def count_mentions(df):
    df['@_amount'] = df['tokenized_text'].apply(lambda row: len([word for word in row if word=='@']))

In [72]:
def count_punctuation(df):
    df['punctuation_amount'] = df['tokenized_text'].apply(
        lambda row: len([word for word in row if word in punctuation])
    )

In [73]:
def count_stopwords(df):
    df['stopword_amount'] = df['tokenized_text'].apply(
        lambda row: len([word for word in row if word in stop_words])
    )

In [74]:
def count_urls(df):
    df['url_amount'] = df['tokenized_text'].apply(
        lambda row: len([word for word in row if 'http' in word or 'https' in word])
    )

In [75]:
def mean_word_length(df):
    df['mean_word_length'] = df['only_text'].apply(
        lambda row: round(np.mean([len(word) for word in row]), 3)
    )

In [76]:
def word_amount(df):
    df['word_amount'] = df['only_text'].apply(
        lambda row: len(row)
    )

In [77]:
def unique_word_amount(df):
    df['unique_word_amount'] = df['only_text'].apply(
        lambda row: len(list(set(row)))
    )

In [78]:
def has_location(df):
    df['has_location']  = df['location'].str.len()>0

In [79]:
def uppercase_percentage(text):
    return(sum(1 for c in text if c.isupper())/len(text))

def count_uppercase_percentage(df):
    df['uppercase_percentage'] = df['text'].apply(uppercase_percentage)

In [84]:
def extract_data(df):
    count_hashtags(df)
    count_mentions(df)
    count_punctuation(df)
    count_stopwords(df)
    count_urls(df)
    mean_word_length(df)
    word_amount(df)
    unique_word_amount(df)
    has_location(df)
    count_uppercase_percentage(df)

In [85]:
extract_data(train)
extract_data(test)

# Keyword analysis

In [177]:
# we look for the most common keywords in the tweets
top_target_keywords = train[train.target==1].keyword.value_counts().head(100)
top_non_target_keywords = train[train.target==0].keyword.value_counts().head(100)

words_1 = set(list(top_target_keywords.index))
words_0 = set(list(top_non_target_keywords.index))
intersection = list(words_1.intersection(words_0)) # identification of ambigual keywords

top_target_keywords = top_target_keywords.drop(intersection)
top_non_target_keywords = top_non_target_keywords.drop(intersection)

97

In [196]:
type(train['keyword'])

pandas.core.series.Series

In [204]:
def analyze_keywords(df):
    df['target_keyword'] = df['keyword'].apply(lambda keyword: keyword in top_target_keywords)
    df['non_target_keyword'] = df['keyword'].apply(lambda keyword: keyword in top_non_target_keywords)

In [205]:
analyze_keywords(train)
analyze_keywords(test)

# Sentiment Analysis

In [87]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [88]:
def calc_compound_sentiment(text):
    return SentimentIntensityAnalyzer().polarity_scores(text)['compound']

def sentiment(df):
    df['sentiment'] = df['text'].apply(calc_compound_sentiment)

In [89]:
sentiment(train)
sentiment(test)

In [40]:
# from pandas_profiling import ProfileReport
# profile = ProfileReport(train.sample(frac=0.1), title="Twitter Profiling Report", explorative=True)
# profile.to_file("twitter.html")

# Random Forest

In [207]:
train

Unnamed: 0,id,keyword,location,text,target,tokenized_text,only_text,cleaned_text,#_amount,@_amount,...,stopword_amount,url_amount,mean_word_length,word_amount,unique_word_amount,has_location,uppercase_percentage,sentiment,target_keyword,non_target_keyword
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, #, ea...","[Our, Deeds, are, the, Reason, of, this, earth...","[Our, Deeds, Reason, earthquake, May, ALLAH, F...",1,0,...,5,0,4.31,13,13,False,0.14,0.27,False,False
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask, ., Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]",0,0,...,0,0,4.43,7,7,False,0.13,-0.34,False,False
2,5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, 'shelter, in, plac...","[All, residents, asked, to, 'shelter, in, plac...","[All, residents, asked, 'shelter, place, notif...",0,0,...,9,0,5.00,22,19,False,0.02,-0.30,False,False
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #, wildfires, evacua...","[13,000, people, receive, wildfires, evacuatio...","[13,000, people, receive, wildfires, evacuatio...",1,0,...,1,0,7.00,8,8,False,0.02,0.00,False,False
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, #, ...","[Just, got, sent, this, photo, from, Ruby, Ala...","[Just, got, sent, photo, Ruby, Alaska, smoke, ...",2,0,...,6,0,4.38,16,15,False,0.03,0.00,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,"[Two, giant, cranes, holding, a, bridge, colla...","[Two, giant, cranes, holding, a, bridge, colla...","[Two, giant, cranes, holding, bridge, collapse...",0,0,...,2,1,6.00,12,12,False,0.08,-0.49,False,False
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,"[@, aria_ahrary, @, TheTawniest, The, out, of,...","[aria_ahrary, TheTawniest, The, out, of, contr...","[aria_ahrary, TheTawniest, The, control, wild,...",0,2,...,7,0,5.10,20,17,False,0.05,-0.58,False,False
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[M1.94, [, 01:04, UTC, ], ?, 5km, S, of, Volca...","[M1.94, 01:04, UTC, 5km, S, of, Volcano, Hawai...","[M1.94, 01:04, UTC, 5km, S, Volcano, Hawaii, h...",0,0,...,1,1,5.30,10,10,False,0.15,0.00,False,False
7611,10872,,,Police investigating after an e-bike collided ...,1,"[Police, investigating, after, an, e-bike, col...","[Police, investigating, after, an, e-bike, col...","[Police, investigating, e-bike, collided, car,...",0,0,...,5,0,6.16,19,19,False,0.03,-0.78,False,False


In [268]:
y_train = train['target']
X_train = train.drop(['tokenized_text', 'only_text', 'cleaned_text',
                      'keyword', 'location', 'text', 'id', 'target'], axis=1)
X_test = test.drop(['tokenized_text', 'only_text', 'cleaned_text',
                    'keyword', 'location', 'text', 'id'], axis=1)

In [209]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=120)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [210]:
indices = list(test['id'])
y_pred = pd.DataFrame ({'id':indices,
                        'target':y_pred})

In [211]:
y_pred.to_csv('rf_prediction.csv', index=False)b

# Feature importance - method I

In [218]:
feature_importances = pd.DataFrame(rf_model.feature_importances_,
                                   X_train.columns,
                                   columns=['Importance'])

In [222]:
feature_importances.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
mean_word_length,0.14
uppercase_percentage,0.14
sentiment,0.13
target_keyword,0.13
non_target_keyword,0.08
unique_word_amount,0.07
word_amount,0.07
stopword_amount,0.06
punctuation_amount,0.06
url_amount,0.03


# Feature importance - method II

In [223]:
from sklearn.feature_selection import SelectKBest

In [226]:
bestfeatures = SelectKBest(k=10)

In [227]:
fit = bestfeatures.fit(X_train,y_train)

In [232]:
featureScores = pd.DataFrame(fit.scores_,
                             X_train.columns,
                             columns=['Importance_Score']).sort_values(by='Importance_Score', ascending=False)

In [233]:
featureScores

Unnamed: 0,Importance_Score
target_keyword,1634.89
non_target_keyword,1427.69
sentiment,300.72
mean_word_length,263.21
url_amount,237.76
@_amount,53.5
stopword_amount,43.05
unique_word_amount,29.17
#_amount,18.18
word_amount,17.78


# Random forest attempt with top 5 relevant features

In [270]:
# we will build new model with the most relevant columns
relevant_features = list(featureScores.head(5).index)
X_train_2 = X_train[relevant_features]
X_test_2 = X_test[relevant_features]

In [271]:
rf_model_2 = RandomForestClassifier(n_estimators=120)
rf_model_2.fit(X_train_2, y_train)
y_pred_2 = rf_model_2.predict(X_test_2)

In [272]:
indices = list(test['id'])
y_pred_2 = pd.DataFrame ({'id':indices,
                        'target':y_pred_2})

In [273]:
y_pred_2.to_csv('rf_prediction_2.csv', index=False)
# outcome worse than with all the features

# Lazypredict

In [212]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

In [213]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.25, random_state=123)

In [214]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

In [215]:
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:08<00:00,  3.23it/s]


In [216]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.71,0.69,0.69,0.7,0.65
RandomForestClassifier,0.71,0.69,0.69,0.7,0.74
LGBMClassifier,0.7,0.68,0.68,0.69,0.08
XGBClassifier,0.69,0.68,0.68,0.69,0.16
SVC,0.7,0.67,0.67,0.69,1.23
BaggingClassifier,0.68,0.66,0.66,0.68,0.19
NuSVC,0.67,0.66,0.66,0.67,2.7
AdaBoostClassifier,0.67,0.65,0.65,0.67,0.18
KNeighborsClassifier,0.67,0.65,0.65,0.66,0.08
GaussianNB,0.65,0.65,0.65,0.65,0.01


# XGBoost 

In [282]:
y_train = train['target']
X_train = train.drop(['tokenized_text', 'only_text', 'cleaned_text',
                      'keyword', 'location', 'text', 'id', 'target'], axis=1)
X_test = test.drop(['tokenized_text', 'only_text', 'cleaned_text',
                    'keyword', 'location', 'text', 'id'], axis=1)

In [283]:
from xgboost import XGBClassifier

In [294]:
gb_model= XGBClassifier()
gb_model.fit(X_train, y_train)

In [295]:
y_pred_xgboost = gb_model.predict(X_test)

In [296]:
indices = list(test['id'])
y_pred_xgboost = pd.DataFrame ({'id':indices,
                        'target':y_pred_xgboost})

In [297]:
y_pred_xgboost.to_csv('xgb_prediction.csv', index=False)

# Further analysis

In [140]:
top_target_keywords = train[train.target==1].keyword.value_counts().head(100)
top_target_keywords

derailment     39
wreckage       39
outbreak       39
debris         37
oil%20spill    37
               ..
crushed         4
screamed        4
obliterate      4
drown           3
bloody          3
Name: keyword, Length: 200, dtype: int64

In [141]:
top_non_target_keywords = train[train.target==0].keyword.value_counts().head(100)
top_non_target_keywords

body%20bags          40
harm                 37
armageddon           37
wrecked              36
ruin                 36
                     ..
bush%20fires          7
casualties            7
drought               7
bridge%20collapse     6
mass%20murder         5
Name: keyword, Length: 200, dtype: int64