In [30]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from scipy.sparse import hstack

In [3]:
train_df = pd.read_csv('data/train.csv').fillna('')
test_df = pd.read_csv('data/test.csv').fillna('')
train_df.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7613 non-null   object
 2   location  7613 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [6]:
train_df[(train_df['location'].isna()) & (train_df['target'] == 1)].count()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

In [7]:
train_df['raw_tokens'] = train_df['text'].apply(nltk.word_tokenize)
train_df['raw_token_count'] = train_df['raw_tokens'].apply(lambda tokens: len(tokens))

In [8]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target,raw_tokens,raw_token_count
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, #, ea...",14
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask, ., Canada]",8
2,5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, 'shelter, in, plac...",24
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #, wildfires, evacua...",9
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, #, ...",18


In [41]:
train_df['processed_text'] = train_df['text'].apply(lambda text: text.lower())
train_df['processed_text'] = train_df['processed_text'].apply(lambda text: re.sub(r'http\S+', ' ', text))
train_df['processed_text'] = train_df['processed_text'].apply(lambda text: re.sub(r'[!@#$\']', '', text))

In [40]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target,raw_tokens,raw_token_count,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, #, ea...",14,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask, ., Canada]",8,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, 'shelter, in, plac...",24,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #, wildfires, evacua...",9,"13,000 people receive wildfires evacuation ord..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, #, ...",18,just got sent this photo from ruby alaska as s...


In [10]:
test_df['raw_tokens'] = test_df['text'].apply(nltk.word_tokenize)
test_df['raw_token_count'] = test_df['raw_tokens'].apply(lambda tokens: len(tokens))

In [11]:
test_df.head()

Unnamed: 0,id,keyword,location,text,raw_tokens,raw_token_count
0,0,,,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]",6
1,2,,,"Heard about #earthquake is different cities, s...","[Heard, about, #, earthquake, is, different, c...",12
2,3,,,"there is a forest fire at spot pond, geese are...","[there, is, a, forest, fire, at, spot, pond, ,...",22
3,9,,,Apocalypse lighting. #Spokane #wildfires,"[Apocalypse, lighting, ., #, Spokane, #, wildf...",7
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...",8


In [42]:
test_df['processed_text'] = test_df['text'].apply(lambda text: text.lower())
test_df['processed_text'] = test_df['text'].apply(lambda text: re.sub(r'http\S+', ' ', text))
test_df['processed_text'] = test_df['text'].apply(lambda text: re.sub(r'[!@#$\']', '', text))

In [43]:
test_df.head()

Unnamed: 0,id,keyword,location,text,raw_tokens,raw_token_count,processed_text
0,0,,,Just happened a terrible car crash,"[Just, happened, a, terrible, car, crash]",6,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","[Heard, about, #, earthquake, is, different, c...",12,"Heard about earthquake is different cities, st..."
2,3,,,"there is a forest fire at spot pond, geese are...","[there, is, a, forest, fire, at, spot, pond, ,...",22,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,"[Apocalypse, lighting, ., #, Spokane, #, wildf...",7,Apocalypse lighting. Spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,"[Typhoon, Soudelor, kills, 28, in, China, and,...",8,Typhoon Soudelor kills 28 in China and Taiwan


In [45]:
train_text, test_text = train_df['text'], test_df['text']
texts = pd.concat([train_text, test_text])
train_text_p, test_text_p = train_df['processed_text'], test_df['processed_text']
processed_texts = pd.concat([train_text_p, test_text_p])

In [25]:
%%time
word_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=5000
)

word_vec.fit(texts)

train_word_features = word_vec.transform(train_text)
test_word_features = word_vec.transform(test_text)

CPU times: total: 391 ms
Wall time: 401 ms


In [46]:
%%time
word_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=5000
)

word_vec.fit(processed_texts)

train_word_features_p = word_vec.transform(train_text)
test_word_features_p = word_vec.transform(test_text)

CPU times: total: 312 ms
Wall time: 392 ms


In [27]:
%%time
character_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    token_pattern=r'\w{1,}',
    ngram_range=(2, 8),
    max_features=50000
)

character_vec.fit(texts)

train_characters_features = character_vec.transform(train_text)
test_characters_features = character_vec.transform(test_text)



CPU times: total: 16.8 s
Wall time: 17.4 s


In [47]:
%%time
character_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    token_pattern=r'\w{1,}',
    ngram_range=(2, 8),
    max_features=50000
)

character_vec.fit(processed_texts)

train_characters_features_p = character_vec.transform(train_text_p)
test_characters_features_p = character_vec.transform(test_text_p)



CPU times: total: 15 s
Wall time: 15.4 s


In [28]:
train_features = hstack([train_characters_features, train_word_features])
test_features = hstack([test_characters_features, test_word_features])

In [48]:
train_features_p = hstack([train_characters_features_p, train_word_features_p])
test_features_p = hstack([test_characters_features_p, test_word_features_p])

In [29]:
train_word_features.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [49]:
train_word_features_p.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
train_oof = np.zeros(train_df.shape[0],)
kf = KFold(random_state=127, shuffle=True)
test_predictions = 0

for jj, (train_index, val_index) in enumerate(kf.split(train_features)):
    print("Fitting 5 folds, current: ", jj+1)
    train_x = train_features.toarray()[train_index]
    val_x = train_features.toarray()[val_index]
    train_target = train_df['target'].values[train_index]
    classifier = LogisticRegression(solver='sag')
    classifier.fit(train_x, train_target)
    train_oof[val_index] = classifier.predict_proba(val_x)[:,1]
    test_predictions += classifier.predict_proba(test_features)[:1] / 5

print(roc_auc_score(train_df['target'], train_oof))

Fitting 5 folds, current:  1
Fitting 5 folds, current:  2
Fitting 5 folds, current:  3
Fitting 5 folds, current:  4
Fitting 5 folds, current:  5
0.8720820476019953


In [50]:
train_oof_p = np.zeros(train_df.shape[0],)
kf = KFold(random_state=127, shuffle=True)
test_predictions_p = 0

for jj, (train_index, val_index) in enumerate(kf.split(train_features_p)):
    print("Fitting 5 folds, current: ", jj+1)
    train_x = train_features_p.toarray()[train_index]
    val_x = train_features_p.toarray()[val_index]
    train_target = train_df['target'].values[train_index]
    classifier = LogisticRegression(solver='sag')
    classifier.fit(train_x, train_target)
    train_oof[val_index] = classifier.predict_proba(val_x)[:,1]
    test_predictions += classifier.predict_proba(test_features)[:1] / 5

print(roc_auc_score(train_df['target'], train_oof))

Fitting 5 folds, current:  1
Fitting 5 folds, current:  2
Fitting 5 folds, current:  3
Fitting 5 folds, current:  4
Fitting 5 folds, current:  5
0.8729501934916235


In [32]:
test_predictions

array([[0.33307779, 0.66692221]])