In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from scipy.sparse import hstack

In [3]:
train_df = pd.read_csv('data/train.csv').fillna('')
test_df = pd.read_csv('data/test.csv').fillna('')
train_df.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [None]:
train_df.info()

In [None]:
train_df.head(10)

In [None]:
train_df[(train_df['location'].isna()) & (train_df['target'] == 1)].count()

In [None]:
train_df['raw_tokens'] = train_df['text'].apply(nltk.word_tokenize)
train_df['raw_token_count'] = train_df['raw_tokens'].apply(lambda tokens: len(tokens))

In [None]:
train_df.head()

In [None]:
train_df['processed_text'] = train_df['text'].apply(lambda text: text.lower())
train_df['processed_text'] = train_df['processed_text'].apply(lambda text: re.sub(r'http\S+', ' ', text))
train_df['processed_text'] = train_df['processed_text'].apply(lambda text: re.sub(r'[!@#$\']', '', text))

In [None]:
train_df.head()

In [None]:
test_df['raw_tokens'] = test_df['text'].apply(nltk.word_tokenize)
test_df['raw_token_count'] = test_df['raw_tokens'].apply(lambda tokens: len(tokens))

In [None]:
test_df.head()

In [None]:
test_df['processed_text'] = test_df['text'].apply(lambda text: text.lower())
test_df['processed_text'] = test_df['text'].apply(lambda text: re.sub(r'http\S+', ' ', text))
test_df['processed_text'] = test_df['text'].apply(lambda text: re.sub(r'[!@#$\']', '', text))

In [None]:
test_df.head()

In [5]:
train_text, test_text = train_df['text'], test_df['text']
texts = pd.concat([train_text, test_text])
# train_text_p, test_text_p = train_df['processed_text'], test_df['processed_text']
# processed_texts = pd.concat([train_text_p, test_text_p])

In [6]:
%%time
word_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=5000
)

word_vec.fit(texts)

train_word_features = word_vec.transform(train_text)
test_word_features = word_vec.transform(test_text)

CPU times: total: 312 ms
Wall time: 306 ms


In [None]:
%%time
word_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=5000
)

word_vec.fit(processed_texts)

train_word_features_p = word_vec.transform(train_text)
test_word_features_p = word_vec.transform(test_text)

In [7]:
%%time
character_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    token_pattern=r'\w{1,}',
    ngram_range=(2, 8),
    max_features=50000
)

character_vec.fit(texts)

train_characters_features = character_vec.transform(train_text)
test_characters_features = character_vec.transform(test_text)



CPU times: total: 15.2 s
Wall time: 15.7 s


In [None]:
%%time
character_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    token_pattern=r'\w{1,}',
    ngram_range=(2, 8),
    max_features=50000
)

character_vec.fit(processed_texts)

train_characters_features_p = character_vec.transform(train_text_p)
test_characters_features_p = character_vec.transform(test_text_p)

In [8]:
train_features = hstack([train_characters_features, train_word_features])
test_features = hstack([test_characters_features, test_word_features])

In [None]:
train_features_p = hstack([train_characters_features_p, train_word_features_p])
test_features_p = hstack([test_characters_features_p, test_word_features_p])

In [None]:
train_word_features.toarray()

In [None]:
train_word_features_p.toarray()

In [9]:
train_oof = np.zeros(train_df.shape[0],)
kf = KFold(random_state=127, shuffle=True)
test_predictions = 0

for jj, (train_index, val_index) in enumerate(kf.split(train_features)):
    print("Fitting 5 folds, current: ", jj+1)
    train_x = train_features.toarray()[train_index]
    val_x = train_features.toarray()[val_index]
    train_target = train_df['target'].values[train_index]
    classifier = LogisticRegression(solver='sag')
    classifier.fit(train_x, train_target)
    train_oof[val_index] = classifier.predict_proba(val_x)[:,1]
    test_predictions += classifier.predict_proba(test_features)[:1] / 5

print(roc_auc_score(train_df['target'], train_oof))

Fitting 5 folds, current:  1
Fitting 5 folds, current:  2
Fitting 5 folds, current:  3
Fitting 5 folds, current:  4
Fitting 5 folds, current:  5
0.8720802873710753


In [None]:
train_oof_p = np.zeros(train_df.shape[0],)
kf = KFold(random_state=127, shuffle=True)
test_predictions_p = 0

for jj, (train_index, val_index) in enumerate(kf.split(train_features_p)):
    print("Fitting 5 folds, current: ", jj+1)
    train_x = train_features_p.toarray()[train_index]
    val_x = train_features_p.toarray()[val_index]
    train_target = train_df['target'].values[train_index]
    classifier = LogisticRegression(solver='sag')
    classifier.fit(train_x, train_target)
    train_oof[val_index] = classifier.predict_proba(val_x)[:,1]
    test_predictions += classifier.predict_proba(test_features)[:1] / 5

print(roc_auc_score(train_df['target'], train_oof))

In [10]:
test_predictions

array([[0.33307407, 0.66692593]])