In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.naive_bayes import MultinomialNB

from catboost import CatBoost
from catboost import Pool
import nltk
import numpy as np

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [3]:
train.head(10), test.head(10)

(   id keyword  ...                                               text target
 0   1     NaN  ...  Our Deeds are the Reason of this #earthquake M...      1
 1   4     NaN  ...             Forest fire near La Ronge Sask. Canada      1
 2   5     NaN  ...  All residents asked to 'shelter in place' are ...      1
 3   6     NaN  ...  13,000 people receive #wildfires evacuation or...      1
 4   7     NaN  ...  Just got sent this photo from Ruby #Alaska as ...      1
 5   8     NaN  ...  #RockyFire Update => California Hwy. 20 closed...      1
 6  10     NaN  ...  #flood #disaster Heavy rain causes flash flood...      1
 7  13     NaN  ...  I'm on top of the hill and I can see a fire in...      1
 8  14     NaN  ...  There's an emergency evacuation happening now ...      1
 9  15     NaN  ...  I'm afraid that the tornado is coming to our a...      1
 
 [10 rows x 5 columns],
    id keyword location                                               text
 0   0     NaN      NaN                 J

In [4]:
print(train.isnull().sum(), train.shape, test.isnull().sum(), test.shape, sep='\n\n')

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

(7613, 5)

id             0
keyword       26
location    1105
text           0
dtype: int64

(3263, 4)


Text preprocessing

In [6]:
def preprocess(df):
    import re


    def lemmatize(sentence):
        return ' '.join(map(nltk.stem.WordNetLemmatizer().lemmatize, 
                            sentence.split()))

    def stem(sentence):
        return ' '.join(map(nltk.stem.LancasterStemmer().stem, 
                            sentence.split()))

    def sub(sentence):
        sentence = re.sub(r"\d+", "", sentence)
        sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence)
        sentence = re.sub(r"<.*?>", "", sentence)
        sentence = re.sub(r"["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", "", sentence)
        
        sentence = re.sub(r",", "", sentence)
        sentence = re.sub(r"#", "", sentence)
        
        return sentence

    df['text_raw'] = df.text.to_numpy()
    df['text']     = df.text.apply(sub).apply(stem).apply(lemmatize)
    return df

train = preprocess(train)
test  = preprocess(test)

In [7]:
train.loc[:10, ['text', 'text_raw']]

Unnamed: 0,text,text_raw
0,our dee ar the reason of thi earthquak may all...,Our Deeds are the Reason of this #earthquake M...
1,forest fir near la rong sask. canad,Forest fire near La Ronge Sask. Canada
2,al resid ask to 'shelter in place' ar being no...,All residents asked to 'shelter in place' are ...
3,peopl receiv wildfir evacu ord in californ,"13,000 people receive #wildfires evacuation or..."
4,just got sent thi photo from ruby alask a smok...,Just got sent this photo from Ruby #Alaska as ...
5,rockyfir upd => californ hwy. clos in both dir...,#RockyFire Update => California Hwy. 20 closed...
6,flood disast heavy rain caus flash flood of st...,#flood #disaster Heavy rain causes flash flood...
7,i'm on top of the hil and i can see a fir in t...,I'm on top of the hill and I can see a fire in...
8,there's an emerg evacu hap now in the build ac...,There's an emergency evacuation happening now ...
9,i'm afraid that the tornado is com to our area...,I'm afraid that the tornado is coming to our a...


TF-IDF vectorizer

In [8]:
tfidf = TfidfVectorizer(stop_words='english', 
                        token_pattern=r'\b[a-zA-Z]{3,}\b',
                        ngram_range=(1, 1))
tfidf.fit(train.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='\\b[a-zA-Z]{3,}\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

Models for testing

In [9]:
def get_model(model_name, iterations=100_000):
    models = {
        'bayes'   :  MultinomialNB(),
        'log_reg' :  LogisticRegression(
            max_iter = iterations,
            solver = 'sag',
            fit_intercept = False,
            penalty = 'l2',
            dual = False,
            verbose = 0)
    }

    return models[model_name]

In [14]:
def fold(X, tfidf_vectorizer, model_name='bayes', iterations=100_000, k=5):
    y = X.target
    aucs, f1s, models = [], [], []
    for train_idx, test_idx in KFold(k, shuffle=True).split(X):
        X_train, X__test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y__test = y.iloc[train_idx], y.iloc[test_idx]

        model = get_model(model_name)

        train_vectors = tfidf_vectorizer.transform(X_train.text)
        test__vectors = tfidf_vectorizer.transform(X__test.text)

        model.fit(train_vectors, y_train)

        y_score = model.predict(test__vectors)

        auc = roc_auc_score(y__test, y_score)
        f1  = f1_score(y__test, y_score)

        aucs.append(auc)
        f1s.append(f1)
        models.append(model)
        print("""
ROC AUC: {}
F1     : {}
""".format(auc, f1))
        
    print("""


Mean ROC AUC: {}
Mean F1     : {}
""".format(np.mean(aucs), np.mean(f1s)))
    
    return models


models = fold(train, tfidf, model_name='log_reg', k=10, iterations=500_000)    


ROC AUC: 0.7790074295041369
F1     : 0.7436708860759494


ROC AUC: 0.7741101918676287
F1     : 0.7354838709677419


ROC AUC: 0.7663778235228664
F1     : 0.7298578199052131


ROC AUC: 0.8026039907688843
F1     : 0.7711999999999999


ROC AUC: 0.773094342251951
F1     : 0.7439024390243903


ROC AUC: 0.7927028934368383
F1     : 0.7569331158238173


ROC AUC: 0.7998837547224644
F1     : 0.7554806070826308


ROC AUC: 0.7755642269563384
F1     : 0.7417840375586854


ROC AUC: 0.742943803184767
F1     : 0.7032967032967032


ROC AUC: 0.7719116583213947
F1     : 0.730831973898858




Mean ROC AUC: 0.777820011453727
Mean F1     : 0.741244145363399



Kaggle submission

In [32]:
test__vectors = tfidf.transform(test.text)
train_vectors = tfidf.transform(train.text)

model = get_model('log_reg', iterations=300_000)
model.fit(train_vectors, train.target)
y_scores = model.predict(test__vectors)

In [34]:
kaggle_frame = pd.DataFrame({ 'id' : test.id, 'target' : y_scores })
kaggle_frame.target.value_counts()

0    2012
1    1251
Name: target, dtype: int64

In [36]:
kaggle_frame.to_csv('submission.csv', index=False)