In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from datetime import datetime

In [2]:
# Read the training data and the data to predict on for the submission.
raw_data, sub_data = (
    pd.read_csv("../data/train.csv"),
    pd.read_csv("../data/test.csv"),
)

In [3]:
# Display the training frame; the rendered output below is pandas' truncated preview.
raw_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
raw_data.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [5]:
# Number of missing values per column.
raw_data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

## Tokenize and Create BOW

In [6]:
class MyTokenizer:
    """Callable that splits a document into word tokens and lemmatises each one.

    An instance is passed as ``tokenizer=`` to ``TfidfVectorizer`` in
    ``preprocessing``.
    """

    def __init__(self):
        # Regex tokenizer that keeps runs of word characters.
        self.token = RegexpTokenizer(r'\w+')
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatised tokens found in ``doc``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, self.token.tokenize(doc)))

In [7]:
def preprocessing(text_data, target_data, sub_data, method, ngram_range = (1,1), min_df = 0.001, max_df = 0.999,
                  test_size = 0.2, stemming = True, vector_size = 20, lda = False, random_state = None):
    """Vectorise the text, optionally reduce it with LDA, and split it into train/test.

    Parameters
    ----------
    text_data : iterable of str
        Documents the TF-IDF vectoriser / Doc2Vec model is fitted on.
    target_data : array-like
        Labels aligned with ``text_data``; only used for the train/test split.
    sub_data : iterable of str
        Documents that are transformed with the already fitted model.
    method : {"tfidf", "d2v"}
        Feature extraction method.
    ngram_range, min_df, max_df
        Forwarded to ``TfidfVectorizer`` (``method="tfidf"`` only).
    test_size : float
        Forwarded to ``train_test_split``.
    stemming : bool
        With ``method="tfidf"``: if true, tokenise with ``MyTokenizer``
        (which lemmatises tokens); otherwise use the vectoriser's default tokenizer.
    vector_size : int
        Doc2Vec vector size (``method="d2v"`` only).
    lda : bool
        If true, replace the features by a 10-component
        ``LatentDirichletAllocation`` transform of them.
    random_state : int or None
        Forwarded to ``train_test_split`` and ``LatentDirichletAllocation``.
        ``None`` (default) keeps the previous, unseeded behaviour.

    Returns
    -------
    x_train, x_test, y_train, y_test, x_sub

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported methods.
    """
    # Fail fast instead of printing a message and crashing later on an
    # undefined ``x_data``.
    valid_methods = ("tfidf", "d2v")
    if method not in valid_methods:
        raise ValueError(f"No valid method selected: {method!r} (expected one of {valid_methods}).")

    print(f"Start preprocessing the data with method = {method} and LDA = {lda}.")

    # NOTE: features are fitted on all of ``text_data`` before the split below,
    # so the held-out split influences the fitted vocabulary / document vectors.
    if method == "tfidf":
        vectorizer_kwargs = dict(ngram_range = ngram_range, min_df = min_df, max_df = max_df)
        if stemming:
            vectorizer_kwargs["tokenizer"] = MyTokenizer()
        vectorizer = TfidfVectorizer(**vectorizer_kwargs)

        # Dense arrays are returned; callers fit models directly on them.
        x_data = vectorizer.fit_transform(text_data).toarray()
        x_sub = vectorizer.transform(sub_data).toarray()

    else:  # method == "d2v"
        tokenizer = RegexpTokenizer(r'\w+')
        # Each training document is tagged with its position so its vector can
        # be looked up by index afterwards.
        docs = [TaggedDocument(tokenizer.tokenize(doc), [i]) for i, doc in enumerate(text_data)]
        sub_docs = [tokenizer.tokenize(doc) for doc in sub_data]

        d2v_model = Doc2Vec(docs, vector_size=vector_size, window=2, min_count=1, workers=4)

        # gensim 4 renamed ``docvecs`` to ``dv``; use whichever this version provides.
        doc_vectors = d2v_model.dv if hasattr(d2v_model, "dv") else d2v_model.docvecs
        x_data = np.array([doc_vectors[i] for i in range(len(docs))])
        x_sub = np.array([d2v_model.infer_vector(doc) for doc in sub_docs])

    if lda:
        # NOTE(review): LDA expects non-negative input; Doc2Vec vectors can be
        # negative, so lda=True is presumably only meant for "tfidf" - confirm.
        lda_model = LatentDirichletAllocation(n_components = 10, random_state = random_state)
        x_data = lda_model.fit_transform(x_data)
        x_sub = lda_model.transform(x_sub)

    print(f"Shape of BOW: {x_data.shape} and of submission BOW: {x_sub.shape}")

    ## Split Data
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, target_data, test_size = test_size, random_state = random_state)

    return x_train, x_test, y_train, y_test, x_sub

In [8]:
# Build TF-IDF features from the tweet text and split them into train/test.
x_train, x_test, y_train, y_test, x_sub = preprocessing(
    text_data = raw_data["text"],
    target_data = raw_data["target"],
    sub_data = sub_data["text"],
    method = "tfidf",
)

Start preprocessing the data with method = tfidf and LDA = False.
No valid method selected.
Shape of BOW: (7613, 1849) and of submission BOW: (3263, 1849)


## Model

In [9]:
def test_model(model, x_data = x_test, y_data = y_test):
    preds = model.predict(x_data)
    acc = accuracy_score(preds,y_data)
    auc = roc_auc_score(preds, y_data)
    f1 = f1_score(preds, y_data)
    
    print(f"{type(model).__name__} Model has accuracy = {round(acc,3)}, AUC = {round(auc,3)} and F1 = {round(f1,3)}")
    print(f"\tThe target data is distributed with 0: {round(1-np.mean(y_data),3)} and 1: {round(np.mean(y_data),3)}.")

    return preds

### Naive Bayes

In [10]:
# Fit a Gaussian Naive Bayes classifier with default settings on the training split.
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)

GaussianNB()

In [11]:
# Print the metrics for the Naive Bayes model on test_model's default data and keep its predictions.
nb_preds = test_model(nb_model)

GaussianNB Model has accuracy = 0.76, AUC = 0.772 and F1 = 0.675
	The target data is distributed with 0: 0.571 and 1: 0.429.


### Logistic Regression

In [12]:
# Fit a logistic regression with default settings on the training split.
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)

LogisticRegression()

In [13]:
# Print the metrics for the logistic regression on test_model's default data and keep its predictions.
lr_preds = test_model(lr_model)

LogisticRegression Model has accuracy = 0.808, AUC = 0.814 and F1 = 0.754
	The target data is distributed with 0: 0.571 and 1: 0.429.


### Random Forest

In [14]:
# Fit a random forest with tree depth capped at 30.
# No random_state is passed, so the fitted forest can differ between runs.
rf_model = RandomForestClassifier(max_depth = 30)
rf_model.fit(x_train, y_train)

RandomForestClassifier(max_depth=30)

In [15]:
# Print the metrics for the random forest on test_model's default data and keep its predictions.
rf_preds = test_model(rf_model)

RandomForestClassifier Model has accuracy = 0.755, AUC = 0.774 and F1 = 0.659
	The target data is distributed with 0: 0.571 and 1: 0.429.


### SVM

In [16]:
# Fit a support vector classifier with default settings on the training split.
svm_model = SVC()
svm_model.fit(x_train, y_train)

SVC()

In [17]:
# Print the metrics for the default SVC on test_model's default data and keep its predictions.
svm_preds = test_model(svm_model)

SVC Model has accuracy = 0.801, AUC = 0.814 and F1 = 0.737
	The target data is distributed with 0: 0.571 and 1: 0.429.


### GridSearch & CV

In [25]:
# Grid search over the SVC parameter C with 2-fold cross-validation on the training split.
model = SVC()
parameters = {'C':[1, 3]}

clf = GridSearchCV(estimator = model, param_grid = parameters, cv = 2).fit(x_train, y_train)

print(f"Best CV Score of {round(clf.best_score_,3)} with params:\n{clf.best_params_}")

Best CV Score of 0.801 with params:
{'C': 1}


In [27]:
# Refit an SVC with the best parameters found by the grid search, then evaluate it.
svc_model = SVC(**clf.best_params_).fit(x_train, y_train)
svc_preds = test_model(svc_model)

SVC Model has accuracy = 0.814, AUC = 0.82 and F1 = 0.749
	The target data is distributed with 0: 0.589 and 1: 0.411.


## Submission Preds

In [18]:
def sub_preds(model, raw_sub_data = None, x_sub = None, out_dir = "../submission"):
    """Predict on the submission features and write them to a dated CSV file.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    raw_sub_data
        Frame with an ``"id"`` column, one row per prediction. When omitted,
        the notebook's current ``sub_data`` is used.
    x_sub
        Feature matrix passed to ``model.predict``. When omitted, the
        notebook's current ``x_sub`` is used.
    out_dir : str or path-like
        Directory the CSV is written to; created if it does not exist.

    The file is named ``<today's date>.csv`` and has the columns ``id`` and
    ``target``. An existing file with the same name is overwritten.
    """
    from pathlib import Path  # local import: only needed by this helper

    # Resolve the defaults at call time so that re-running the preprocessing
    # cell is picked up without having to re-run this definition.
    if raw_sub_data is None:
        raw_sub_data = sub_data
    if x_sub is None:
        # The parameter shadows the notebook-level name, hence the explicit lookup.
        x_sub = globals()["x_sub"]

    preds = model.predict(x_sub)
    ids = raw_sub_data["id"].values
    sub_df = pd.DataFrame({"id": ids, "target": preds})

    # to_csv does not create missing directories.
    target_dir = Path(out_dir)
    target_dir.mkdir(parents = True, exist_ok = True)
    sub_df.to_csv(target_dir / f"{datetime.today().date()}.csv", index = False)

In [19]:
# Write the submission using svm_model (the SVC fitted with default settings),
# not the grid-searched svc_model.
sub_preds(svm_model)