In [26]:
import re
import urllib
import urllib.parse
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Seed NumPy's legacy global RNG so any np.random draws are repeatable.
SEED = 100
np.random.seed(SEED)

In [27]:
def read_data(path, testing=False):
    """Load a tweet CSV and collapse it into one cleaned text column.

    The CSV must provide ``id``, ``keyword``, ``location`` and ``text``
    columns, plus ``target`` unless ``testing`` is true.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    testing : bool, default False
        When true, the file is treated as unlabeled and no ``target``
        column is copied into the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``text`` (keyword, location and tweet text joined
        by single spaces), followed by ``target`` when ``testing`` is false.
    """
    data = pd.read_csv(path)

    # Missing keywords/locations become "-". fillna returns a new Series, so
    # this does not depend on in-place mutation of a column selection (which
    # is a no-op under pandas Copy-on-Write and would let NaN reach the
    # string functions below).
    # Keywords are percent-encoded in the CSV (e.g. "body%20bags").
    keyword = data["keyword"].fillna("-").map(urllib.parse.unquote)

    # Keep only ASCII letters, whitespace and "-" in the location.
    location = data["location"].fillna("-").map(
        lambda raw: re.sub(r"[^-a-zA-Z\s]", "", raw)
    )

    def _clean_text(raw):
        # Drop everything except word characters, whitespace, "#", "'" and "_",
        # then lowercase.
        stripped = re.sub(r"[^\w\s#'_]", "", raw).lower()
        # Split with a capturing group so "#" survives as its own token and
        # ends up space-separated from the hashtag word. "$" has already been
        # removed by the substitution above, so only "#" can match here.
        pieces = re.split(r"([#$])", stripped)
        return " ".join(piece for piece in pieces if piece)

    text = data["text"].map(_clean_text)

    new_data = pd.DataFrame()
    new_data["id"] = data["id"]
    new_data["text"] = keyword + " " + location + " " + text
    if not testing:
        new_data["target"] = data["target"]
    return new_data

In [28]:
# Labeled training tweets, cleaned into a single text column.
train_path = "./data/train.csv"
train_data = read_data(path=train_path, testing=False)
# Preview the first five cleaned rows.
train_data.head(5)

Unnamed: 0,id,text,target
0,1,- - our deeds are the reason of this # earthq...,1
1,4,- - forest fire near la ronge sask canada,1
2,5,- - all residents asked to 'shelter in place' ...,1
3,6,- - 13000 people receive # wildfires evacuati...,1
4,7,- - just got sent this photo from ruby # alas...,1


In [42]:
# Hold out 33% of the labeled tweets for validation; the fixed random_state
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_data["text"], train_data["target"], test_size=0.33, random_state=42
)
# binary=True makes the vectorizer itself emit 0/1 word-presence features, so
# every cv.transform(...) call is binarized the same way as the training data.
cv = CountVectorizer(binary=True)
# Learn the vocabulary from the training split only. The matrices stay sparse,
# which MultinomialNB accepts directly, instead of being densified.
X_train = cv.fit_transform(X_train.values)
X_test = cv.transform(X_test.values)

In [43]:
# Multinomial naive Bayes with additive smoothing alpha=1.
model = MultinomialNB(alpha=1)
# fit() is left as the last expression so the fitted estimator is echoed
# below the cell.
model.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [44]:
# Accuracy on the held-out split: the fraction of predictions that equal the
# true labels.
res = model.predict(X=X_test)
print(np.mean(y_test == res))


0.7946677278153601


In [15]:
# Unlabeled test tweets: same cleaning as training, but no target column.
test_data = read_data(path="./data/test.csv", testing=True)

In [16]:
# The classifier is trained on binary word-presence features, so binarize the
# test features as well instead of feeding it raw counts.
# Note: this rebinds X_test, replacing the validation features.
X_test = cv.transform(test_data["text"].values).toarray() >= 1

In [17]:
# Predicted label for every row of X_test (rebinds the earlier `res`).
res = model.predict(X=X_test)

In [18]:
# Pair each test tweet id with its predicted label.
test_res = pd.DataFrame({"id": test_data["id"], "target": res})

In [23]:
# Write the predictions without the index column. The output directory is
# created first so a fresh checkout (no ./result yet) does not fail in to_csv.
result_path = Path("./result/res_nb.csv")
result_path.parent.mkdir(parents=True, exist_ok=True)
test_res.to_csv(result_path, index=False)