In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, make_scorer
from sklearn import pipeline

In [29]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
sample = pd.read_csv("data/sample_submission.csv")

In [30]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [31]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [32]:
sample.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [33]:
train_df.shape, test_df.shape

((7613, 5), (3263, 4))

In [34]:
# Let's drop the keyword and location columns 
# as they probably won't help much in predicting 
# is the tweet it valid or not.

In [35]:
train_df = train_df.drop(["keyword", "location"], axis=1)
test_df = test_df.drop(["keyword", "location"], axis=1)

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train_df.drop("target", axis=1), train_df["target"], test_size=0.25, shuffle=True)

In [37]:
print(X_train.shape)
print(X_valid.shape)

(5709, 2)
(1904, 2)


In [38]:
# We'll start with basic model i.e tfidf and logistic regression

In [42]:
tfv = TfidfVectorizer(min_df=1,
                      max_features=None,
                      strip_accents="unicode",
                      analyzer="word",
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 3),
                      use_idf=1,
                      smooth_idf=1,
                      sublinear_tf=1,
                      stop_words="english")

tfv.fit(list(X_train["text"]) + list(X_valid["text"]))
X_train_tfv = tfv.transform(X_train["text"])
X_valid_tfv = tfv.transform(X_valid["text"])

In [52]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train_tfv, y_train)
logistic_tfidf = f1_score(y_valid, logistic_model.predict(X_valid_tfv))
print(logistic_tfidf)

0.6428027418126429


In [53]:
ctv = CountVectorizer(analyzer="word",
                      token_pattern=r"\w{1,}",
                      ngram_range=(1, 3),
                      stop_words="english")

ctv.fit(list(X_train["text"]) + list(X_valid["text"]))
X_train_ctv = ctv.transform(X_train["text"])
X_valid_ctv = ctv.transform(X_valid["text"])

In [54]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train_ctv, y_train)
logistic_ctv = f1_score(y_valid, logistic_model.predict(X_valid_ctv))
print(logistic_ctv)

0.7216216216216216


In [55]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_tfv, y_train)
naive_bayes_tfv = f1_score(y_valid, naive_bayes_model.predict(X_valid_tfv))
print(naive_bayes_tfv)

0.6911225238444607


In [56]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_ctv, y_train)
naive_bayes_ctv = f1_score(y_valid, naive_bayes_model.predict(X_valid_ctv))
print(naive_bayes_ctv)

0.7455410225921522


In [57]:
xgboost_model = xgb.XGBClassifier()
xgboost_model.fit(X_train_tfv, y_train)
xgboost_tfv = f1_score(y_valid, xgboost_model.predict(X_valid_tfv))
print(xgboost_tfv)

0.697072838665759


In [58]:
xgboost_model = xgb.XGBClassifier()
xgboost_model.fit(X_train_ctv, y_train)
xgboost_ctv = f1_score(y_valid, xgboost_model.predict(X_valid_ctv))
print(xgboost_ctv)

0.7162162162162161


In [60]:
f1_scorer = make_scorer(f1_score)

In [89]:
logistic_model_gridsearch = LogisticRegression(solver="liblinear")

clf = pipeline.Pipeline([('lr', logistic_model_gridsearch)])

params_grid = {'lr__C': np.linspace(0.1, 1, 10), 'lr__penalty': ['l1', 'l2']}

model = GridSearchCV(estimator=clf,
                     param_grid=params_grid,
                     scoring=f1_scorer,
                     refit=True,
                     cv=5)

model.fit(X_train_tfv, y_train)

print("Best score: %0.3f" % model.best_score_)

Best score: 0.608


In [92]:
logistic_model_gridsearch = LogisticRegression(solver="liblinear")

clf = pipeline.Pipeline([('lr', logistic_model_gridsearch)])

params_grid = {'lr__C': np.linspace(0.1, 1, 10), 'lr__penalty': ['l1', 'l2']}

model = GridSearchCV(estimator=clf,
                     param_grid=params_grid,
                     scoring=f1_scorer,
                     refit=True,
                     cv=5)

model.fit(X_train_ctv, y_train)

print("Best score: %0.3f" % model.best_score_)
logistic_grid_ctv = f1_score(y_valid, model.predict(X_valid_ctv))
print(logistic_grid_ctv)

Best score: 0.725
0.7330316742081447


In [104]:
nb_model = MultinomialNB()

# Create the pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=f1_scorer,
                                 verbose=10, n_jobs=-1, refit=True, cv=2)

# Fit Grid Search Model
model.fit(X_train_tfv, y_train)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
nb_grid_tfv = f1_score(y_valid, model.predict(X_valid_tfv))
print(nb_grid_tfv)

#LB: 0.79344

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: 0.726
Best parameters set:
	nb__alpha: 0.1
0.7389221556886227


In [105]:
sample["target"] = model.predict(tfv.transform(test_df["text"]))
sample.to_csv("final_submission.csv", index=False)

In [100]:
nb_model = MultinomialNB()

# Create the pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=f1_scorer,
                                 verbose=10, n_jobs=-1, refit=True, cv=2)

# Fit Grid Search Model
model.fit(X_train_ctv, y_train)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
nb_grid_ctv = f1_score(y_valid, model.predict(X_valid_ctv))
print(nb_grid_ctv)
# LB: 0.79446

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: 0.734
Best parameters set:
	nb__alpha: 1
0.7455410225921522


In [103]:
sample["target"] = model.predict(ctv.transform(test_df["text"]))
sample.to_csv("final_submission.csv", index=False)