In [5]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [6]:
# Load the training dataset.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that come from a trusted source.
# The `with` block closes the file handle as soon as the object is loaded.
with open('pickles/dataset.pickle', 'rb') as data:
    train_set = pickle.load(data)
X_train = train_set['changed_tweets']
y_train = train_set['label_code']

# Load the testing dataset.
# NOTE(review): training text is read from 'changed_tweets' while test text is
# read from 'tweetText' -- confirm both columns went through the same
# preprocessing before comparing train and test accuracy.
with open('pickles/test_set.pickle', 'rb') as test:
    test_set = pickle.load(test)
X_test = test_set['tweetText']
y_test = test_set['label_code']

In [136]:
from sklearn.pipeline import Pipeline

# Two-step text classifier: unigram TF-IDF features ('tfidf') feeding a
# multinomial Naive Bayes model ('clf') with smoothing alpha=0.5.
tweet_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))),
    ('clf', MultinomialNB(alpha=0.5)),
])

In [138]:
# Fit the pipeline on the training split, then measure accuracy on the data it
# was trained on and on the held-out test split.
tweet_clf.fit(X_train, y_train)

training_acc = accuracy_score(y_true=y_train, y_pred=tweet_clf.predict(X_train))
testing_acc = accuracy_score(y_true=y_test, y_pred=tweet_clf.predict(X_test))

print(f"Training acc: {training_acc} && Testing acc: {testing_acc}")

Training acc: 0.9289172761924385 && Testing acc: 0.8527296937416777


In [139]:
# Per-class precision / recall / F1 of the fitted pipeline on the test split.
print(classification_report(y_true=y_test, y_pred=tweet_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.59      0.72      1209
           1       0.83      0.98      0.90      2546

    accuracy                           0.85      3755
   macro avg       0.88      0.78      0.81      3755
weighted avg       0.86      0.85      0.84      3755



In [126]:
# Exhaustive 5-fold grid search over the TF-IDF document-frequency cut-offs and
# the Naive Bayes smoothing strength. GridSearchCV is imported in the imports
# cell at the top of the notebook.

# Not used in this cell; kept because later cells may rely on it.
table = {}

# Candidate values for each hyper-parameter.
# * np.linspace(...).round(1) yields clean one-decimal values; np.arange with a
#   float step produces entries such as 0.30000000000000004.
# * max_df starts at 0.1: max_df=0.0 discards every term, so each fit with it
#   fails ("After pruning, no terms remain").
# * alpha starts at 0.1: alpha=0.0 turns smoothing off entirely.
parameters = {
     'tfidf__max_df': np.linspace(0.1, 1.0, 10).round(1).tolist(),
     'tfidf__min_df': np.linspace(0.0, 1.0, 11).round(1).tolist(),
     'clf__alpha': np.linspace(0.1, 1.0, 10).round(1).tolist()
}

# One sub-grid per max_df value so that only min_df <= max_df is searched; the
# vectorizer raises ValueError when min_df exceeds max_df, and with the default
# error_score those fits would be scored NaN while still costing time.
# NOTE(review): large min_df fractions may still prune every term on this
# corpus -- check gs.cv_results_ for NaN scores after fitting.
param_grid = [
    {
        'tfidf__max_df': [max_df],
        'tfidf__min_df': [
            min_df for min_df in parameters['tfidf__min_df'] if min_df <= max_df
        ],
        'clf__alpha': parameters['clf__alpha'],
    }
    for max_df in parameters['tfidf__max_df']
]

gs = GridSearchCV(tweet_clf, param_grid, cv=5, n_jobs=-1)
gs = gs.fit(X_train, y_train)

# gs.predict delegates to the best estimator refit on the full training split.
training_acc = accuracy_score(y_train, gs.predict(X_train))
testing_acc = accuracy_score(y_test, gs.predict(X_test))

print(f"Training acc: {training_acc} && Testing acc: {testing_acc}")

Training acc: 0.9269191779559488 && Testing acc: 0.7643142476697736


In [127]:
# Mean cross-validated score of the winning candidate, then its parameters,
# one per line.
print(gs.best_score_, gs.best_params_, sep='\n')


0.8119052784187032
{'clf__alpha': 0.5, 'tfidf__max_df': 0.1, 'tfidf__min_df': 0.0}
