In [237]:
from sklearn.linear_model import PassiveAggressiveClassifier
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [238]:
#load training dataset
data = open('pickles/dataset.pickle', 'rb')
train_set = pickle.load(data)
X_train = train_set['changed_tweets']
y_train = train_set['label_code']

#load testing dataset
test = open('pickles/test_set.pickle', 'rb')
test_set = pickle.load(test)
X_test = test_set['tweetText']
y_test = test_set['label_code']

In [239]:
from sklearn.pipeline import Pipeline

tweet_clf = Pipeline([
     ('tfidf', TfidfVectorizer(ngram_range=(1,2), max_df=0.5, min_df=3)),
     ('clf', PassiveAggressiveClassifier(max_iter=50)),
])

In [262]:
tweet_clf.fit(X_train, y_train)
training_acc = accuracy_score(y_train, tweet_clf.predict(X_train))
testing_acc = accuracy_score(y_test, tweet_clf.predict(X_test))

print("Training acc: " + str(training_acc) + " && Testing acc: " + str(testing_acc))

Training acc: 0.9593077160811785 && Testing acc: 0.8521970705725699


In [263]:
print(classification_report(y_test, tweet_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      0.66      0.74      1209
           1       0.85      0.94      0.90      2546

    accuracy                           0.85      3755
   macro avg       0.85      0.80      0.82      3755
weighted avg       0.85      0.85      0.85      3755



# Parameter tuning using GridSearchCV

In [83]:
from sklearn.model_selection import GridSearchCV

table = list(range(50, 1001))
parameters = {
     'tfidf__max_df': np.arange(0.0,1.1, .1).tolist(),
     'tfidf__min_df': list(range(21)),
     'clf__max_iter': [58,1,12,11,48,31,1000]
}

gs = GridSearchCV(tweet_clf, parameters, cv=5, n_jobs=-1)
gs = gs.fit(X_train, y_train)

training_acc = accuracy_score(y_train, gs.predict(X_train))
testing_acc = accuracy_score(y_test, gs.predict(X_test))

print("Training acc: " + str(training_acc) + " && Testing acc: " + str(testing_acc))



Training acc: 0.9370564924891716 && Testing acc: 0.7179760319573901


In [84]:
print(gs.best_score_)
print(gs.best_params_)


0.7905232144714471
{'clf__max_iter': 1, 'tfidf__max_df': 0.2, 'tfidf__min_df': 1}


In [None]:
pac=PassiveAggressiveClassifier()
pac.fit(features_train, labels_train)
    
training_acc = accuracy_score(labels_train, pac.predict(features_train))
testing_acc = accuracy_score(labels_test, pac.predict(features_test))

print(training_acc)
print(testing_acc)

In [21]:
max_iter = list(range(11,51))
list_n = {}
for n in max_iter:
    pac=PassiveAggressiveClassifier(max_iter=n)
    pac.fit(features_train, labels_train)
    
    training_acc = accuracy_score(labels_train, pac.predict(features_train))
    testing_acc = accuracy_score(labels_test, pac.predict(features_test))
    
    list_n[n] = (training_acc, testing_acc)
   



In [22]:
list_n

{11: (0.988848954013455, 0.7097203728362184),
 12: (0.9895862132522348, 0.6460719041278296),
 13: (0.9894940558473874, 0.7163781624500666),
 14: (0.9882960095843701, 0.6745672436750999),
 15: (0.9897705280619298, 0.6681757656458056),
 16: (0.9894940558473874, 0.6572569906790945),
 17: (0.9897705280619298, 0.6591211717709721),
 18: (0.9896783706570823, 0.6492676431424766),
 19: (0.9899548428716247, 0.6529960053262317),
 20: (0.9899548428716247, 0.6591211717709721),
 21: (0.9893097410376924, 0.7163781624500666),
 22: (0.9898626854667772, 0.6958721704394141),
 23: (0.9893097410376924, 0.6511318242343542),
 24: (0.9894018984425399, 0.6716378162450066),
 25: (0.9889411114183024, 0.6503328894806925),
 26: (0.9894018984425399, 0.6631158455392809),
 27: (0.9898626854667772, 0.6719041278295605),
 28: (0.9899548428716247, 0.6503328894806925),
 29: (0.9899548428716247, 0.6649800266311584),
 30: (0.9879273799649801, 0.6681757656458056),
 31: (0.9890332688231499, 0.8007989347536618),
 32: (0.987282

PassiveAggresiveClassifier 
max_iter = 1000 -> acc=0.8117177 ngram=(1,1)
max_iter = 48 -> acc = 0.80932   ngram=(1,2)
max_iter = 31 -> acc = 0.80079   ...

In [None]:
from sklearn.model_selection import GridSearchCV

params = {'max_iter': list(range(51,1000))}

In [9]:
print("The training accuracy is: ")
print(accuracy_score(labels_train, pac.predict(features_train)))

The training accuracy is: 
0.9894018984425399


In [10]:
pac_pred = pac.predict(features_test)
print("The test accuracy is: ")
print(accuracy_score(labels_test, pac_pred))

The test accuracy is: 
0.7094540612516644
