In [0]:
import warnings
# Install a blanket "ignore" filter: no warning of any category is shown after
# this point in the session.
# NOTE(review): presumably added to keep the printed results table free of
# library noise. Because the filter is unconditional it would also hide any
# warnings raised by the estimators fitted later in this notebook (for example
# solver-convergence or deprecation warnings) — confirm that is intended.
warnings.filterwarnings('ignore')

In [0]:
import pandas as pd

# Directory that holds the pickled inputs read below; change it here rather
# than in every load call.
DATA_DIR = "processed_data"


def _load_pickled(name):
  """Read one pickled pandas object called ``name`` from ``DATA_DIR``.

  NOTE: unpickling can execute arbitrary code embedded in the file, so only
  point ``DATA_DIR`` at files that this project produced itself.
  """
  return pd.read_pickle(f"{DATA_DIR}/{name}")


# One object per feature representation; each is later passed as `data` to the
# cross-validation helper, with `classes` as the matching targets.
uni_freq_data = _load_pickled("unigram_frequency")
uni_pres_data = _load_pickled("unigram_pres")
uni_bi_data = _load_pickled("uni_bi_pres")
bi_pres_data = _load_pickled("bi_pres")
POS_pres_data = _load_pickled("POS_pres")
adj_pres_data = _load_pickled("adj_pres")
top_uni_pres_data = _load_pickled("top_uni_pres")
uni_pos_pres_data = _load_pickled("uni_pos_pres")
classes = _load_pickled("classes")

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
import numpy as np

def KfoldAccs(data, classes):
  """Score three classifiers on one feature table with 3-fold cross-validation.

  Args:
    data: feature table; rows are selected positionally through ``.iloc``.
    classes: targets, indexed through ``.iloc`` with the same row positions
      as ``data``.

  Returns:
    A list of three strings, one per classifier in the order MultinomialNB,
    SVC, LogisticRegression. Each string is that classifier's mean held-out
    accuracy over the folds, written as a percentage with two decimals.
  """
  # Shuffled 3-way split with a fixed seed so the fold assignment is repeatable.
  splitter = KFold(3, random_state=0, shuffle=True)
  classifiers = (MultinomialNB(), SVC(), LogisticRegression())

  def mean_fold_accuracy(clf):
    # Refit the classifier on each training part and score it on the held-out part.
    fold_scores = []
    for train_idx, test_idx in splitter.split(data, classes):
      clf.fit(data.iloc[train_idx], classes.iloc[train_idx])
      fold_scores.append(clf.score(data.iloc[test_idx], classes.iloc[test_idx]))
    return np.mean(fold_scores)

  return [f"{mean_fold_accuracy(clf)*100:.2f}" for clf in classifiers]

# Rows of the results table, in print order: (row label, feature table).
# Keeping them in one list replaces eight copy-pasted print statements that
# differed only in the label and the frame being evaluated.
FEATURE_SETS = [
  ("unigram freqs", uni_freq_data),
  ("unigram pres", uni_pres_data),
  ("uni+bigrams", uni_bi_data),
  ("bigrams pres", bi_pres_data),
  ("POS Tags", POS_pres_data),
  ("adjectives", adj_pres_data),
  ("top unigrams", top_uni_pres_data),
  ("uni positions", uni_pos_pres_data),
]

print("Features\t#of features\tNaive Bayes\tSVC\tLogistic Regression")
for row_label, feature_table in FEATURE_SETS:
  # Each row: label, number of feature columns, then one accuracy per classifier.
  row_scores = KfoldAccs(feature_table, classes)
  print(f"{row_label}\t{len(feature_table.columns)}\t\t" + "\t\t".join(row_scores))

Features	#of features	Naive Bayes	SVC	Logistic Regression
unigram freqs	15521		79.86		70.29		82.07
unigram pres	15521		82.14		83.21		84.79
uni+bigrams	31042		83.14		81.71		85.14
bigrams pres	15521		81.07		75.50		78.14
POS Tags	17380		81.50		82.36		85.00
adjectives	3065		78.29		76.71		77.50
top unigrams	3065		82.36		83.64		83.00
uni positions	21744		81.71		76.93		79.50
