In [83]:
import os
import pickle
import numpy as np
import pandas
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

In [84]:
def load_data(fileName):
    dataSet = []
    with open(fileName, 'r', encoding='utf-8') as file:
        dataSet = [(line.strip())[:-1] for line in file.readlines()]
    return dataSet

In [85]:
def load_label(fileName):
    labels = []
    with open(fileName, 'r', encoding='utf-8') as file:
        labels = [int(line) for line in file.readlines()]
    return labels

In [86]:
dataSet = load_data('textData.txt')
labels = load_label('textLable.txt')

In [87]:
df = pandas.DataFrame()
df['text'] = dataSet
df['label'] = labels

In [104]:
# split the dataset into training and validation datasets 
train_x, test_x, train_y, test_y = model_selection.train_test_split(df.text, df.label)

In [99]:
# count vec
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df.text)
# transform the training and test data
xtrain_count =  count_vect.transform(train_x)
xtest_count =  count_vect.transform(test_x)

In [105]:
# tf-idf
# tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english', max_features=5000)
tfidf_vect.fit(df.text)
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

In [101]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df.text)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

In [102]:
def train_model(classifier, feature_train, label_train, feature_test):
    # fit the training dataset on the classifier
    classifier.fit(feature_train, label_train)
    # predict the labels on test set
    predictions = classifier.predict(feature_test)
    return metrics.accuracy_score(predictions, test_y)

In [106]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF:  0.6153846153846154
