In [6]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as esw
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [16]:
class wordSpace:
    """Bag-of-words sentiment workspace over a train/test, pos/neg text corpus.

    The corpus root must contain the directories ``train/pos``, ``train/neg``,
    ``test/pos`` and ``test/neg``; every file inside them is read as one
    document. Documents under ``pos`` are labelled 1, those under ``neg`` 0.

    Attributes:
        stopWords: stop-word list passed to ``CountVectorizer``.
        data: dict mapping "train"/"test" to a DataFrame with the columns
            ``text`` and ``sentiment``.
        tranData, testData: the same frames as ``data["train"]`` and
            ``data["test"]``.
        vectorizer: the fitted ``CountVectorizer`` (``None`` until
            ``vectorize()`` has been called).
        training_features, test_features: document-term matrices created by
            ``vectorize()``.
    """

    def __init__(self, rootPath, seed=None):
        """Load the corpus found under ``rootPath``.

        Args:
            rootPath: directory holding the ``train`` and ``test`` folders.
            seed: optional integer seed for the row shuffle. ``None`` (the
                default) uses numpy's global random state.
        """
        self.stopWords = ["a","the","i","he","is","am","are","was","were","of","for","they","she","it"] + list(esw)
        self.data = {}
        # Placeholders: no method of this class updates these two counters.
        self.vocabSize = 0
        self.dataNum = 0
        self.vectorizer = None
        self.tranData, self.testData = self.loadData(rootPath, seed=seed)
        print("the wordSpace has been created!")

    def loadData(self, root, seed=None):
        """Read every document under ``root`` into shuffled DataFrames.

        Args:
            root: directory holding ``train/{pos,neg}`` and ``test/{pos,neg}``.
            seed: optional integer seed; when given, the shuffle is
                reproducible. ``None`` uses numpy's global random state.

        Returns:
            Tuple ``(train, test)`` of DataFrames with the columns ``text``
            and ``sentiment``. The frames are also stored in ``self.data``.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        for folder in ('train', 'test'):
            records = []
            for subf in ('pos', 'neg'):
                score = 1 if subf == 'pos' else 0
                path = os.path.join(root, folder, subf)
                # os.listdir returns names in arbitrary order; sorting makes a
                # seeded shuffle reproducible.
                names = sorted(os.listdir(path))
                print('loading %s' % path)
                for name in names:
                    with open(os.path.join(path, name), "r", encoding='ISO-8859-1') as f:
                        records.append([f.read(), score])
            # Shuffle so positive and negative documents are interleaved.
            rng.shuffle(records)
            self.data[folder] = pd.DataFrame(records, columns=['text', 'sentiment'])
        return self.data["train"], self.data["test"]

    def vectorize(self, model=None):
        """Fit a ``CountVectorizer`` on the training text and transform both splits.

        The vectorizer is stored in ``self.vectorizer``. It must not be stored
        under the name ``vectorize``: that would replace this method on the
        instance and make a second call fail.

        Args:
            model: unused; kept so existing callers keep working.
        """
        print('start vectorize')
        self.vectorizer = CountVectorizer(stop_words=self.stopWords)
        self.training_features = self.vectorizer.fit_transform(self.tranData["text"])
        self.test_features = self.vectorizer.transform(self.testData["text"])
        print('vectorize complete!')

    def _fitAndScore(self, model):
        """Fit ``model`` on the training features; return its test-set accuracy."""
        model.fit(self.training_features, self.tranData["sentiment"])
        y_pred = model.predict(self.test_features)
        return accuracy_score(self.testData["sentiment"], y_pred)

    def predictSVM(self):
        """Train a ``LinearSVC`` and print its accuracy on the test split.

        ``vectorize()`` must have been called first.
        """
        acc = self._fitAndScore(LinearSVC())
        print("Accuracy on the IMDB dataset using SVM: {:.2f}".format(acc * 100))

    def predictLogistic(self):
        """Train a logistic regression and print its accuracy on the test split.

        ``vectorize()`` must have been called first.
        """
        log_reg = LogisticRegression(verbose=1, solver='liblinear', random_state=0,
                                     C=5, penalty='l2', max_iter=1000)
        acc = self._fitAndScore(log_reg)
        print("\nAccuracy on the IMDB dataset using Logistic Regression: {:.2f}".format(acc * 100))

In [17]:
# Build the workspace from the corpus rooted at the current directory; the
# constructor reads train/{pos,neg} and test/{pos,neg} beneath it.
words = wordSpace(rootPath='./')

loading ./train/pos
loading ./train/neg
loading ./test/pos
loading ./test/neg
the wordSpace has been created!


In [18]:
# Pull the text and label columns out of each split for direct inspection.
trainDataX, trainDataY = (words.data['train'][col] for col in ('text', 'sentiment'))
testDataX, testDataY = (words.data['test'][col] for col in ('text', 'sentiment'))

# Fit the count vectorizer on the training text and transform both splits;
# the resulting matrices are stored on `words`.
words.vectorize()

start vectorize
vectorize complete!


In [19]:
# Fit each classifier on the bag-of-words features and print its accuracy on
# the test split. Both methods read words.training_features and
# words.test_features, so words.vectorize() must have been run beforehand.
words.predictLogistic()
words.predictSVM()

[LibLinear]
Accuracy on the IMDB dataset using Logistic Regression: 83.84
Accuracy on the IMDB dataset using SVM: 83.64
