In [15]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as esw
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from glove.src import *
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [107]:
class wordSpace:
    """Bag-of-words sentiment pipeline over a ``<root>/{train,test}/{pos,neg}`` corpus.

    Intended call order::

        words = wordSpace(root)
        words.cross_validation()   # hold-out split of the training frame
        words.vectorize()          # CountVectorizer features
        words.predictLogistic()
        words.predictSVM()

    Attributes set by ``__init__``:
        stopWords: set of tokens removed from every document.
        data:      dict with "train" / "test" DataFrames
                   (columns ``text``, ``sentiment``, ``wordCount``).
        threshold: documents with ``wordCount >= threshold`` are dropped.
        vocabSize: number of vectorizer features (0 until ``vectorize`` runs).
        tranData / trainData / testData: the loaded DataFrames.
    """

    def __init__(self, rootPath, seed=None):
        """Load and shuffle the corpus found under ``rootPath``.

        Args:
            rootPath: directory containing ``train/`` and ``test/``, each with
                ``pos/`` and ``neg/`` sub-folders of text files.
            seed: optional int. When given, the row shuffle uses a private
                ``RandomState(seed)`` so the row order is reproducible; when
                ``None`` the global NumPy random state is used.
        """
        self.stopWords = {
            'the', 'a', 'and', 'of', 'this', 'is', 'to', 'i', 'it', 'in',
            'was', 'that', 'for', 'but', 'you', 'as', 'with', 'film', 'not',
            'have', 'one', '/><br', 'on', 'be', 'are', "it's",
        }
        self.data = {}
        self.vocabSize = 0
        self.threshold = 1000
        self.seed = seed
        self.tranData, self.testData = self.loadData(rootPath)
        # Correctly spelled alias; ``tranData`` is kept for existing callers.
        self.trainData = self.tranData

        print("the wordSpace has been created!")

    def cross_validation(self, test_size=0.2, random_state=0):
        """Split the training frame into train and validation parts.

        Despite the name this is a single hold-out split (not k-fold). The
        results are stored on ``X_train``, ``X_test``, ``y_train``, ``y_test``.

        Args:
            test_size: fraction of the training frame held out for validation.
            random_state: seed forwarded to ``train_test_split``.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data["train"]["text"],
            self.data["train"]["sentiment"],
            test_size=test_size,
            random_state=random_state,
        )

    def stringTrim(self, content):
        """Return ``content`` with every stop word removed.

        Tokens are obtained by splitting on single spaces and compared
        case-sensitively against ``self.stopWords``. The kept tokens are
        re-joined with single spaces, so the result has no trailing space.
        """
        return " ".join(
            word for word in content.split(" ") if word not in self.stopWords
        )

    def loadData(self, root):
        """Read every review under ``root`` into shuffled DataFrames.

        For each of ``train`` and ``test`` the ``pos`` (label 1) and ``neg``
        (label 0) folders are read, stop words are stripped, and documents with
        fewer than ``self.threshold`` whitespace-separated tokens are kept.

        Returns:
            ``(train_frame, test_frame)``; the same frames are stored in
            ``self.data["train"]`` and ``self.data["test"]``.
        """
        seed = getattr(self, "seed", None)
        # A private RandomState keeps seeded runs independent of global state.
        rng = np.random if seed is None else np.random.RandomState(seed)

        for folder in ('train', 'test'):
            records = []
            for subf in ('pos', 'neg'):
                score = 1 if subf == 'pos' else 0
                path = os.path.join(root, folder, subf)
                # os.listdir order is arbitrary; sort so runs are repeatable.
                files = sorted(os.listdir(path))
                print('loading %s' % path)
                for fileName in files:
                    with open(os.path.join(path, fileName), "r",
                              encoding='ISO-8859-1') as f:
                        content = self.stringTrim(f.read())
                    # split() ignores empty tokens, so the count is exact.
                    wordCount = len(content.split())
                    if wordCount < self.threshold:
                        records.append([content, score, wordCount])
            rng.shuffle(records)
            self.data[folder] = pd.DataFrame(
                records, columns=['text', 'sentiment', 'wordCount'])

        return self.data["train"], self.data["test"]

    def vectorize(self, model=None):
        """Fit a ``CountVectorizer`` on the training split and transform all splits.

        Populates ``training_features``, ``vali_features`` and
        ``test_features``; the fitted vectorizer is stored on
        ``self.vectorizer``. ``model`` is accepted for backward compatibility
        and is unused.
        """
        if not hasattr(self, "X_train"):
            # Allow calling vectorize() directly: fall back to the default split.
            self.cross_validation()

        print('start vectorize')
        # Stored under a different name so this method is not shadowed by the
        # fitted object and the cell can be re-run. scikit-learn documents
        # stop_words as a list, hence the conversion from the set.
        self.vectorizer = CountVectorizer(stop_words=sorted(self.stopWords))

        self.training_features = self.vectorizer.fit_transform(self.X_train)
        self.vali_features = self.vectorizer.transform(self.X_test)
        self.vocabSize = self.training_features.shape[1]

        print (self.training_features.shape)
        print (self.vali_features.shape)
        self.test_features = self.vectorizer.transform(self.testData["text"])

        print ('vectorize complete!')

    def _fitAndReport(self, model, splitLabel, testMessage):
        """Fit ``model`` and print its accuracy on train, validation and test data.

        Args:
            model: an unfitted estimator exposing ``fit`` and ``predict``.
            splitLabel: model name used in the train/validation messages.
            testMessage: format string (one float slot) for the test message.
        """
        model.fit(self.training_features, self.y_train)

        splits = (
            ("train", self.training_features, self.y_train),
            ("validation", self.vali_features, self.y_test),
        )
        for splitName, features, labels in splits:
            acc = accuracy_score(labels, model.predict(features))
            print("Accuracy on the IMDB {} dataset using {}: {:.2f}".format(
                splitName, splitLabel, acc * 100))

        acc = accuracy_score(self.testData["sentiment"],
                             model.predict(self.test_features))
        print(testMessage.format(acc * 100))

    def predictSVM(self):
        """Train a ``LinearSVC`` and print train/validation/test accuracy."""
        # random_state pinned for repeatable results, as in predictLogistic.
        self._fitAndReport(
            LinearSVC(random_state=0),
            "SVM",
            "Accuracy on the IMDB dataset using SVM: {:.2f}",
        )

    def predictLogistic(self):
        """Train a liblinear ``LogisticRegression`` and print the three accuracies."""
        log_reg = LogisticRegression(verbose=1, solver='liblinear', random_state=0,
                                     C=5, penalty='l2', max_iter=1000)
        self._fitAndReport(
            log_reg,
            "Logistic",
            "\nAccuracy on the IMDB dataset using Logistic Regression: {:.2f}",
        )

In [108]:
# Build the corpus wrapper from the parent of the current working directory.
# Construction reads train/ and test/ (pos and neg sub-folders) and prints one
# "loading <path>" line per folder.
words = wordSpace('./../')

loading ./../train/pos
loading ./../train/neg
loading ./../test/pos
loading ./../test/neg
the wordSpace has been created!


In [109]:
# Hold out 20% of the training frame (random_state=0) as a validation split.
words.cross_validation()

In [110]:
# Fit a CountVectorizer on the training split, then transform the validation
# and test texts; prints the train and validation feature-matrix shapes.
words.vectorize()

start vectorize
(1676, 9703)
(420, 9703)


  'stop_words.' % sorted(inconsistent))


vectorize complete!


In [111]:
# Train each classifier on the vectorized training split and print its
# accuracy on the train, validation and test data.
words.predictLogistic()
words.predictSVM()

[LibLinear]Accuracy on the IMDB train dataset using Logistic: 100.00
Accuracy on the IMDB validation dataset using Logistic: 86.67

Accuracy on the IMDB dataset using Logistic Regression: 84.18
Accuracy on the IMDB train dataset using SVM: 100.00
Accuracy on the IMDB validation dataset using SVM: 86.43
Accuracy on the IMDB dataset using SVM: 83.47
