In [1]:
import pandas as pd
import glob
import sys
from time import sleep
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk
import numpy
import scipy
import sklearn.datasets
import csv

from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import time

## Load Data

In [2]:
train_path="train"
test_path="test"
def load_train(class_name):
    label=0
    if class_name is "pos":
        label=1
    data=[]
    for file in glob.glob(train_path+"/"+class_name+"/*.txt"):
        f = open(file, "r")
        data.append([f.read(),label])
    return data
def load_test():
    data=[]
    for i in range(0,25000):
        file=test_path+"/"+str(i)+".txt"
        f = open(file, "r")
        data.append([f.read(),i])
    return data

In [3]:
train_data=load_train("pos")+load_train("neg")

In [4]:
test_data=load_test()

In [5]:
def cleanUp(text, custom_stopwords=[]):
    # Initilaise Lemmatizer object:
    lemm = WordNetLemmatizer()
    
    # Load NLTK stopwords:
    my_stopwords = stopwords.words('english') + custom_stopwords
    
    clean_text = ''
    
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text) # word_tokenize() takes care of stripping too.
    
    for word in words:
        w = lemm.lemmatize(word.lower())
#         w=word.lower()
#         print(w)
#         if w not in my_stopwords and len(w)>2:
        clean_text += w + " "
    
    return clean_text

In [6]:
def cleanUp_data(dataset):
    for i in range(0,len(dataset)):
        comment=cleanUp(dataset[i][0])
#         comment=dataset[i][0].lower()
#         dataset[i][0]=comment.split()
        dataset[i][0]=comment

In [7]:
# cleanUp_data(train_data)

In [8]:
train_data[0]

['For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.',
 1]

## Data Split

In [9]:
train_set,valid_set=sklearn.model_selection.train_test_split(train_data,train_size=0.8,test_size=0.2,shuffle=True)

In [10]:
train_x=[comment[0] for comment in train_set]
train_y=[comment[1] for comment in train_set]
valid_x=[comment[0] for comment in valid_set]
valid_y=[comment[1] for comment in valid_set]

## Feature Extraction: Tf-idf & Binary Occurrence

In [11]:
tfidf_vectorizer=CountVectorizer(ngram_range=(1,2))
binary_vectorizer=CountVectorizer(ngram_range=(1,2),binary=True)

## Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
def LogReg(x_train,y_train,x_valid,y_valid,vectorizer,mode):
    X=vectorizer.fit_transform(x_train)
    if mode is 'tfidf':
        tfidf_transformer = TfidfTransformer(use_idf=False)
        X=tfidf_transformer.fit_transform(X)
    X_valid=vectorizer.transform(x_valid)
    lg=LogisticRegression(C=25.0)
    start=time.time()
    lg.fit(X,y_train)
    valid_pred=lg.predict(X_valid)
    print("LR runtime: ", time.time()-start)
    train_pred=lg.predict(X)
    print("LR train: ", sklearn.metrics.accuracy_score(y_train, train_pred))
    print ("LR Accuracy: ", sklearn.metrics.accuracy_score(y_valid, valid_pred))
#     print (sklearn.metrics.confusion_matrix(y_valid, valid_pred, labels=[1, 0]))

## Linear SVM

In [19]:
from sklearn.svm import LinearSVC
def SVM(x_train,y_train,x_valid,y_valid,vectorizer,mode,C=25):
    X=vectorizer.fit_transform(x_train)
    if mode is 'tfidf':
        tfidf_transformer = TfidfTransformer(use_idf=False)
        X=tfidf_transformer.fit_transform(X)
    X_valid=vectorizer.transform(x_valid)
    clf=LinearSVC(penalty='l2',dual=True, random_state=0, tol=1e-8, C=C) # dual optimization: n_samples<n_features
    start=time.time()
    clf.fit(X,y_train)
    y_pred=clf.predict(X_valid)
    print("SVM runtime: ", time.time()-start)
    train_pred=clf.predict(X)
    print("SVM train: ", sklearn.metrics.accuracy_score(y_train, train_pred))
    print ("SVM Accuracy: ", sklearn.metrics.accuracy_score(y_valid, y_pred))
#     print (sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[1, 0]))

## Test for TF-IDF

In [17]:
print("tfidf: \n")
LogReg(train_x,train_y,valid_x,valid_y,tfidf_vectorizer,'tfidf')
print("\n")
SVM(train_x,train_y,valid_x,valid_y,tfidf_vectorizer,'tfidf')
print("\n")
# naiveBys(train_x,train_y,valid_x,valid_y,tfidf_vectorizer,'tfidf')
# print("\n")

tfidf: 

LR runtime:  18.403308868408203
LR train:  0.997
LR Accuracy:  0.8938


SVM runtime:  33.03994083404541
SVM train:  1.0
SVM Accuracy:  0.8952




## Test for Binary Occurrence

In [18]:
print("binary occurrence: \n")
LogReg(train_x,train_y,valid_x,valid_y,binary_vectorizer,'binary')
print("\n")
SVM(train_x,train_y,valid_x,valid_y,binary_vectorizer,'binary')
print("\n")
# naiveBys(train_x,train_y,valid_x,valid_y,binary_vectorizer,'binary')
# print("\n")

binary occurrence: 

LR runtime:  18.782140970230103
LR train:  1.0
LR Accuracy:  0.897


SVM runtime:  30.687812089920044
SVM train:  1.0
SVM Accuracy:  0.892




## Predict Test Set

In [21]:
cleanUp_data(train_data)
cleanUp_data(test_data)

In [22]:
total_train_x=[comment[0] for comment in train_data]
total_train_y=[comment[1] for comment in train_data]
test_x=[comment[0] for comment in test_data]
test_id=[comment[1] for comment in test_data]
vectorizer = CountVectorizer(ngram_range=(1, 2))
tfidf_transformer = TfidfTransformer(use_idf=False)
train_X=vectorizer.fit_transform(total_train_x)
train_X=tfidf_transformer.fit_transform(train_X)

In [23]:
from sklearn.svm import LinearSVC
clf=LinearSVC(penalty='l2',dual=True, random_state=0, tol=1e-8, C=100)
clf.fit(train_X,total_train_y)
test_X=vectorizer.transform(test_x)
y_pred=clf.predict(test_X)

In [24]:
df=pd.DataFrame(data={'Id':test_id,'Category':y_pred},columns=['Id','Category'])

In [25]:
df.to_csv('svm_1-2gram_c100.csv')

## Experiment for C and n-gram Values

In [20]:
i=1e-3
vectorizer_1=CountVectorizer(ngram_range=(1,1))
vectorizer_2=CountVectorizer(ngram_range=(2,2))
vectorizer_3=CountVectorizer(ngram_range=(3,3))
vectorizer_4=CountVectorizer(ngram_range=(1,2))
vectorizer_5=CountVectorizer(ngram_range=(1,3))
while i<=1e3:
    print("C=",i)
    print("unigram: ")
    SVM(train_x,train_y,valid_x,valid_y,vectorizer_1,'tfidf',i)
    print("bigram: ")
    SVM(train_x,train_y,valid_x,valid_y,vectorizer_2,'tfidf',i)
    print("trigram: ")
    SVM(train_x,train_y,valid_x,valid_y,vectorizer_3,'tfidf',i)
    print("unigram+bigram")
    SVM(train_x,train_y,valid_x,valid_y,vectorizer_4,'tfidf',i)
    print("unigram+bigram+trigram")
    SVM(train_x,train_y,valid_x,valid_y,vectorizer_5,'tfidf',i)
    i=i*10

C= 0.001
unigram: 
SVM runtime:  0.21471309661865234
SVM train:  0.70145
SVM Accuracy:  0.6998
bigram: 
SVM runtime:  0.7077751159667969
SVM train:  0.7056
SVM Accuracy:  0.6916
trigram: 
SVM runtime:  0.6712040901184082
SVM train:  0.8579
SVM Accuracy:  0.7332
unigram+bigram
SVM runtime:  0.8202559947967529
SVM train:  0.70405
SVM Accuracy:  0.7016
unigram+bigram+trigram
SVM runtime:  2.0368740558624268
SVM train:  0.7016
SVM Accuracy:  0.6946
C= 0.01
unigram: 
SVM runtime:  0.2434697151184082
SVM train:  0.7884
SVM Accuracy:  0.7742
bigram: 
SVM runtime:  0.8285892009735107
SVM train:  0.8043
SVM Accuracy:  0.7778
trigram: 
SVM runtime:  1.0861780643463135
SVM train:  0.8907
SVM Accuracy:  0.749
unigram+bigram
SVM runtime:  1.010408878326416
SVM train:  0.7966
SVM Accuracy:  0.7856
unigram+bigram+trigram
SVM runtime:  2.660511016845703
SVM train:  0.7922
SVM Accuracy:  0.78
C= 0.1
unigram: 
SVM runtime:  0.30602526664733887
SVM train:  0.87765
SVM Accuracy:  0.8598
bigram: 
SVM runti