In [60]:
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import GridSearchCV

from scipy import sparse

import re

###  preprocessing

In [61]:
# read data and labels
def read_data(review, label, s_size):
    data = open("imdb_review.txt", 'r', encoding = "utf-8")
    label = open("imdb_labels.txt", 'r', encoding = "utf-8")
    
    data = data.readlines()
    label = label.readlines()
    
    data = data[:s_size]
    label = label[:s_size]
    
    return data, label

In [62]:
text_data, label_data = read_data("imdb_review.txt", "imdb_labels.txt", 100)

In [63]:
text_data

['"One of the other reviewers has mentioned that after watching just 1 Oz episode you\'ll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the

In [64]:
label_data

['positive\n',
 'positive\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'positive\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'positive\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'positive\n',
 'positive\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'negative\n',
 'positive\n',
 'negative

In [65]:
# remove html tags, numbers and punctuation
def clean_data(text_data):
    # digits
    num_re = re.compile('\d+')
    # html tags
    html_re = re.compile("<.*?>")
    # punctuation
    pun_re = re.compile('[^\w\s]')
    
    for i in range(len(text_data)):  
        text_data[i] = re.sub(html_re, '', text_data[i])
        text_data[i] = re.sub(num_re, '', text_data[i])
        text_data[i] = re.sub(pun_re, '', text_data[i])
    
    return text_data    

In [66]:
data = clean_data(text_data)

In [67]:
data

['One of the other reviewers has mentioned that after watching just  Oz episode youll be hooked They are right as this is exactly what happened with meThe first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs sex or violence Its is hardcore in the classic use of the wordIt is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda Em City is home to manyAryans Muslims gangstas Latinos Christians Italians Irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayI would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare Forget pretty pictur

In [68]:
# split data, strip labels
def split(data, label_data):
    
    temp_data = []
    for corpus in data:
        #temp = [] ??
        temp = corpus.split(' ')
        
        temp_data.append(temp)
    
    
    #temp = [] ??
    temp_label = []
    for i in label_data:
        temp = i.strip('\n')
            
        temp_label.append(temp)
        
    
    return temp_data, temp_label

In [69]:
data, label = split(data, label_data)

In [70]:
data

[['One',
  'of',
  'the',
  'other',
  'reviewers',
  'has',
  'mentioned',
  'that',
  'after',
  'watching',
  'just',
  '',
  'Oz',
  'episode',
  'youll',
  'be',
  'hooked',
  'They',
  'are',
  'right',
  'as',
  'this',
  'is',
  'exactly',
  'what',
  'happened',
  'with',
  'meThe',
  'first',
  'thing',
  'that',
  'struck',
  'me',
  'about',
  'Oz',
  'was',
  'its',
  'brutality',
  'and',
  'unflinching',
  'scenes',
  'of',
  'violence',
  'which',
  'set',
  'in',
  'right',
  'from',
  'the',
  'word',
  'GO',
  'Trust',
  'me',
  'this',
  'is',
  'not',
  'a',
  'show',
  'for',
  'the',
  'faint',
  'hearted',
  'or',
  'timid',
  'This',
  'show',
  'pulls',
  'no',
  'punches',
  'with',
  'regards',
  'to',
  'drugs',
  'sex',
  'or',
  'violence',
  'Its',
  'is',
  'hardcore',
  'in',
  'the',
  'classic',
  'use',
  'of',
  'the',
  'wordIt',
  'is',
  'called',
  'OZ',
  'as',
  'that',
  'is',
  'the',
  'nickname',
  'given',
  'to',
  'the',
  'Oswald',
  

In [71]:
label

['positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',

In [72]:
# convert to lowercase
def lowercase(data):
    data_lowercase = []

    for corpus in data:
        temp_list = []
        for word in corpus:
            temp = word.lower()
            temp_list.append(temp)
        
        data_lowercase.append(temp_list)
        
    return data_lowercase

In [73]:
data = lowercase(data)

In [74]:
data

[['one',
  'of',
  'the',
  'other',
  'reviewers',
  'has',
  'mentioned',
  'that',
  'after',
  'watching',
  'just',
  '',
  'oz',
  'episode',
  'youll',
  'be',
  'hooked',
  'they',
  'are',
  'right',
  'as',
  'this',
  'is',
  'exactly',
  'what',
  'happened',
  'with',
  'methe',
  'first',
  'thing',
  'that',
  'struck',
  'me',
  'about',
  'oz',
  'was',
  'its',
  'brutality',
  'and',
  'unflinching',
  'scenes',
  'of',
  'violence',
  'which',
  'set',
  'in',
  'right',
  'from',
  'the',
  'word',
  'go',
  'trust',
  'me',
  'this',
  'is',
  'not',
  'a',
  'show',
  'for',
  'the',
  'faint',
  'hearted',
  'or',
  'timid',
  'this',
  'show',
  'pulls',
  'no',
  'punches',
  'with',
  'regards',
  'to',
  'drugs',
  'sex',
  'or',
  'violence',
  'its',
  'is',
  'hardcore',
  'in',
  'the',
  'classic',
  'use',
  'of',
  'the',
  'wordit',
  'is',
  'called',
  'oz',
  'as',
  'that',
  'is',
  'the',
  'nickname',
  'given',
  'to',
  'the',
  'oswald',
  

In [75]:
# remove stop words
def remove_stopwords(data):
    res_data = []
    for corpus in data:
        temp_list = []
        for word in corpus:
            if word not in stopwords.words('english'):
                temp_list.append(word)
                
        res_data.append(temp_list)
        
    return res_data

In [76]:
data = remove_stopwords(data)

In [77]:
data

[['one',
  'reviewers',
  'mentioned',
  'watching',
  '',
  'oz',
  'episode',
  'youll',
  'hooked',
  'right',
  'exactly',
  'happened',
  'methe',
  'first',
  'thing',
  'struck',
  'oz',
  'brutality',
  'unflinching',
  'scenes',
  'violence',
  'set',
  'right',
  'word',
  'go',
  'trust',
  'show',
  'faint',
  'hearted',
  'timid',
  'show',
  'pulls',
  'punches',
  'regards',
  'drugs',
  'sex',
  'violence',
  'hardcore',
  'classic',
  'use',
  'wordit',
  'called',
  'oz',
  'nickname',
  'given',
  'oswald',
  'maximum',
  'security',
  'state',
  'penitentary',
  'focuses',
  'mainly',
  'emerald',
  'city',
  'experimental',
  'section',
  'prison',
  'cells',
  'glass',
  'fronts',
  'face',
  'inwards',
  'privacy',
  'high',
  'agenda',
  'em',
  'city',
  'home',
  'manyaryans',
  'muslims',
  'gangstas',
  'latinos',
  'christians',
  'italians',
  'irish',
  'moreso',
  'scuffles',
  'death',
  'stares',
  'dodgy',
  'dealings',
  'shady',
  'agreements',
  'n

In [78]:
# stemming - using porter stemmer algorithm
def stemming(data):
    porter_stemmer = PorterStemmer()
    stem_data = []
    
    for corpus in data:
        temp_list = []
        for word in corpus:
            temp = porter_stemmer.stem(word)
    
            temp_list.append(temp)
        
        stem_data.append(temp_list)
    
    return stem_data

In [79]:
data = stemming(data)

In [80]:
data

[['one',
  'review',
  'mention',
  'watch',
  '',
  'oz',
  'episod',
  'youll',
  'hook',
  'right',
  'exactli',
  'happen',
  'meth',
  'first',
  'thing',
  'struck',
  'oz',
  'brutal',
  'unflinch',
  'scene',
  'violenc',
  'set',
  'right',
  'word',
  'go',
  'trust',
  'show',
  'faint',
  'heart',
  'timid',
  'show',
  'pull',
  'punch',
  'regard',
  'drug',
  'sex',
  'violenc',
  'hardcor',
  'classic',
  'use',
  'wordit',
  'call',
  'oz',
  'nicknam',
  'given',
  'oswald',
  'maximum',
  'secur',
  'state',
  'penitentari',
  'focus',
  'mainli',
  'emerald',
  'citi',
  'experiment',
  'section',
  'prison',
  'cell',
  'glass',
  'front',
  'face',
  'inward',
  'privaci',
  'high',
  'agenda',
  'em',
  'citi',
  'home',
  'manyaryan',
  'muslim',
  'gangsta',
  'latino',
  'christian',
  'italian',
  'irish',
  'moreso',
  'scuffl',
  'death',
  'stare',
  'dodgi',
  'deal',
  'shadi',
  'agreement',
  'never',
  'far',
  'awayi',
  'would',
  'say',
  'main',
 

In [81]:
# part of speech tagging
def pos(data):
    pos_list = []
    
    for corpus in data:
        pos_list.append(nltk.pos_tag(corpus))
            
        
    return pos_list

In [82]:
data = pos(data)

In [83]:
data

[[('one', 'CD'),
  ('review', 'NN'),
  ('mention', 'NN'),
  ('watch', 'NN'),
  ('', 'NNP'),
  ('oz', 'VBZ'),
  ('episod', 'JJ'),
  ('youll', 'NN'),
  ('hook', 'NN'),
  ('right', 'RB'),
  ('exactli', 'VBZ'),
  ('happen', 'VB'),
  ('meth', 'NNS'),
  ('first', 'JJ'),
  ('thing', 'NN'),
  ('struck', 'VBD'),
  ('oz', 'JJ'),
  ('brutal', 'JJ'),
  ('unflinch', 'JJ'),
  ('scene', 'NN'),
  ('violenc', 'NN'),
  ('set', 'VBN'),
  ('right', 'RB'),
  ('word', 'NN'),
  ('go', 'VB'),
  ('trust', 'NN'),
  ('show', 'NN'),
  ('faint', 'JJ'),
  ('heart', 'NN'),
  ('timid', 'NN'),
  ('show', 'NN'),
  ('pull', 'JJ'),
  ('punch', 'JJ'),
  ('regard', 'JJ'),
  ('drug', 'NN'),
  ('sex', 'NN'),
  ('violenc', 'NN'),
  ('hardcor', 'NN'),
  ('classic', 'JJ'),
  ('use', 'NN'),
  ('wordit', 'NN'),
  ('call', 'NN'),
  ('oz', 'NN'),
  ('nicknam', 'JJ'),
  ('given', 'VBN'),
  ('oswald', 'RP'),
  ('maximum', 'JJ'),
  ('secur', 'JJ'),
  ('state', 'NN'),
  ('penitentari', 'NN'),
  ('focus', 'NN'),
  ('mainli', 'NN'),
  ('

In [84]:
# lemmatizing, using wordnet with pos
def lemmatizing(data):
    wnl = WordNetLemmatizer()
    lemmatize_data = []
    
    tag = { "NNS": 'n',
            "NN" : 'n',
            "NNP": 'n',
            "JJ" : 'a',
            "VBD": 'v',
            "VBZ": 'v',
            "VB" : 'v',
            "VBN": 'v',
            "VBP": 'v',
            "RB" : 'r'  }
    
    for corpus in data:
        temp_list = []
        for word in corpus:
            if word[1] in tag.keys():
                
                if ( word[1] == 'NNS' ) or (word[1] == 'NN') or (word[1] == 'NNP'):
                    temp = wnl.lemmatize(word[0], pos = 'n')
                
                elif ( word[1] == 'JJ'):
                    temp=  wnl.lemmatize(word[0], pos = 'a')
                
                elif ( word[1] == 'VBD') or (word[1] == 'VBZ') or (word[1] == 'VB') or (word[1] == 'VBN') or (word[1] == 'VBP'):
                    temp = wnl.lemmatize(word[0], pos = 'v')
                
                elif (word[1] == 'RB'):
                    temp = wnl.lemmatize(word[0], pos = 'r')
            
                
                temp_list.append(temp)

        
            else:
                temp_list.append(word[0])
                
        lemmatize_data.append(temp_list)
        

    return lemmatize_data

In [85]:
data = lemmatizing(data)

In [86]:
data

[['one',
  'review',
  'mention',
  'watch',
  '',
  'oz',
  'episod',
  'youll',
  'hook',
  'right',
  'exactli',
  'happen',
  'meth',
  'first',
  'thing',
  'strike',
  'oz',
  'brutal',
  'unflinch',
  'scene',
  'violenc',
  'set',
  'right',
  'word',
  'go',
  'trust',
  'show',
  'faint',
  'heart',
  'timid',
  'show',
  'pull',
  'punch',
  'regard',
  'drug',
  'sex',
  'violenc',
  'hardcor',
  'classic',
  'use',
  'wordit',
  'call',
  'oz',
  'nicknam',
  'give',
  'oswald',
  'maximum',
  'secur',
  'state',
  'penitentari',
  'focus',
  'mainli',
  'emerald',
  'citi',
  'experiment',
  'section',
  'prison',
  'cell',
  'glass',
  'front',
  'face',
  'inward',
  'privaci',
  'high',
  'agenda',
  'em',
  'citi',
  'home',
  'manyaryan',
  'muslim',
  'gangsta',
  'latino',
  'christian',
  'italian',
  'irish',
  'moreso',
  'scuffl',
  'death',
  'stare',
  'dodgi',
  'deal',
  'shadi',
  'agreement',
  'never',
  'far',
  'awayi',
  'would',
  'say',
  'main',
  

### Bag of words and Tf-idf models

In [87]:
# bag of words
def bow(data):
    BoW_list = []
    word_dict = {}
    
    for corpus in data:
        for word in corpus:
            
            if word not in word_dict:
                word_dict[word] = 1     
            else:
                word_dict[word] += 1
    
    
    for corpus in data:
        temp_vector = []
        for word in word_dict:
            
            if word in corpus:
                temp = corpus.count(word)
                temp_vector.append(temp)
            else:
                temp_vector.append(0)
                
                
        BoW_list.append(temp_vector)
    
    BoW_array = np.array(BoW_list)
    BoW = sparse.csr_matrix(BoW_array)

    return BoW

In [88]:
BoW = bow(data)

In [89]:
BoW.shape

(100, 3912)

In [90]:
# for 100 smaple
BoW_train = BoW[0:50]
BoW_test = BoW[50:100]

In [91]:
BoW_train.shape

(50, 3912)

In [92]:
BoW_test.shape

(50, 3912)

In [93]:
# split train and test data for 50000 corpus - BoW model
# def train_test_BoW(BoW, labels):
#     BoW_train, BoW_test, label_train, label_test = train_test_split(BoW, label2num(label), test_size = 0.3, random_state = 42)
    
#     return BoW_train, BoW_test, label_train, label_test

In [94]:
# tf_idf
def tf_idf(data):
    tfidf_list = []
    for corpus in data:
        temp = " ".join(corpus)
        
        tfidf_list.append(temp)
        
    train_data = tfidf_list[0:50]
    test_data = tfidf_list[50:100]
        
    vectorizer = TfidfVectorizer()
    
    tfidf_train = vectorizer.fit_transform(train_data)
    tfidf_test = vectorizer.transform(test_data)
    
    # for 50000 sample
    #tfidf_train, tfidf_test, label_train, label_test = train_test_split(BoW, label2num(label), test_size = 0.3, random_state = 42)
    
    return tfidf_train, tfidf_test

In [95]:
tfidf_train, tfidf_test = tf_idf(data)

In [96]:
# label to num
def label2num(label):
    label2num = []
    for i in label:
        if i == "positive":
            label2num.append(1)
        else:
            label2num.append(0)
    
    #labels = np.array(label2num(label))
    
    return label2num

In [97]:
labels = np.array(label2num(label))

In [98]:
train_labels = labels[0:50]

In [99]:
test_labels = labels[50:100]

### training with Logistic regression

In [100]:
# logistic regression for BoW
def logisticreg_BoW(BoW_train, BoW_test, train_labels, test_labels):
    lr = LogisticRegression()
    
    LR_BoW = lr.fit(BoW_train, train_labels)
    BoW_pre = LR_BoW.predict(BoW_test)
    BoW_accuracy = accuracy_score(test_labels, BoW_pre)
    
    return BoW_accuracy

In [101]:
BoW_result = logisticreg_BoW(BoW_train, BoW_test, train_labels, test_labels)

In [102]:
BoW_result

0.78

In [103]:
# logistic regression for Tf-idf
def logisticreg_tfidf(tfidf_train, tfidf_test, train_labels, test_labels):
    lr = LogisticRegression()
    
    LR_tfidf = lr.fit(tfidf_train, train_labels)
    tfidf_pre = LR_tfidf.predict(tfidf_test)
    tfidf_accuracy = accuracy_score(test_labels, tfidf_pre)
    
    return tfidf_accuracy

In [104]:
tfidf_result = logisticreg_tfidf(tfidf_train, tfidf_test, train_labels, test_labels)

In [105]:
tfidf_result

0.7

### SVM model

In [106]:
# grid search
def grid_search(BoW_train, BoW_test, train_labels, test_labels):
    gridsearch = []
    parameters = []

    C = [2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 2**0, 2**1, 2**2, 2**3, 2**4, 2**5]
    kernel = ['linear', 'poly', 'rbf']
    
    for i in C:
        for j in kernel:
            parameters.append((i,j))
    
    for i in range(len(parameters)):
        SVM = svm.SVC(C = parameters[i][0], kernel = parameters[i][1])
        model = SVM.fit(BoW_train, train_labels)
        model_predict = model.predict(BoW_test)
        model_accuracy = accuracy_score(test_labels, model_predict)
            
        gridsearch.append((parameters[i][0], parameters[i][1], model_accuracy))
            
    return gridsearch

In [107]:
gridsearch_result = grid_search(BoW_train, BoW_test, train_labels, test_labels)

In [108]:
gridsearch_result

[(0.03125, 'linear', 0.78),
 (0.03125, 'poly', 0.62),
 (0.03125, 'rbf', 0.62),
 (0.0625, 'linear', 0.78),
 (0.0625, 'poly', 0.62),
 (0.0625, 'rbf', 0.62),
 (0.125, 'linear', 0.78),
 (0.125, 'poly', 0.62),
 (0.125, 'rbf', 0.62),
 (0.25, 'linear', 0.78),
 (0.25, 'poly', 0.62),
 (0.25, 'rbf', 0.62),
 (0.5, 'linear', 0.78),
 (0.5, 'poly', 0.62),
 (0.5, 'rbf', 0.62),
 (1, 'linear', 0.78),
 (1, 'poly', 0.62),
 (1, 'rbf', 0.74),
 (2, 'linear', 0.78),
 (2, 'poly', 0.62),
 (2, 'rbf', 0.72),
 (4, 'linear', 0.78),
 (4, 'poly', 0.62),
 (4, 'rbf', 0.84),
 (8, 'linear', 0.78),
 (8, 'poly', 0.62),
 (8, 'rbf', 0.88),
 (16, 'linear', 0.78),
 (16, 'poly', 0.62),
 (16, 'rbf', 0.88),
 (32, 'linear', 0.78),
 (32, 'poly', 0.62),
 (32, 'rbf', 0.88)]

In [132]:
# SVM for BoW
def svm_model(BoW_train, BoW_test, train_labels, test_labels):
    SVM = svm.SVC(C = 32, kernel = 'rbf')
    BoW_svm = SVM.fit(BoW_train, train_labels)
    
    SVM_predict = BoW_svm.predict(BoW_test)
    SVM_accuracy = accuracy_score(test_labels, SVM_predict)
    
    return SVM_accuracy

In [133]:
svm_result = svm_model(BoW_train, BoW_test, train_labels, test_labels)

In [134]:
svm_result

0.88