### Importing Libraries

In [1]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd

### Data Preprocessing

#### Import Dataset

In [2]:
# Read the tab-separated file; it is parsed without a header row, so the two
# columns are named explicitly here.
# NOTE(review): read_csv's default quoting treats '"' as a quote character, so
# reviews containing an unbalanced double quote may get merged into one row.
# If the row count is lower than the number of lines in the file, pass
# quoting=csv.QUOTE_NONE — confirm against the raw file.
dataset = pd.read_csv(
    '../sentiment/imdb_labelled.txt',
    sep='\t',
    names=['Review', 'Label'],
)
dataset.describe()

Unnamed: 0,Label
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


## Creating Corpus

In [3]:
# Build the stop-word set and the stemmer once, outside the loop: both are
# loop-invariant, and rebuilding the stop-word set for every single word is
# needlessly slow.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the reviews themselves rather than a hard-coded row count, so
# the cell keeps working if the number of rows in `dataset` changes.
for raw_review in dataset['Review']:
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)  # replace everything except ASCII letters with a space
    tokens = letters_only.lower().split()  # lower-case, then split on whitespace
    # Drop stop words first, then stem the words that remain.
    stems = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stems))
print(len(corpus))

748


In [4]:
# Preview only the first few cleaned reviews; displaying the whole list floods
# the notebook output.
corpus[:20]

['slow move aimless movi distress drift young man',
 'sure lost flat charact audienc nearli half walk',
 'attempt arti black white clever camera angl movi disappoint becam even ridicul act poor plot line almost non exist',
 'littl music anyth speak',
 'best scene movi gerardo tri find song keep run head',
 'rest movi lack art charm mean empti work guess empti',
 'wast two hour',
 'saw movi today thought good effort good messag kid',
 'bit predict',
 'love cast jimmi buffet scienc teacher',
 'babi owl ador',
 'movi show lot florida best made look appeal',
 'song best muppet hilari',
 'cool',
 'right case movi deliv everyth almost right face',
 'averag act main person low budget clearli see',
 'review long overdu sinc consid tale two sister singl greatest film ever made',
 'put gem movi term screenplay cinematographi act post product edit direct aspect film make',
 'practic perfect true masterpiec sea faux masterpiec',
 'structur film easili tightli construct histori cinema think film so

## Bag-of-Words Model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn each cleaned review into a row of token counts, keeping at most 1000
# vocabulary terms.
# NOTE(review): the vocabulary is learned from the full corpus, i.e. before the
# train/test split that follows — confirm this is acceptable for the evaluation.
cv = CountVectorizer(max_features=1000)
X = cv.fit_transform(corpus).toarray()  # dense array of counts
y = dataset['Label'].values  # second column of the frame: the review labels

## Splitting the dataset into Training and Test set

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Training the Model with a Linear SVM

In [7]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, fitted on the training split.
classifier = SVC(
    kernel='linear',
    random_state=0,
)
# Left as the cell's last expression so the fitted estimator is displayed.
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

## Test the Model

In [8]:
# Predict labels for the held-out rows, then print each prediction next to
# the corresponding true label (predicted first, actual second).
y_pred = classifier.predict(X_test)
for predicted, actual in zip(y_pred, y_test):
    print(predicted, actual)

1 0
0 0
0 1
1 0
0 0
0 0
1 0
1 0
1 0
1 1
0 0
0 0
1 0
0 0
0 0
1 1
1 0
0 1
1 0
1 0
0 0
0 1
1 0
0 0
1 1
0 0
1 1
0 0
0 0
1 1
0 0
0 0
0 1
0 0
0 0
1 1
0 0
1 1
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 0
0 0
0 1
0 0
0 0
1 0
1 1
0 0
1 1
1 1
0 1
1 1
0 0
0 0
0 0
1 1
0 0
0 0
1 1
1 1
0 0
0 0
1 1
1 1
0 0
0 0
1 1
0 0
1 0
1 1
0 0
0 0
0 0
0 0
1 1
1 1
0 0
0 0
0 0
1 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 1
0 0
1 0
1 1
1 1
1 0
1 1
1 1
1 1
0 0
1 0
1 1
0 0
1 1
1 1
0 1
0 1
0 0
0 0
1 1
1 1
0 0
1 1
0 0
0 0
1 0
0 1
0 1
1 1
1 1
0 0
0 0
1 1
1 0
1 1
1 1
0 1
0 1
1 0


## Validate the Model

In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Compare the predictions against the held-out labels.
# confusion_matrix puts true labels on the rows and predicted labels on the columns.
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
# precision_score / recall_score were imported but never used; report them as
# well, since accuracy alone hides how the errors split between the classes.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print("Confusion Matrix")
print(cm)
print("Accuracy: ", ac)
print("Precision: ", precision)
print("Recall: ", recall)

Confusion Matrix
[[64 20]
 [14 52]]
Accuracy:  0.7733333333333333
