In [1]:
import pickle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import string
import nltk
from random import sample
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, FeatureHasher
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Linear SVM that is later fitted and pickled; seeded so its training is repeatable.
svm = LinearSVC(max_iter=4000, random_state=0)

# Naive Bayes variants instantiated alongside it.
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb = GaussianNB()

# NLTK word normalizers (the lemmatizer is the one used by the text-cleaning step).
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [3]:
# Load the IMDB reviews from a CSV located relative to the working directory.
# The frame is expected to expose a `review` text column and a `sentiment` label column.
data = pd.read_csv('IMDB Dataset.csv')

In [4]:
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Check the class balance of the sentiment labels before modelling.
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
# One-hot encode the label; drop_first=True leaves a single indicator column
# (the `positive` column used as the target further down).
dummies = pd.get_dummies(data['sentiment'], drop_first=True)

# Attach the indicator next to the review text and discard the textual label.
# NOTE: this rebinds `data`, so the cell is meant to run once per kernel session.
data = pd.concat([data, dummies], axis=1).drop(columns=['sentiment'])

In [7]:
# Draw the 40,000 training reviews. A fixed random_state makes the sample
# (and therefore the fitted vectorizer and model) reproducible across
# "Restart Kernel -> Run All"; 0 matches the seed already used for LinearSVC.
train = data.sample(n=40000, random_state=0)
train

Unnamed: 0,review,positive
36693,Musical bios are all cut of the same cloth. Ho...,1
37331,"In the beginning of this film, one of the comm...",1
7720,"I only saw IPHIGENIA once, almost 30 years ago...",1
8697,Witty and disgusting. Brash and intelligent. B...,1
20938,"I've never been a big Larry Clark fan, but som...",0
...,...,...
7645,Luchino Visconti has become famous to the worl...,1
49755,"If you were ever a fan of MTV""s ""The State,"" t...",1
10680,Mario is invited to Princess Peach's castle fo...,1
24694,I have a deep liking for this film despite it ...,1


In [8]:
# Draw the 3,000 held-out reviews ONLY from rows that are not in `train`.
# Sampling from the full `data` frame again would let most test rows also be
# training rows, which leaks labels and inflates any accuracy measured on it.
# A fixed random_state keeps the hold-out set reproducible.
test = data.drop(train.index).sample(n=3000, random_state=0)
test

Unnamed: 0,review,positive
41972,After reading all of the rave reviews about th...,0
39157,I can remember seeing this movie as a kid in 1...,1
40097,I found this film embarrassing to watch. I fel...,0
8737,"Went to see the movie ""Troy"" this afternoon. H...",0
29380,There are a lot of pretentious people out ther...,0
...,...,...
26019,"As long as there's been 3d technology, (1950's...",0
27319,"""Footlight Parade"" is fascinating on so many l...",1
24154,I have seen virtually all of Cynthia Rothrock'...,1
27661,I first caught the movie on its first run on H...,1


In [9]:
def clear_text(df):
    """Turn the ``review`` column of ``df`` into cleaned, space-joined token strings.

    Each review is lower-cased, stripped of ``<br>`` tags and URLs, stripped of
    punctuation, tokenized with ``nltk.word_tokenize``, reduced to alphabetic
    tokens that are not English stop words (``'not'`` is deliberately kept),
    and lemmatized with the module-level ``lemmatizer``.

    Parameters
    ----------
    df : object supporting ``df['review'].values.tolist()``
        Typically a pandas DataFrame whose ``review`` column holds strings.

    Returns
    -------
    list of str
        One cleaned string per input row, in the same order as the rows.

    Notes
    -----
    NOTE(review): any code that later feeds text to the pickled vectorizer
    presumably needs to apply this same cleaning - confirm against the caller.
    """
    # Everything below is loop-invariant, so build it once instead of per review.
    # `<br />` markers are replaced by a space; merely deleting `<`, `/`, `>`
    # (as the punctuation pass does) would leave stray "br" tokens and glue
    # neighbouring words together (e.g. "end.<br />Next" -> "endbr next").
    br_tag = re.compile(r'<br\s*/?>')
    # Raw string: the pattern contains `\(`, which is an invalid escape sequence
    # in a normal string literal and triggers a warning.
    url = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    symbols = re.compile(r'[,.\"!@#$%^&*(){}?/;`~:<>+=-]')
    table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    # Keep the negation word: it carries sentiment information.
    stop_words.discard('not')

    all_reviews = []
    for sent in df['review'].values.tolist():
        sent = sent.lower()
        # Tags go first so that a URL directly followed by a tag ends at the space.
        sent = br_tag.sub(' ', sent)
        sent = url.sub('', sent)
        sent = symbols.sub('', sent)
        tokens = nltk.word_tokenize(sent)
        # Remove any punctuation the regex above did not cover (e.g. apostrophes).
        stripped = (tok.translate(table) for tok in tokens)
        words = [
            lemmatizer.lemmatize(word)
            for word in stripped
            if word.isalpha() and word not in stop_words
        ]
        all_reviews.append(' '.join(words))
    return all_reviews

# Clean every review in the training sample; the result is a list of strings
# aligned row-for-row with `train`.
train_reviews = clear_text(train)

In [10]:
# Learn a TF-IDF vocabulary from the cleaned training reviews, ignoring terms
# that occur in fewer than 4 documents.
cv = TfidfVectorizer(min_df = 4)
# Keep the SciPy sparse matrix returned by fit_transform: a dense float array of
# (number of training reviews x vocabulary size) can easily exhaust memory,
# and LinearSVC accepts sparse input directly.
Xtr = cv.fit_transform(train_reviews)
# Binary target aligned with the rows of `train` (and therefore of Xtr).
Ytr = train['positive']

In [11]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaking the handle.
with open('cv-transform.pkl', 'wb') as fh:
    pickle.dump(cv, fh)

In [15]:
# Train the linear SVM on the TF-IDF training features and their binary labels.
svm.fit(Xtr, Ytr)

LinearSVC(max_iter=4000, random_state=0)

In [16]:
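# NOTE: `xtest` / `ytest` are not defined in the visible notebook; the
# commented-out evaluation in the following cells uses `Xte` / `Yte` instead.
# pred = svm.predict(xtest)
# print(accuracy_score(ytest, pred))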

In [17]:
# test_reviews = clear_text(test)

# Xte = cv.transform(test_reviews).toarray()
# Yte = test['positive']

In [18]:
# pre = svm.predict(Xte)
# print(accuracy_score(Yte,pre))

In [19]:

# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaking the handle.
with open('svm_model.pkl', 'wb') as fh:
    pickle.dump(svm, fh)