In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk

from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords

In [2]:
data = pd.read_csv('IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data.shape

(50000, 2)

In [4]:
data.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [5]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [6]:
data.replace({'sentiment':{'positive':1 , 'negative':0}}, inplace=True)

  data.replace({'sentiment':{'positive':1 , 'negative':0}}, inplace=True)


In [7]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


In [8]:
port_stem = PorterStemmer()

In [9]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
def stemming(content):
    stemmingcontent = re.sub('[^a-zA-Z]',' ',content)
    stemmingcontent = stemmingcontent.lower()
    stemmingcontent = stemmingcontent.split()
    stemmingcontent = [port_stem.stem(word) for word in stemmingcontent if not word in stopwords.words('english')]
    stemmingcontent = ' '.join(stemmingcontent)
    return stemmingcontent

In [11]:
data['review'] = data['review'].apply(stemming)

In [12]:
data['review'].head()

Unnamed: 0,review
0,one review mention watch oz episod hook right ...
1,wonder littl product br br film techniqu unass...
2,thought wonder way spend time hot summer weeke...
3,basic famili littl boy jake think zombi closet...
4,petter mattei love time money visual stun film...


In [32]:
vectorizer = TfidfVectorizer()

In [33]:
X = data['review']
y = data['sentiment']

In [34]:
X.shape

(50000,)

In [35]:
y.shape

(50000,)

In [36]:
y.head()

Unnamed: 0,sentiment
0,1
1,1
2,1
3,0
4,1


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [19]:
X_train.shape, X_test.shape

((40000,), (10000,))

In [37]:
train_vector = vectorizer.fit_transform(X_train)
test_vector = vectorizer.transform(X_test)

In [39]:
train_vector.shape

(40000, 62591)

In [40]:
log_reg = LogisticRegression()

In [41]:
log_reg.fit(train_vector, y_train)

In [42]:
log_reg_train_pred = log_reg.predict(train_vector)

In [43]:
accuracy_score(log_reg_train_pred, y_train)

0.9251

In [44]:
log_reg_test_pred = log_reg.predict(test_vector)

In [45]:
accuracy_score(log_reg_test_pred, y_test)

0.8917

In [58]:
import joblib

# Save the trained model to a file
joblib.dump(log_reg, 'logistic_regression_model.pkl')


['logistic_regression_model.pkl']

In [46]:
from sklearn.svm import SVC

In [49]:
svc = SVC()

In [50]:
svc.fit(train_vector, y_train)

In [52]:
svm_pred_train = svc.predict(train_vector)

In [53]:
accuracy_score(svm_pred_train, y_train)

0.986825

In [54]:
svm_pred_test =svc.predict(test_vector)

In [55]:
accuracy_score(svm_pred_test, y_test)

0.896

In [59]:
import joblib

# Save the trained model to a file
joblib.dump(svc, 'svc_model.pkl')

['svc_model.pkl']