In [20]:
import pandas as pd 
# Read the IMDB reviews CSV (relative path) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data/IMDB Dataset.csv")

In [21]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(50000, 2)

In [22]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [23]:
# Count how many reviews carry each sentiment label.
data["sentiment"].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [24]:
import re
def preprocessor(text):
    """Normalise one raw review string for whitespace tokenisation.

    Steps, in order:
      1. Remove anything that looks like a markup tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` before
         punctuation is stripped.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` "nose" removed),
         separated from the text and from each other by single spaces.

    Args:
        text: Raw review text (``str``).

    Returns:
        The cleaned, lower-cased text followed by any emoticons that were
        found, e.g. ``"great :) film"`` -> ``"great film :)"``.
    """
    # Raw strings keep the regex escapes (\W, \(, \)) from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # When the text ends in a word character there is no trailing space, so
    # add one; otherwise the first emoticon would be glued onto the last word
    # and survive whitespace tokenisation as a bogus token like "film:)".
    if emoticon_suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

In [25]:
# Overwrite the raw reviews with their cleaned form, one review at a time.
data['review'] = data.review.apply(preprocessor)

In [26]:
import nltk
# Fetch NLTK's 'stopwords' package; the call's return value is echoed as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amit.kumar02\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
from nltk.corpus import stopwords
# English stop-word list from the NLTK corpus.
# NOTE(review): `stop` is not referenced by any other cell visible here —
# confirm whether it was meant to be passed to the vectoriser or can be dropped.
stop=stopwords.words('english')

In [28]:
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance, used by tokenizer_porter below.
porter=PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Whitespace-tokenise ``text`` and stem every token.

    Uses the module-level ``porter`` stemmer instance and returns the stems
    as a list, in the order the tokens appear in ``text``.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser that delegates tokenisation to tokenizer_porter and does
# no text clean-up of its own (no accent stripping, no lower-casing, no
# extra preprocessor).
tfidf = TfidfVectorizer(
    tokenizer=tokenizer_porter,
    preprocessor=None,
    strip_accents=None,
    lowercase=False,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)

In [30]:
# Label array and TF-IDF feature matrix for the whole dataset.
# NOTE(review): the vectoriser is fitted on every review here, before the
# train/test split in the next cell, so its vocabulary and IDF weights also
# reflect the held-out rows — consider fitting on the training rows only.
y = data['sentiment'].values
x = tfidf.fit_transform(data['review'])

In [31]:
from sklearn.model_selection import train_test_split
# Ordered 70/30 hold-out split: with shuffle=False the rows keep their
# original order, and random_state has no influence on the result.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=False,
    random_state=42,
)

In [32]:
from sklearn.linear_model import LogisticRegressionCV
# Logistic regression with built-in 6-fold cross-validation, scored on accuracy.
clf = LogisticRegressionCV(
    cv=6,
    scoring='accuracy',
    max_iter=500,
    random_state=0,
    n_jobs=-1,
    verbose=3,
)
clf.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = clf.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  4.1min remaining:  4.1min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  4.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  4.2min finished


In [33]:
from sklearn import metrics

# Share of held-out reviews whose predicted label equals the true label.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8986666666666666


In [34]:
from sklearn.linear_model import SGDClassifier
# Linear classifier with hinge loss and L2 penalty, trained by stochastic
# gradient descent. Every hyper-parameter is spelled out explicitly.
# random_state is fixed because shuffle=True reorders the training data each
# epoch; without a seed the fitted model (and the accuracy printed in the
# next cell) changes slightly from run to run.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=1000,
    tol=0.001,
    shuffle=True,
    verbose=0,
    epsilon=0.1,
    n_jobs=None,
    random_state=0,
    learning_rate='optimal',
    eta0=0.0,
    power_t=0.5,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=5,
    class_weight=None,
    warm_start=False,
    average=False,
)

# Fit on the training split, then predict labels for the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [35]:
# Share of held-out reviews whose predicted label equals the true label.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8956


In [46]:
from sklearn.linear_model import SGDClassifier
# from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn import metrics

# Candidate classifiers, keyed by the label used in the printed report.
# Each key appears once: a repeated key in a dict literal silently replaces
# the earlier entry.
# NOTE(review): GaussianNB is left out — presumably because it cannot consume
# the sparse TF-IDF matrix; confirm before re-adding it.
algorithm = {
    "MultinomialNB": MultinomialNB(),
    # Seeded so the SGD result is reproducible across runs.
    "SGDClassifier": SGDClassifier(random_state=0),
    "LogisticRegression": LogisticRegression(),
}

# Fit each candidate on the training split and report its hold-out accuracy.
# The loop variable is `name`, not `id`, so the builtin id() is not shadowed
# for the rest of the kernel session.
for name, classifier in algorithm.items():
    clf = classifier
    print("Training for : " + name)
    # fit() returns the estimator itself, so `model` is an alias of `clf`.
    model = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Model accuracy: how often is the classifier correct?
    print("Accuracy for " + name + ":", metrics.accuracy_score(y_test, y_pred))

Training for : MultinomialNB
Accuracy forMultinomialNB: 0.8559333333333333
Training for : SGDClassifier
Accuracy forSGDClassifier: 0.8953333333333333
Training for : LogisticRegression
Accuracy forLogisticRegression: 0.8958
