In [13]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score
import pickle

In [2]:
dataset = pd.read_csv('../datasets/IMDB Dataset.csv')

In [3]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text)
    return text

dataset['review']=dataset['review'].apply(denoise_text)

In [5]:
vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii')

In [6]:
X = vectorizer.fit_transform(dataset.review)
y = dataset.sentiment
pickle.dump(vectorizer, open('../tranform.pkl', 'wb'))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [14]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [15]:
accuracy_score(y_test,lr.predict(X_test))*100

89.94

In [17]:
filename = '../nlp_model.pkl'
pickle.dump(lr, open(filename, 'wb'))