# Import Related Libraries and Dataset

In [None]:
pip install git+ssh://git@github.com/Debkumar000/text_preprocess_dkbera.git

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
import text_preprocess_dkbera as tp




In [4]:
df = pd.read_csv("D:\\Projects\\NLP Practice\\IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Cleaning and Preprocessing the Data

In [5]:
df['review'] = df['review'].apply(lambda x: tp.remove_accented_chars(x))
df['review'] = df['review'].apply(lambda x: tp.remove_emails(x))
df['review'] = df['review'].apply(lambda x: tp.remove_html_tags(x))
df['review'] = df['review'].apply(lambda x: tp.remove_urls(x))
df['review'] = df['review'].apply(lambda x: tp.cont_exp(x))
df['review'] = df['review'].apply(lambda x: tp.make_base(x))
df['review'] = df['review'].apply(lambda x: tp.remove_special_chars(x))
df['review'] = df['review'].apply(lambda x: str(x).lower())

# df['review'] = df['review'].apply(lambda x: tp.spell_correction(x))

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewer have mention that af...,positive
1,a wonderful little production the filming tech...,positive
2,i think this was a wonderful way to spend time...,positive
3,basically there is a family where a little boy...,negative
4,petter mattei s love in the time of money is a...,positive


In [9]:
df.shape

(50000, 2)

In [10]:
df = df.replace(['positive', 'negative'],[1, 0])

In [11]:
X=df["review"]
y=df["sentiment"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((33500,), (16500,), (33500,), (16500,))

# Model Building

# Logistic Regression

In [17]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='liblinear'))
])

In [18]:
hyperparameters = {
    'tfidf__max_df':(0.5, 1.0),
    'tfidf__ngram_range':((1,1),(1,2)),
    'tfidf__use_idf':(True, False),
    'tfidf__analyzer' : ('word', 'char', 'char_wb'),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (1,2)
}

In [21]:
clf = GridSearchCV(pipe, hyperparameters, n_jobs=1, cv=None)

In [22]:
%%time
clf.fit(X_train, y_train)





CPU times: total: 11h 13min 8s
Wall time: 11h 23min 44s


In [24]:
clf.best_score_

0.9015820895522388

In [25]:
clf.best_params_

{'clf__C': 2,
 'clf__penalty': 'l2',
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 0.5,
 'tfidf__ngram_range': (1, 2),
 'tfidf__use_idf': True}

In [27]:
clf.best_estimator_

In [28]:
y_pred = clf.predict(X_test)

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      8250
           1       0.90      0.92      0.91      8250

    accuracy                           0.90     16500
   macro avg       0.91      0.90      0.90     16500
weighted avg       0.91      0.90      0.90     16500



# Support Vector Classifier

In [30]:
from sklearn.svm import LinearSVC

In [31]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

In [32]:
hyperparameters = {
    'tfidf__max_df':(0.5, 1.0),
    'tfidf__ngram_range':((1,1),(1,2)),
    'tfidf__use_idf':(True, False),
    'tfidf__analyzer' : ('word', 'char', 'char_wb'),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (1,2)
}

In [33]:
clf_svc = GridSearchCV(pipe, hyperparameters, n_jobs=1, cv=None)

In [34]:
%%time
clf_svc.fit(X_train, y_train)







CPU times: total: 3h 34min 42s
Wall time: 3h 38min 55s


In [35]:
clf_svc.best_score_

0.9125970149253732

In [36]:
clf_svc.best_params_

{'clf__C': 2,
 'clf__penalty': 'l2',
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 0.5,
 'tfidf__ngram_range': (1, 2),
 'tfidf__use_idf': True}

In [37]:
clf_svc.best_estimator_

In [38]:
y_pred_svc = clf_svc.predict(X_test)

In [39]:
print(classification_report(y_test, y_pred_svc))

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      8250
           1       0.91      0.92      0.92      8250

    accuracy                           0.92     16500
   macro avg       0.92      0.92      0.92     16500
weighted avg       0.92      0.92      0.92     16500



# Model Testing

In [48]:
rev1 = ["I love this movie, concept is supper. But the acting is not remarkable"]

In [49]:
clf_svc.predict(rev1)

array([1], dtype=int64)

In [51]:
rev2 = ["Not intersting, story is not good, acting is worst, VFX is not remarkable"]

In [52]:
clf_svc.predict(rev2)

array([0], dtype=int64)

# Save Model

In [47]:
import pickle as pkl

In [54]:
pkl.dump(clf_svc, open('svc_model.pkl', 'wb'))

In [55]:
pkl.dump(clf, open('LogisticReg_model.pkl', 'wb'))