In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
#Importing dataset and replacing labels with 0 and 1 for classification
df = pd.read_csv('IMDB Dataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
#Defining stop_words and lemmatizer
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

In [None]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [None]:
#Defining clean_text function
def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

In [None]:
#Creating new column for processed reviews
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

In [None]:
##Deploying SVM model on available data

#Importing libraries
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:

#Defining input and target variable
x = df['Processed_Reviews']
y = df['sentiment']

#Training and splitting
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
#Vectorization and Bag of words method with default parameters
count_vect = CountVectorizer().fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))

In [None]:
#instantiate the model (using the default parameters)
SVM = SVC()

In [None]:
# fit the model with pre-processed data
SVM.fit(bow_train, y_train)

SVC()

In [None]:
#perform classification and prediction on samples in tf_test
predicted_SVM = SVM.predict(bow_test)
print(classification_report(y_test, predicted_SVM))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      5035
           1       0.86      0.89      0.87      4965

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:
##Finding best parameters for Count Vectorizer and SVM model


#Importing libraries
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#Creating a Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('SVM', SVC())
])


In [None]:

#Defining hyperparameters
parameters = {
    'vect__max_df':[0.1,0.2,0.3,0.4,0.5,0.6,0.7],
    'vect__ngram_range':  [(1,1), (1,2), (1,3)],
    'SVM__kernel': ['poly', 'rbf', 'sigmoid'],
    'SVM__C': [50, 10, 1.0, 0.1, 0.01]}

In [20]:
# define grid search
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(pipeline, param_grid=parameters, refit = True, verbose = 3, cv=5)
grid_result = grid_search.fit(df.loc[:500, 'Processed_Reviews'].values.astype('U'), df.loc[:500, 'sentiment'].values.astype('U'))

Fitting 5 folds for each of 315 candidates, totalling 1575 fits
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.574 total time=   0.2s
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.520 total time=   0.2s
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.540 total time=   0.2s
[CV 4/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.570 total time=   0.2s
[CV 5/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 1);, score=0.520 total time=   0.2s
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 2);, score=0.574 total time=   0.6s
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 2);, score=0.520 total time=   0.6s
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.1, vect__ngram_range=(1, 2);, score=0.540 total time=

In [21]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.792495 using {'SVM__C': 10, 'SVM__kernel': 'sigmoid', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 3)}
0.544851 (0.023477) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 1)}
0.542851 (0.017510) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 2)}
0.509050 (0.030670) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.1, 'vect__ngram_range': (1, 3)}
0.558792 (0.026173) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.2, 'vect__ngram_range': (1, 1)}
0.562772 (0.033053) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.2, 'vect__ngram_range': (1, 2)}
0.515030 (0.031595) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.2, 'vect__ngram_range': (1, 3)}
0.576772 (0.028265) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.3, 'vect__ngram_range': (1, 1)}
0.618515 (0.092858) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.3, 'vect