In [2]:
import pandas as pd
import numpy as np

df_imdb=pd.read_csv('./Datasets/IMDB_Dataset_cleaned.csv')
df_imdb.sample(10)

Unnamed: 0,review,sentiment
42398,"Terence Stamp can carry off anything, but this...",negative
32208,"Three zany couples, all SIX OF A KIND, become ...",positive
42767,When my 14-year-old daughter and her friends g...,positive
34757,The humor is non-existent in this loser of a m...,negative
40161,I am pretty surprised to see that this movie e...,negative
21669,This was a truly insipid film. The performance...,negative
47137,There's plenty to appreciate here: spectacular...,negative
38200,Eddie Fischer was simply bad. Possibly the wor...,negative
28775,A young Frenchman uproots himself as he become...,positive
16277,"Police, investigations, murder, suspicion: we ...",positive


import pandas as pd
import numpy as np

df_imdb=pd.read_csv('./Datasets/IMDB Dataset_cleaned.csv')

In [50]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/ajf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ajf/nltk_data...


True

In [None]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer


# Initialize tokenizer, lemmatizer, and stemmer
tokenizer = word_tokenize
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
pattern = r'\s{2,}'
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters
    text=re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'[^a-zA-Z\s\'.?!,:;-]', '', text)
    text = re.sub(r'([^\w\s]|_)(?=\1)', '', text)
    text = re.sub(pattern, '', text)
    # Tokenize text
    tokens = tokenizer(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Stem tokens
    tokens = [stemmer.stem(word) for word in tokens]
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
    return text

df_imdb['cleaned_text'] = df_imdb['review'].apply(preprocess_text)
df_imdb.to_csv('./Datasets/IMDB Dataset_cleaned.csv', index=False)

In [21]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into training and testing sets
label_encoder = LabelEncoder()
df_imdb['sentiment'] = label_encoder.fit_transform(df_imdb['sentiment'])
X_train, X_test, y_train, y_test = train_test_split(df_imdb['cleaned_text'], df_imdb['sentiment'], test_size=0.3, random_state=42)

# Define pipelines
count_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])


# Fit the classifier to the training data (CountVectorizer)
tfidf_pipeline.fit(X_train, y_train)

# Make predictions on the testing data (CountVectorizer)
y_pred_count = tfidf_pipeline.predict(X_test)

# Evaluate the performance of the model (CountVectorizer)
accuracy_count = accuracy_score(y_test, y_pred_count)
precision_count = precision_score(y_test, y_pred_count, average='weighted')
recall_count = recall_score(y_test, y_pred_count, average='weighted')
f1_count = f1_score(y_test, y_pred_count, average='weighted')

# Fit the classifier to the training data (TfidfVectorizer)
count_pipeline.fit(X_train, y_train)

# Make predictions on the testing data (TfidfVectorizer)
y_pred_tfidf = count_pipeline.predict(X_test)

# Evaluate the performance of the model (TfidfVectorizer)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
precision_tfidf = precision_score(y_test, y_pred_tfidf, average='weighted')
recall_tfidf = recall_score(y_test, y_pred_tfidf, average='weighted')
f1_tfidf = f1_score(y_test, y_pred_tfidf, average='weighted')

# Compare the performance of models
print("CountVectorizer:")
print("Accuracy:", accuracy_count)
print("Precision:", precision_count)
print("Recall:", recall_count)
print("F1 Score:", f1_count)

print("\nTfidfVectorizer:")
print("Accuracy:", accuracy_tfidf)
print("Precision:", precision_tfidf)
print("Recall:", recall_tfidf)
print("F1 Score:", f1_tfidf)

import joblib
joblib.dump(tfidf_pipeline, 'tfidf_pipeline.joblib')

CountVectorizer:
Accuracy: 0.8588666666666667
Precision: 0.8593527190443919
Recall: 0.8588666666666667
F1 Score: 0.8588535070109312

TfidfVectorizer:
Accuracy: 0.8534
Precision: 0.8539029449817557
Recall: 0.8534
F1 Score: 0.8533849778813225


In [13]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np


# Perform 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Define the scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Store the scores for each metric
scores = {metric: cross_val_score(tfidf_pipeline, df_imdb['cleaned_text'], df_imdb['sentiment'], cv=kf, scoring=metric) for metric in scoring}

# Compute average and standard deviation for each metric
avg_scores = {metric: np.mean(scores[metric]) for metric in scoring}
std_scores = {metric: np.std(scores[metric]) for metric in scoring}

# Print results
for metric in scoring:
    print("Average {}: {:.4f}".format(metric, avg_scores[metric]))
    print("{} Standard Deviation: {:.4f}".format(metric, std_scores[metric]))

Average accuracy: 0.8611
accuracy Standard Deviation: 0.0042
Average precision: 0.8696
precision Standard Deviation: 0.0083
Average recall: 0.8495
recall Standard Deviation: 0.0082
Average f1: 0.8594
f1 Standard Deviation: 0.0045


In [14]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np


# Perform 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Define the scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Store the scores for each metric
scores = {metric: cross_val_score(count_pipeline, df_imdb['cleaned_text'], df_imdb['sentiment'], cv=kf, scoring=metric) for metric in scoring}

# Compute average and standard deviation for each metric
avg_scores = {metric: np.mean(scores[metric]) for metric in scoring}
std_scores = {metric: np.std(scores[metric]) for metric in scoring}

# Print results
for metric in scoring:
    print("Average {}: {:.4f}".format(metric, avg_scores[metric]))
    print("{} Standard Deviation: {:.4f}".format(metric, std_scores[metric]))

Average accuracy: 0.8551
accuracy Standard Deviation: 0.0037
Average precision: 0.8692
precision Standard Deviation: 0.0069
Average recall: 0.8360
recall Standard Deviation: 0.0059
Average f1: 0.8522
f1 Standard Deviation: 0.0038


In [27]:
import joblib
a=joblib.load('tfidf_pipeline.joblib')
predicted_sentiment = []
challenging_comments = [
    "I love it",
    "This movie was absolutely terrible. I couldn't stand it.",
    "I thought this film would be good, but it turned out to be a disappointment.",
    "The acting was mediocre, and the plot was predictable.",
    "I wasn't expecting much from this movie, but it exceeded my expectations.",
    "The cinematography was stunning, but the storyline fell flat.",
    "I found the characters unlikable, and the dialogue was cringeworthy.",
    "Despite its flaws, I found myself thoroughly entertained by this film.",
    "This movie was a waste of time. I regret watching it.",
    "The pacing was off, and the editing was choppy.",
    "I was pleasantly surprised by how much I enjoyed this film."
]
for comment in challenging_comments:
    temp=label_encoder.inverse_transform(a.predict([comment]))
    print(temp)
    predicted_sentiment.append(temp)


['positive']
['negative']
['negative']
['negative']
['negative']
['negative']
['negative']
['negative']
['negative']
['negative']
['negative']
