In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import kagglehub
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK resources used by the tokenizer, stop-word filter and lemmatizer.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
# Left as the cell's final expression, exactly as in the original cell.
nltk.download('punkt_tab')

In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# CSV file to read from the Kaggle dataset
file_path = "IMDB Dataset.csv"

# Kaggle dataset identifier
dataset_handle = "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"

# Load the latest version as a pandas DataFrame
df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)

df.head()

In [None]:
# Basic Data Exploration
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() -- doing so appends a stray "None" line to the output.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())

In [None]:
# Encode the sentiment strings as integers: positive -> 1, negative -> 0
label_by_sentiment = {'positive': 1, 'negative': 0}
df['sentiment_label'] = df['sentiment'].map(label_by_sentiment)
df.head()

In [None]:
# Sentiment Distribution: number of reviews per sentiment class
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='sentiment', data=df, palette='viridis', ax=ax)
ax.set_title('Sentiment Distribution in IMDB Dataset')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
# Write the image before show(), as the original cell does
fig.savefig('sentiment_distribution.png')
plt.show()

In [None]:
# Review Length Distribution
# Length is the number of whitespace-separated tokens in the raw review.
df['review_length'] = [len(review.split()) for review in df['review']]
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['review_length'], bins=50, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Review Lengths')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
fig.savefig('review_length_distribution.png')
plt.show()

In [None]:
# Review Length by Sentiment (box plot, one box per sentiment class)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='sentiment', y='review_length', data=df, palette='Set2', ax=ax)
ax.set_title('Review Length by Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Review Length (Words)')
fig.savefig('review_length_by_sentiment.png')
plt.show()

In [None]:
# Review Length by Sentiment (Violin Plot)
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x='sentiment', y='review_length', data=df, palette='Set3', ax=ax)
ax.set_title('Review Length Distribution by Sentiment (Violin Plot)')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Review Length (Words)')
fig.savefig('review_length_violin.png')
plt.show()

In [None]:
#  Text Preprocessing
# Compiled once: both patterns are applied to every review in the dataset.
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the NLTK resources are only touched when needed.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, strip HTML tags, drop every character that is
    not an ASCII letter or whitespace, tokenize, remove English stop words,
    lemmatize.

    Args:
        text: The raw review string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    stop_words, lemmatizer = _text_resources()

    text = text.lower()
    # Replace tags with a space, not an empty string: text such as
    # "great.<br /><br />next" would otherwise fuse into the single token
    # "greatnext" once the punctuation is removed below.
    text = _HTML_TAG_RE.sub(' ', text)
    # Remove special characters and numbers
    text = _NON_LETTER_RE.sub('', text)

    tokens = word_tokenize(text)
    kept = (lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)
    return ' '.join(kept)

# Clean every review and keep the result in its own column
df['cleaned_review'] = df['review'].map(preprocess_text)
df.head()

In [None]:
# Word Cloud for Positive Reviews
# All cleaned positive reviews are concatenated into one text for the cloud.
positive_reviews = ' '.join(df.loc[df['sentiment'] == 'positive', 'cleaned_review'])
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=200,
).generate(positive_reviews)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('Word Cloud for Positive Reviews')
ax.axis('off')
fig.savefig('positive_wordcloud.png')
plt.show()

In [None]:
# Word Cloud for Negative Reviews
# All cleaned negative reviews are concatenated into one text for the cloud.
negative_reviews = ' '.join(df.loc[df['sentiment'] == 'negative', 'cleaned_review'])
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=200,
).generate(negative_reviews)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('Word Cloud for Negative Reviews')
ax.axis('off')
fig.savefig('negative_wordcloud.png')
plt.show()

In [None]:
# Top 10 Common Words in Positive Reviews
positive_text = ' '.join(df.loc[df['sentiment'] == 'positive', 'cleaned_review'])
positive_words = positive_text.split()
positive_word_freq = Counter(positive_words)
# most_common(10) yields (word, count) pairs, most frequent first
common_positive = pd.DataFrame(
    positive_word_freq.most_common(10),
    columns=['Word', 'Frequency'],
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Frequency', y='Word', data=common_positive, palette='Blues_d', ax=ax)
ax.set_title('Top 10 Common Words in Positive Reviews')
fig.savefig('top_positive_words.png')
plt.show()

In [None]:
# Top 10 Common Words in Negative Reviews
negative_text = ' '.join(df.loc[df['sentiment'] == 'negative', 'cleaned_review'])
negative_words = negative_text.split()
negative_word_freq = Counter(negative_words)
# most_common(10) yields (word, count) pairs, most frequent first
common_negative = pd.DataFrame(
    negative_word_freq.most_common(10),
    columns=['Word', 'Frequency'],
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Frequency', y='Word', data=common_negative, palette='Reds_d', ax=ax)
ax.set_title('Top 10 Common Words in Negative Reviews')
fig.savefig('top_negative_words.png')
plt.show()

In [None]:
# Review Length KDE by Sentiment (one filled density curve per class)
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=df, x='review_length', hue='sentiment', fill=True, palette='Set1', ax=ax)
ax.set_title('Review Length Density by Sentiment')
ax.set_xlabel('Review Length (Words)')
ax.set_ylabel('Density')
fig.savefig('review_length_kde.png')
plt.show()

In [None]:
# Split the data: 80% train / 20% test, with a fixed seed for repeatability
features = df['cleaned_review']
targets = df['sentiment_label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)

In [None]:
# DistilBERT Model: tokenizer and two-label classifier from the same checkpoint
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Encode reviews for DistilBERT
def encode_reviews(reviews, max_length=128):
    """Tokenize a collection of reviews with the notebook-level `tokenizer`.

    Args:
        reviews: Object exposing `.tolist()` that yields the review strings.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        The tokenizer output, requested as PyTorch tensors with padding and
        truncation enabled.
    """
    return tokenizer(
        reviews.tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )

# Encode the training split first, then the test split
train_encodings, test_encodings = (encode_reviews(split) for split in (X_train, X_test))

In [None]:
# Create PyTorch dataset
class IMDBDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with sentiment labels.

    Args:
        encodings: Mapping from field name to a tensor indexable by sample.
        labels: Label container supporting `len()` and positional `.iloc[]`.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            # Copy the row so callers never alias the stored encodings
            sample[field] = tensor[idx].clone().detach()
        # Positional lookup: the labels keep whatever index they arrived with
        sample['labels'] = torch.tensor(self.labels.iloc[idx])
        return sample

# Wrap each split's encodings together with its labels
train_dataset, test_dataset = (
    IMDBDataset(train_encodings, y_train),
    IMDBDataset(test_encodings, y_test),
)

In [None]:
# Check for GPU availability: use CUDA when present, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Training arguments
training_args = TrainingArguments(
    # Output and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    report_to='none',
    # Schedule and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy='epoch',
    # Half precision is requested only when CUDA is available
    fp16=torch.cuda.is_available(),
)

In [None]:
from transformers import TrainerCallback

# Custom callback to track accuracy
class AccuracyCallback(TrainerCallback):
    """Record training and validation accuracy while the Trainer runs.

    One value is appended to `eval_accuracies` for every evaluation event and
    one to `train_accuracies` at the end of every epoch.

    NOTE(review): the hooks read the notebook-level names `trainer`,
    `train_dataset`, `test_dataset`, `y_train` and `y_test`, so those must be
    defined before training starts.
    """

    def __init__(self):
        self.train_accuracies = []
        self.eval_accuracies = []

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Append the validation accuracy for this evaluation and print it."""
        # If the evaluation metrics already carry an accuracy value (e.g. from
        # a compute_metrics function), reuse it rather than running a second
        # full inference pass over the test set.
        reported = (metrics or {}).get('eval_accuracy')
        if reported is not None:
            eval_acc = float(reported)
        else:
            eval_logits = trainer.predict(test_dataset).predictions
            eval_preds = np.argmax(eval_logits, axis=1)
            eval_acc = accuracy_score(y_test, eval_preds)
        self.eval_accuracies.append(eval_acc)
        # Log validation accuracy
        print(f"Epoch {state.epoch}: Validation Accuracy = {eval_acc:.4f}")

    def on_epoch_end(self, args, state, control, **kwargs):
        """Append the accuracy over the training set for this epoch and print it."""
        # Exact, not approximate: prediction runs over all of train_dataset,
        # so this adds a full extra inference pass to every epoch.
        train_logits = trainer.predict(train_dataset).predictions
        train_preds = np.argmax(train_logits, axis=1)
        train_acc = accuracy_score(y_train, train_preds)
        self.train_accuracies.append(train_acc)
        print(f"Epoch {state.epoch}: Training Accuracy = {train_acc:.4f}")

# Initialize callback (this instance accumulates the per-epoch accuracy lists)
accuracy_callback = AccuracyCallback()

In [None]:
# Trainer: the test split doubles as the evaluation set, and the accuracy
# callback is registered so it receives the training events.
trainer = Trainer(
    args=training_args,
    model=model,
    callbacks=[accuracy_callback],
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Train and evaluate
# Fine-tune the model, then run one more evaluation after training has
# finished and print the metrics it returns.
trainer.train()
eval_results = trainer.evaluate()
print("\nDistilBERT Evaluation Results:", eval_results)

In [None]:
# Accuracy Over Epochs
# The validation list can hold more entries than there were epochs: the
# trainer.evaluate() call made after training records one more value for the
# already-finished model. Plot only the epochs present in both series so the
# x-axis does not show an epoch that never ran and no None padding is needed.
train_history = list(accuracy_callback.train_accuracies)
eval_history = list(accuracy_callback.eval_accuracies)
n_epochs = min(len(train_history), len(eval_history))
epochs = range(1, n_epochs + 1)

plt.figure(figsize=(10, 6))
plt.plot(epochs, train_history[:n_epochs], label='Training Accuracy', marker='o', color='blue')
plt.plot(epochs, eval_history[:n_epochs], label='Validation Accuracy', marker='o', color='green')
plt.title('Training and Validation Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.savefig('accuracy_over_epochs.png')
plt.show()

In [None]:
# Predictions for evaluation: run the trained model over the test set and
# take the index of the highest logit in each row as the predicted label.
predictions = trainer.predict(test_dataset)
test_logits = predictions.predictions
pred_labels = np.argmax(test_logits, axis=1)

In [None]:
# Confusion Matrix of true labels against predicted labels
cm = confusion_matrix(y_test, pred_labels)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix - DistilBERT')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.savefig('confusion_matrix.png')
plt.show()


In [None]:
# Classification Report Metrics
report = classification_report(y_test, pred_labels, output_dict=True)
# One row per report entry after transposing
report_df = pd.DataFrame(report).transpose()
# Heatmap input: every row except the last, first three columns only
report_view = report_df.iloc[:-1, :3]
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(report_view, annot=True, cmap='YlGnBu', cbar=False, ax=ax)
ax.set_title('Classification Report Metrics - DistilBERT')
fig.savefig('classification_report.png')
plt.show()

In [None]:
# Prediction Distribution: how many test reviews landed in each predicted class
pred_df = pd.DataFrame({'Prediction': pred_labels})
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Prediction', data=pred_df, palette='Set1', ax=ax)
ax.set_title('Distribution of Predicted Sentiments (DistilBERT)')
ax.set_xlabel('Predicted Sentiment (0: Negative, 1: Positive)')
ax.set_ylabel('Count')
fig.savefig('prediction_distribution.png')
plt.show()


In [None]:
# Save the model and tokenizer side by side in one directory
save_dir = './distilbert_imdb_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("\nModel and tokenizer saved to './distilbert_imdb_model'")

In [None]:
# Function for user input prediction
def predict_sentiment(review, model, tokenizer):
    """Classify one raw review as 'Positive' or 'Negative'.

    The review goes through `preprocess_text` and is tokenized with the same
    truncation and max length used elsewhere in the notebook.

    Args:
        review: Raw review text.
        model: Sequence-classification model whose output exposes `.logits`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        'Positive' when the highest logit is at index 1, otherwise 'Negative'.
    """
    # Preprocess the input review
    cleaned_review = preprocess_text(review)
    # Encode the review
    encodings = tokenizer([cleaned_review], truncation=True, padding=True, max_length=128, return_tensors='pt')
    # Put the inputs on the device of the model actually passed in; the
    # notebook-level `device` may not match it (e.g. a freshly loaded CPU model).
    model_device = next(model.parameters()).device
    encodings = {key: val.to(model_device) for key, val in encodings.items()}

    # Switch to eval mode for the forward pass so dropout is disabled, then
    # restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**encodings).logits
    finally:
        model.train(was_training)

    prediction = torch.argmax(logits, dim=1).item()
    return 'Positive' if prediction == 1 else 'Negative'

In [None]:
# User input loop: classify reviews typed by the user until they enter 'exit'
while True:
    # Strip once up front so that " exit " quits instead of being classified,
    # and whitespace-only input is rejected by the check below.
    user_review = input("\nEnter a movie review (or type 'exit' to quit): ").strip()
    if user_review.lower() == 'exit':
        break
    if not user_review:
        print("Please enter a valid review.")
        continue
    sentiment = predict_sentiment(user_review, model, tokenizer)
    print(f"Predicted Sentiment: {sentiment}")