<a href="https://colab.research.google.com/github/BalkrishanSingh/TR102-2302492/blob/main/Project/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install requests beautifulsoup4



In [2]:
import pandas as pd
import numpy as np
import warnings

import joblib

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv("./IMDB Dataset.csv")

In [4]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
data.shape

(50000, 2)

In [6]:
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [7]:
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

In [8]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:

train_data, test_data = train_test_split(data,
                                         test_size=0.2,
                                         random_state=42,
                                         stratify=data['sentiment'])

In [10]:
train_data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,20000
0,20000


In [18]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

X_train_sequences = tokenizer.texts_to_sequences(train_data["review"])
X_test_sequences = tokenizer.texts_to_sequences(test_data["review"])


review_lengths = [len(x) for x in X_train_sequences]

mean_length = np.mean(review_lengths)
median_length = np.median(review_lengths)
percentile_95 = np.percentile(review_lengths, 95)

print("--- Analysis of Original Review Lengths ---")
print(f"Average review length: {mean_length:.0f} words")
print(f"Median review length: {median_length:.0f} words")
print(f"95th percentile length: {percentile_95:.0f} words\n")

--- Analysis of Original Review Lengths ---
Average review length: 212 words
Median review length: 161 words
95th percentile length: 532 words



In [19]:
OPTIMAL_MAXLEN = int(percentile_95)
print(f"--- Setting optimal maxlen to {OPTIMAL_MAXLEN} ---")

X_train = pad_sequences(X_train_sequences, maxlen=OPTIMAL_MAXLEN)
X_test = pad_sequences(X_test_sequences, maxlen=OPTIMAL_MAXLEN)

print("\nData successfully padded. Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

--- Setting optimal maxlen to 532 ---

Data successfully padded. Shape of X_train: (40000, 532)
Shape of X_test: (10000, 532)


In [20]:
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

In [21]:

model = Sequential([
    # Input Layer
    Embedding(input_dim=5000, output_dim=128, input_length=OPTIMAL_MAXLEN),

    Dropout(0.2),https://www.imdb.com/title/tt3498820/review/rw6764079/?ref_=tturv_8

    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),

    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),


    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [22]:

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

history = model.fit(
    X_train,
    Y_train,
    epochs=20,
    batch_size=64,
    validation_data=(X_test, Y_test),
    callbacks=[early_stopping]
)

loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f'\nTest Accuracy: {accuracy*100:.2f}%')

Epoch 1/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 523ms/step - accuracy: 0.7543 - loss: 0.4616 - val_accuracy: 0.8988 - val_loss: 0.2505
Epoch 2/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 519ms/step - accuracy: 0.9085 - loss: 0.2337 - val_accuracy: 0.8973 - val_loss: 0.2459
Epoch 3/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 519ms/step - accuracy: 0.9323 - loss: 0.1852 - val_accuracy: 0.9001 - val_loss: 0.2527
Epoch 4/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 498ms/step - accuracy: 0.9466 - loss: 0.1472 - val_accuracy: 0.8987 - val_loss: 0.2641
Epoch 5/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m338s[0m 523ms/step - accuracy: 0.9608 - loss: 0.1171 - val_accuracy: 0.8997 - val_loss: 0.2888
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 2.

Test Accuracy: 89.73%


In [23]:
model.save("model.keras")

In [24]:
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']

In [6]:
import gradio as gr
import numpy as np
import re
import requests
import joblib
import json
from bs4 import BeautifulSoup
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

OPTIMAL_MAXLEN = 532

# Load the trained model and tokenizer
try:
    model = load_model("./model.keras")
    tokenizer = joblib.load("./tokenizer.pkl")
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    # Exit if we can't load the essential files
    exit()





Model and tokenizer loaded successfully.


In [2]:
def clean_text(text: str) -> str:
    """Applies basic cleaning to the review text."""
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A) # Keep only letters and spaces
    text = text.lower()
    text = text.strip()
    return text

In [3]:
def scrape_imdb_review(url: str) -> dict:
    """
    Scrapes a single review page from IMDb by parsing the embedded JSON data.
    """
    match = re.search(r'(rw\d+)', url)
    if not match:
        return {'error': 'Could not find a valid review ID in the URL.'}

    review_id = match.group(1)
    review_url = f"https://www.imdb.com/review/{review_id}/"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        response = requests.get(review_url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the script tag containing the page's data
        script_tag = soup.find('script', {'id': '__NEXT_DATA__'})

        if not script_tag:
            return {'error': 'Could not find the __NEXT_DATA__ JSON block on the page.'}

        # Extract the JSON content from the script tag
        json_data = json.loads(script_tag.string)

        # Navigate through the JSON to find the review text
        # This path is specific to how IMDb structures its data
        review_text_html = json_data['props']['pageProps']['reviewData']['text']['originalText']['plaidHtml']

        if not review_text_html:
            return {'error': 'Review content was empty in the JSON data.'}

        # The extracted text contains HTML tags like <br/>, so we use BeautifulSoup again
        # just to clean it up and get the plain text.
        text_soup = BeautifulSoup(review_text_html, 'html.parser')
        plain_text = text_soup.get_text(separator=' ', strip=True)

        return {'content': plain_text}

    except requests.exceptions.RequestException as e:
        return {'error': f"Request failed: {e}"}
    except (KeyError, TypeError):
        return {'error': 'Failed to parse JSON. The structure of the page data may have changed.'}
    except Exception as e:
        return {'error': f"An unexpected error occurred: {e}"}

In [4]:
def predictive_system(review_url: str) -> str:
    """
    Takes an IMDb URL, scrapes it, and predicts the sentiment of the review.
    """
    # 1. Scrape the review from the URL
    review_data = scrape_imdb_review(review_url)

    # Handle scraping errors
    if 'error' in review_data:
        return f"Scraping Failed: {review_data['error']}"

    # 2. Preprocess the scraped text
    review_text = review_data['content']
    cleaned_review = clean_text(review_text)

    # 3. Tokenize and Pad the sequence
    sequences = tokenizer.texts_to_sequences([cleaned_review])
    padded_sequence = pad_sequences(sequences, maxlen=OPTIMAL_MAXLEN)

    # 4. Make a prediction
    prediction = model.predict(padded_sequence)
    confidence = prediction[0][0]

    # 5. Format the output
    if confidence > 0.5:
        sentiment = "Positive"
        confidence_percent = confidence * 100
    else:
        sentiment = "Negative"
        confidence_percent = (1 - confidence) * 100

    return f"Sentiment: {sentiment}\nConfidence: {confidence_percent:.2f}%"


In [None]:
title = "IMDb Movie Review Sentiment Analysis"
description = "Enter the URL of an IMDb movie review to analyze its sentiment. The model will predict whether the review is positive or negative and provide a confidence score."
examples = [
    ["https://www.imdb.com/review/rw5399128/"], # Example of a positive review
    ["https://www.imdb.com/title/tt3498820/review/rw7180110/?ref_=tturv_1"]  # Example of a negative review
]

app = gr.Interface(
    fn=predictive_system,
    inputs=gr.Textbox(lines=2, label="IMDb Review URL"),
    outputs=gr.Textbox(label="Analysis Result"),
    title=title,
    description=description,
    examples=examples
)

# Launch the web application
app.launch(share=True)