In [None]:
import nltk
# Fetch the NLTK resources used later in this notebook.
# 'stopwords' backs stopwords.words('english'); 'punkt_tab' is tokenizer data,
# presumably what word_tokenize needs on the installed NLTK version — TODO confirm.
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import pandas as pd

# Load the reviews; the path is resolved relative to the current working directory.
df = pd.read_csv('IMDB Dataset.csv')

# Quick look at the raw data: first rows, then column dtypes and non-null counts.
print("Dataset Head:")
print(df.head())
print("\nDataset Info:")
df.info()

# Encode the string labels as integers (positive -> 1, negative -> 0).
# Series.map yields NaN for any value missing from the dict, so an unexpected
# label spelling would show up as NaN here rather than raising an error.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
print("\nFirst 5 rows after mapping sentiment to numbers:")
print(df.head())

Dataset Head:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

First 5 rows after mapping sentiment to numbers:
                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spen

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop-word list, stored as a set because clean_text tests membership
# once per token and set lookups avoid scanning a list each time.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Normalise a raw review into a space-separated string of content words.

    Steps, in order:
    - Replaces HTML tags with a space
    - Replaces punctuation, digits and any other non-ASCII-letter character
      with a space
    - Converts to lowercase
    - Tokenizes with NLTK's word_tokenize
    - Removes English stop words

    Args:
        text (str): The raw review text. Passing a non-string raises
            TypeError from ``re.sub``.

    Returns:
        str: The remaining tokens joined by single spaces; an empty string
        when nothing survives the filtering.
    """
    # 1. Replace HTML tags with a space. Substituting an empty string would
    #    glue together the words on either side of a tag
    #    (e.g. "great<br />movie" -> "greatmovie"), creating bogus tokens.
    text = re.sub(r'<.*?>', ' ', text)

    # 2. Replace punctuation and non-alphabetic characters with spaces
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # 3. Convert to lowercase so tokens match the lowercase stop-word list
    text = text.lower()

    # 4. Tokenize the text (split into words)
    words = word_tokenize(text)

    # 5. Remove stop words
    cleaned_words = [word for word in words if word not in stop_words]

    return ' '.join(cleaned_words)

# Derive a normalised copy of every review and keep it alongside the raw text.
print("\nCleaning text data... this may take a moment.")
df['cleaned_review'] = df['review'].apply(clean_text)

# Show the first row (index label 0) before and after cleaning as a sanity check.
print("Cleaning complete. Here's a comparison:")
for heading, column in (
    ("\nOriginal Review:", 'review'),
    ("\nCleaned Review:", 'cleaned_review'),
):
    print(heading)
    print(df.loc[0, column])


Cleaning text data... this may take a moment.
Cleaning complete. Here's a comparison:

Original Review:
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
# max_features=5000 means we only consider the 5000 most frequent words
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Create the TF-IDF features (X) and the target labels (y)
# NOTE(review): fit_transform here learns the vocabulary and IDF weights from
# every row of df, including rows that a later train/test split holds out.
# Metrics computed on a split of X are therefore slightly optimistic unless the
# vectorizer is refitted on the training rows only.
X = tfidf_vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment']

# One row per review, one column per vocabulary term.
print("\nShape of the TF-IDF feature matrix (reviews, vocabulary_size):")
print(X.shape)


Shape of the TF-IDF feature matrix (reviews, vocabulary_size):
(50000, 5000)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split the cleaned reviews into 80% for training and 20% for testing.
# The split is done on the text rather than on a TF-IDF matrix fitted on the
# whole corpus, so that no vocabulary or IDF statistics from the held-out
# reviews leak into the training features. random_state is fixed so the same
# rows land in each partition on every run.
train_reviews, test_reviews, y_train, y_test = train_test_split(
    df['cleaned_review'], y, test_size=0.2, random_state=42
)

# Fit the vectorizer on the training reviews only, then reuse that fitted
# vocabulary for the test reviews ('transform', not 'fit_transform').
# tfidf_vectorizer is refitted in place, so later calls to its transform()
# produce features that line up with the model trained below.
X_train = tfidf_vectorizer.fit_transform(train_reviews)
X_test = tfidf_vectorizer.transform(test_reviews)

# Initialize and train the Logistic Regression model
print("\nTraining the Logistic Regression model...")
model = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
model.fit(X_train, y_train)
print("Model training complete!")


Training the Logistic Regression model...
Model training complete!


In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate accuracy and F1-score
# f1_score is called with its defaults, so it scores the class labelled 1
# (the 'positive' reviews after the earlier label mapping).
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"\nModel Accuracy: {accuracy:.4f}")
print(f"Model F1-Score: {f1:.4f}")

# Display a detailed classification report
# target_names are listed in ascending label order: 0 -> 'Negative', 1 -> 'Positive'.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))


Model Accuracy: 0.8924
Model F1-Score: 0.8947

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      4961
    Positive       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [None]:
def predict_sentiment(review_text):
    """
    Classify a single raw review as "Positive" or "Negative".

    The text goes through the same clean_text preprocessing as the training
    data, is projected onto the already-fitted TF-IDF vocabulary, and is then
    scored by the trained model.
    """
    # The vectorizer expects an iterable of documents, hence the one-element
    # list. Only 'transform' is used: the vocabulary must stay as it was fitted.
    features = tfidf_vectorizer.transform([clean_text(review_text)])

    # predict returns one label per input row; there is exactly one row here.
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return "Positive"
    return "Negative"

# --- Example Usage ---
# Two hand-written reviews, one clearly favourable and one clearly unfavourable,
# used as a smoke test of the full clean -> vectorize -> predict path.
new_review_1 = "This was an absolutely fantastic movie. The acting was superb and the plot was thrilling!"
new_review_2 = "I was really disappointed. The story was boring and it felt way too long."

print("\n--- Testing with new reviews ---")
print(f"Review: '{new_review_1}'")
# The trailing "\n" leaves a blank line between the two examples.
print(f"Predicted Sentiment: {predict_sentiment(new_review_1)}\n")

print(f"Review: '{new_review_2}'")
print(f"Predicted Sentiment: {predict_sentiment(new_review_2)}")


--- Testing with new reviews ---
Review: 'This was an absolutely fantastic movie. The acting was superb and the plot was thrilling!'
Predicted Sentiment: Positive

Review: 'I was really disappointed. The story was boring and it felt way too long.'
Predicted Sentiment: Negative
