In [2]:
!pip install kagglehub



In [3]:
import kagglehub

In [4]:
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
print(path)

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
/kaggle/input/imdb-dataset-of-50k-movie-reviews


In [5]:
import pandas as pd
import numpy as np
import re

df = pd.read_csv(path + "/IMDB Dataset.csv")
df.head()

df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

# cleaning the reviews text if they have any html tags or special characters
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # remove special characters and numbers
    text = text.lower()  # convert to lowercase
    return text

# cleaning the reviews
print("Cleaning the reviews...")
df['review'] = df['review'].apply(clean_text)
df.head()

Cleaning the reviews...


Unnamed: 0,review,sentiment,label
0,one of the other reviewers has mentioned that ...,positive,1
1,a wonderful little production the filming tec...,positive,1
2,i thought this was a wonderful way to spend ti...,positive,1
3,basically there s a family where a little boy ...,negative,0
4,petter mattei s love in the time of money is...,positive,1


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

# splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['label'], test_size=0.2, random_state=42)
print("Dataset split into training and testing sets.")
# creating a pipeline that first transforms the data using TF-IDF and then applies Multinomial Naive Bayes
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])
# training the model
print("Training the model...")
pipeline.fit(X_train, y_train)
print("Model trained.")


Dataset split into training and testing sets.
Training the model...
Model trained.


In [7]:
# cheking idf for some selected words
vectorizer = pipeline.named_steps['tfidf']
words_to_check = ['movie', 'film', 'terrible', 'excellent', 'the']
idf_values = vectorizer.idf_
word_to_idf = {word: idf for word, idf in zip(vectorizer.get_feature_names_out(), idf_values)}
for word in words_to_check:
    if word in word_to_idf:
        print(f"IDF for '{word}': {word_to_idf[word]}")
    else:
        print(f"'{word}' not found in the vocabulary.")



IDF for 'movie': 1.4929288488195736
IDF for 'film': 1.5886320009735266
IDF for 'terrible': 3.9150993702240418
IDF for 'excellent': 3.6359871243954616
IDF for 'the': 1.0088387267195897


In [8]:
y_pred = pipeline.predict(X_test)
print("Predictions made on the test set.")
print("Predicted labels:", y_pred.tolist()[:10])
# evaluating the model
print("Evaluating the model...")
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
# getting misclassified examples
misclassified_indices = np.where(y_test != y_pred)[0]
misclassified_examples = X_test.iloc[misclassified_indices]
print("Misclassified Examples:")
print(misclassified_examples[:10])
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Predictions made on the test set.
Predicted labels: [1, 1, 0, 1, 0, 1, 1, 0, 0, 0]
Evaluating the model...
Misclassified Examples:
49498    okay  i didn t get the purgatory thing the fir...
12144    i can t help but notice the negative reviews t...
33109    three kids are born during a solar eclipse and...
14644    eyeliner was worn nearly      years ago in egy...
17523    wasn t sure what to expect from this movie con...
24712    a ruthless assassin has been hired to eliminat...
43824    although this series and the mini film in part...
725      as a long time fan of all the star trek series...
2530     i actually didn t start watching the show unti...
36056    i think it s the sort of film you either love ...
Name: review, dtype: object
Accuracy: 0.8638
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      4961
           1       0.88      0.84      0.86      5039

    accuracy                           0.86    

In [9]:
def predict_sentiment(review):
    review = clean_text(review)
    prediction = pipeline.predict([review])
    return 'Positive Review' if prediction[0] == 1 else 'Negative Review'

print("Testing the sentiment prediction function...")
test_review = "I absolutely loved this movie! The plot was thrilling and the characters were well-developed."
predicted_sentiment = predict_sentiment(test_review)
print(f"Review: {test_review}\nPredicted Sentiment: {predicted_sentiment}")

Testing the sentiment prediction function...
Review: I absolutely loved this movie! The plot was thrilling and the characters were well-developed.
Predicted Sentiment: Positive Review


In [12]:
# analysis
confusion_matrix_result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_matrix_result)

Confusion Matrix:
[[4404  557]
 [ 805 4234]]


**Short description for my Classifier:**
I have used IMDB Dataset.csv dataset which contains around 60k negative and poitive reviews for movies and i have trained the model using it by MultinomialNB(multi-naive-bays classification method) which uses tf-idf vectorizer method (TfidfVectorizer) which gives for each word a value tf and idf i have tried to show some idf values for some word i assume they will have higher occurance in movie reviews.

for the training i get the confusion matrix
 [[4404  557]
 [ 805 4234]] which give me about 86.38% of accuracy, I have shown some 10 misclassified items in the above code.

