In [1]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("ajaykarthick/imdb-movie-reviews")

In [2]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 10000
    })
})


In [3]:
for i in range(5):
    print(f"Example {i}")
    print("Label:", ds["train"][i]["label"])
    print("Review preview:")
    print(ds["train"][i]["review"])
    print("----")


Example 0
Label: 0
Review preview:
Ms Aparna Sen, the maker of Mr & Mrs Iyer, directs this movie about a young girl's struggle to cope with her debilitating condition.<br /><br />Meethi (Konkona Sen) has been an aloof kid ever since childhood and has shown signs of delusion, no one knows why. The dormant tendency however slips out of control, when the job assignment takes her to neighboring Bihar where she's raped by some political goons. The resulting trauma also leads to episodes of manic-depressive psychosis in addition to her schizophrenia. She careens out of control over the years, progressively getting worse and sinking deeper into her private 'world'.<br /><br />The juxtaposition of an 'unsettled' (divorced) elder sister and how her domineering ways make an already bad situation worse, is indicative of what a fine line there is between abnormal and *seemingly normal*. Ms Sen also makes an excellent commentary on the social alienation of such individuals. Social rehab is standard

In [4]:
#check label distribution(dataset is balanced or not)
from collections import Counter

labels = ds["train"]["label"]
Counter(labels)


Counter({0: 20000, 1: 20000})

In [5]:
import pandas as pd
import numpy as np

# Convert HuggingFace dataset to DataFrame for easier handling
train_df = pd.DataFrame(ds["train"])
test_df  = pd.DataFrame(ds["test"])

In [6]:
import re

def extract_style_features(df):
    features = pd.DataFrame()
    features['num_exclamation'] = df['review'].apply(lambda x: x.count('!'))
    features['num_question'] = df['review'].apply(lambda x: x.count('?'))
    features['num_ellipsis'] = df['review'].apply(lambda x: x.count('...') + x.count('…'))
    # Count all-caps words (length >= 2 to avoid single letters)
    features['num_all_caps_words'] = df['review'].apply(lambda x: len(re.findall(r'\b[A-Z]{2,}\b', x)))
    return features
    
train_style_features = extract_style_features(train_df)
test_style_features = extract_style_features(test_df)

In [7]:
print(train_style_features.head())

   num_exclamation  num_question  num_ellipsis  num_all_caps_words
0                1             1             0                   4
1                0             0             0                  15
2                1             0             0                   0
3                4             0             0                   1
4                0             0             0                   0


In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from bs4 import BeautifulSoup
import html

stop_words = set(stopwords.words('english'))

def clean_text(text):

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove literal <br>, <br />, <p>, etc.
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'<p\s*/?>', ' ', text, flags=re.IGNORECASE)
    # Decode HTML entities (&amp;, &quot;)
    text = html.unescape(text)
    # Remove weird unicode characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply to train/test reviews
train_df['clean_review'] = train_df['review'].apply(clean_text)
test_df['clean_review'] = test_df['review'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fatim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Show original and cleaned review side by side
compare_df = train_df[['review', 'clean_review']].head(5)  
pd.set_option('display.max_colwidth', None)  # show full text

compare_df


Unnamed: 0,review,clean_review
0,"Ms Aparna Sen, the maker of Mr & Mrs Iyer, directs this movie about a young girl's struggle to cope with her debilitating condition.<br /><br />Meethi (Konkona Sen) has been an aloof kid ever since childhood and has shown signs of delusion, no one knows why. The dormant tendency however slips out of control, when the job assignment takes her to neighboring Bihar where she's raped by some political goons. The resulting trauma also leads to episodes of manic-depressive psychosis in addition to her schizophrenia. She careens out of control over the years, progressively getting worse and sinking deeper into her private 'world'.<br /><br />The juxtaposition of an 'unsettled' (divorced) elder sister and how her domineering ways make an already bad situation worse, is indicative of what a fine line there is between abnormal and *seemingly normal*. Ms Sen also makes an excellent commentary on the social alienation of such individuals. Social rehab is standard therapy along with all the deadly mind-altering drugs. But what about the poor and the destitute, who're always left to fend for themselves and usually fall by the wayside?<br /><br />The romantic connection between Dr Kunal and Anu was unnecessary. Also the cafeteria scene where Dr Kunal explains to Anu how real their world really is to them, was redundant. Anu should already know all that. The English dialog is a bit awkward at times though the acting compensates for that. Konkona and Shabana prove that their reputation is every bit worth it. Waheeda, Rahul and Shefali play their limited roles very well. <br /><br />Extensive research seems to have been done about this illness, its very evident. But its not clear if MDP can coexist with schizophrenia in the same patient, side-by-side. Also in the early part, Dr Kunal recommends E.C.T (shock therapy) while invalidating the fact that it doesn't work for schizophrenics, only for extreme MDP with suicidal tendencies and other forms of bipolar disorder.<br /><br />The ending of the remarkable story is suggestive of an unknown solution (maybe no solution). The movie could have ended on a nicer note, since worldwide the mentally ill can and do lead balanced and fruitful if not very fulfilling, lives under good medical care.<br /><br />Nonetheless, its an excellent film made with extreme sensitivity to the subject. HATS OFF to Ms Sen! No one in India could've done it better.",ms aparna sen the maker of mr mrs iyer directs this movie about a young girls struggle to cope with her debilitating conditionmeethi konkona sen has been an aloof kid ever since childhood and has shown signs of delusion no one knows why the dormant tendency however slips out of control when the job assignment takes her to neighboring bihar where shes raped by some political goons the resulting trauma also leads to episodes of manicdepressive psychosis in addition to her schizophrenia she careens out of control over the years progressively getting worse and sinking deeper into her private worldthe juxtaposition of an unsettled divorced elder sister and how her domineering ways make an already bad situation worse is indicative of what a fine line there is between abnormal and seemingly normal ms sen also makes an excellent commentary on the social alienation of such individuals social rehab is standard therapy along with all the deadly mindaltering drugs but what about the poor and the destitute whore always left to fend for themselves and usually fall by the waysidethe romantic connection between dr kunal and anu was unnecessary also the cafeteria scene where dr kunal explains to anu how real their world really is to them was redundant anu should already know all that the english dialog is a bit awkward at times though the acting compensates for that konkona and shabana prove that their reputation is every bit worth it waheeda rahul and shefali play their limited roles very well extensive research seems to have been done about this illness its very evident but its not clear if mdp can coexist with schizophrenia in the same patient sidebyside also in the early part dr kunal recommends ect shock therapy while invalidating the fact that it doesnt work for schizophrenics only for extreme mdp with suicidal tendencies and other forms of bipolar disorderthe ending of the remarkable story is suggestive of an unknown solution maybe no solution the movie could have ended on a nicer note since worldwide the mentally ill can and do lead balanced and fruitful if not very fulfilling lives under good medical carenonetheless its an excellent film made with extreme sensitivity to the subject hats off to ms sen no one in india couldve done it better
1,"I have seen this film only once, on TV, and it has not been repeated. This is strange when you consider the rubbish that is repeated over and over again. Usually horror movies for me are a source of amusement, but this one really scared me.<br /><br />DO NOT READ THE NEXT BIT IF YOU HAVE'NT SEEN THE FILM YET<br /><br />The scariest bit is when the townsfolk pursue the preacher to where his wife lies almost dead (they'd been poisoning her). He asks who the hell are you people anyway. One by one they give their true identities. The girl who was pretending to be deaf in order to corrupt and seduce him says 'I am Lilith, the witch who loved Adam before Eve'.",i have seen this film only once on tv and it has not been repeated this is strange when you consider the rubbish that is repeated over and over again usually horror movies for me are a source of amusement but this one really scared medo not read the next bit if you havent seen the film yetthe scariest bit is when the townsfolk pursue the preacher to where his wife lies almost dead theyd been poisoning her he asks who the hell are you people anyway one by one they give their true identities the girl who was pretending to be deaf in order to corrupt and seduce him says i am lilith the witch who loved adam before eve
2,"I was only fourteen when I first saw the Alien movies and I immediately came to like it. Original, terrifying and classic. Sigourney Weaver was the perfect choice for the female hero character and she would have deserved a statuette for her act. In 1979 something everlasting was born than the immortal series continued with a nothing less legendary movie than the first. Alien3 was a different point of view but I think this part was the most stressful and unique of all, this was my favourite. Unfortunately the last one was a failure in many ways. It was strained, illogical with full of meaningless massacres. I didn't like it at all, but I never thought that a worse part would ever be made in the future. Well as it turned out in 2004 I was wrong. Alien vs. Predator was a bad break, and it should have been directed by a more talented director or should have never been made at all. But when I saw Alien vs Predator Requiem I was totally shocked moreover devastated. When I sat down and decided to watch it with full of doubt, even than I had never thought that such a bad movie could be made. Without a screenplay, without a director and without actors I don't understand how can a film be made. Because this film misses these three terms. What you get is a nice massacre show without a story but with a lot of annoying and boring dialogues. Waste of money and waste of time. This movie is rather impudence, than honor to the fans of the both sides (Alien/Predator). Shame!",i was only fourteen when i first saw the alien movies and i immediately came to like it original terrifying and classic sigourney weaver was the perfect choice for the female hero character and she would have deserved a statuette for her act in something everlasting was born than the immortal series continued with a nothing less legendary movie than the first alien was a different point of view but i think this part was the most stressful and unique of all this was my favourite unfortunately the last one was a failure in many ways it was strained illogical with full of meaningless massacres i didnt like it at all but i never thought that a worse part would ever be made in the future well as it turned out in i was wrong alien vs predator was a bad break and it should have been directed by a more talented director or should have never been made at all but when i saw alien vs predator requiem i was totally shocked moreover devastated when i sat down and decided to watch it with full of doubt even than i had never thought that such a bad movie could be made without a screenplay without a director and without actors i dont understand how can a film be made because this film misses these three terms what you get is a nice massacre show without a story but with a lot of annoying and boring dialogues waste of money and waste of time this movie is rather impudence than honor to the fans of the both sides alienpredator shame
3,"This marvelous short will hit home with everyone who, as a child, specifically asked for something because it was hip or cool, only to be given something that would mark you for life with your peers and were told by your Mom or Dad (or both) that it didn't matter, as you earnestly began considering enlisting in a Witness Protection Program in order to avoid ridicule. For those U.S. residents who don't get the horror because you don't follow hockey, it's like a Dallas Cowboy fan getting a Washington Redskins jersey or a Yankees fan getting a Red Sox jersey. It isn't pretty. For our European friends, think of two great rival football (soccer to us) clubs and imagine a fan of one getting a jersey from the other. Ouch!!! NFB of C outdid themselves here!<br /><br />Une hommage du Maurice 'Rocket' Richard, merci, M. Richard.",this marvelous short will hit home with everyone who as a child specifically asked for something because it was hip or cool only to be given something that would mark you for life with your peers and were told by your mom or dad or both that it didnt matter as you earnestly began considering enlisting in a witness protection program in order to avoid ridicule for those us residents who dont get the horror because you dont follow hockey its like a dallas cowboy fan getting a washington redskins jersey or a yankees fan getting a red sox jersey it isnt pretty for our european friends think of two great rival football soccer to us clubs and imagine a fan of one getting a jersey from the other ouch nfb of c outdid themselves hereune hommage du maurice rocket richard merci m richard
4,"If you are 10 years old and never seen a movie before, maybe this film may be entertainment for you, but if you've seen several movies, this one will be a silly fully-cliched cheap and predictable for you. Don't waste your time with this.",if you are years old and never seen a movie before maybe this film may be entertainment for you but if youve seen several movies this one will be a silly fullycliched cheap and predictable for you dont waste your time with this


In [18]:
tfidf_vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['clean_review'])
X_test_tfidf  = tfidf_vectorizer.transform(test_df['clean_review'])

print("TF-IDF shape:", X_train_tfidf.shape)


TF-IDF shape: (40000, 20000)


In [19]:
from scipy.sparse import hstack



# Combine TF-IDF (sparse) + style features (dense)
X_train_combined = hstack([X_train_tfidf, train_style_features.values])
X_test_combined  = hstack([X_test_tfidf, test_style_features.values])

# Labels
y_train = train_df['label'].values
y_test  = test_df['label'].values

print("Combined feature matrix shape:", X_train_combined.shape)


Combined feature matrix shape: (40000, 20004)


In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create the model
nb_model = MultinomialNB()

# Train on combined features
nb_model.fit(X_train_combined, y_train)

# Predict on test set
y_pred = nb_model.predict(X_test_combined)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8346

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.86      0.84      5000
           1       0.85      0.81      0.83      5000

    accuracy                           0.83     10000
   macro avg       0.84      0.83      0.83     10000
weighted avg       0.84      0.83      0.83     10000


Confusion Matrix:
 [[4290  710]
 [ 944 4056]]


In [26]:
import pandas as pd

# Create a DataFrame for test set with predictions
test_results = pd.DataFrame({
    'review': test_df['review'],
    'actual': y_test,
    'predicted': y_pred
})

# Filter only wrong predictions
wrong_preds = test_results[test_results['actual'] != test_results['predicted']]

# Show first 5 wrong predictions
for i, row in wrong_preds.head(10).iterrows():
    print(f"Actual: {row['actual']} | Predicted: {row['predicted']}")
    print(f"Review preview: {row['review'][:300]}")  # show first 300 characters
    print("----")


Actual: 1 | Predicted: 0
Review preview: For the most part, 'Michael' is a disaster  ten minutes of charm and ninety's worth of missteps.<br /><br />Travolta and MacDowell do their best, frequently rising above Nora Ephron's numbingly banal script. But the film moves like a snail. And even within its fantasy context, the characters behave
----
Actual: 1 | Predicted: 0
Review preview: if you're a sucker for corny movies and are looking to see something you don't need to pay close attention to, this might be worth watching. the story itself is very unrealistic. the dialogue is also not very believable. it is doubtful you will find yourself relating to any of these characters becau
----
Actual: 0 | Predicted: 1
Review preview: As a long time fan of Peter O'Donnell's greatest creation, I watched this film on DVD with no great hopes of enjoyment; indeed I expected to be reaching in disgust for the remote control within fifteen minutes. But instead I thoroughly enjoyed this production, and 