# Classifying IMDB movie reviews

In [44]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import utils

In [3]:
movies = pd.read_csv('./input/IMDB_Dataset.csv')
movies.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
movies.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
# Model with a Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [13]:
#Define a TQDM wrapper for CountVectorizer
from tqdm import tqdm
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split

class TQDMVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.vectorizer = CountVectorizer(*args, **kwargs)
    
    def fit(self, X, y=None):
        self.vectorizer.fit(tqdm(X, desc="Vectorizing"))
        return self
    
    def transform(self, X, y=None):
        return self.vectorizer.transform(tqdm(X, desc="Transforming"))
    
    def fit_transform(self, X, y=None):
        return self.vectorizer.fit_transform(tqdm(X, desc="Fitting and transforming"))
    

# Split the data into a training+validation, and test sets
X_train_valid, X_test, y_train_valid, y_test = train_test_split(movies['review'], movies['sentiment'], test_size=0.1, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.1, random_state=42)

# Create a pipeline with TQDMVectorizer and Logistic Regression
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('vectorizer', TQDMVectorizer(max_features=10000)),
    ('classifier', LogisticRegression())
])

# Fit the model with a progress bar
model.fit(X_train, y_train)

# Evaluate the model with a progress bar
#y_pred = model.predict(X_valid)
y_pred = model.predict(tqdm(X_valid, desc='Predicting on validation set'))
#accuracy = (y_valid_pred == y_valid).mean()
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Evaluate the model with test-data, with a progress bar - ALTHOUGH YOU never use test data!
y_pred = model.predict(X_test)
#y_pred = model.predict(tqdm(X_test, desc='Predicting on test set'))
#accuracy = (y_test_pred == y_test).mean()
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Fitting and transforming: 100%|██████████| 40500/40500 [00:02<00:00, 15629.82it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Predicting on validation set: 100%|██████████| 4500/4500 [00:00<00:00, 14216.06it/s]
Transforming: 100%|██████████| 4500/4500 [00:00<00:00, 14234.94it/s]


Accuracy: 0.89


Transforming: 100%|██████████| 5000/5000 [00:00<00:00, 14357.21it/s]

Accuracy: 0.89





In [24]:
# Get coefficients of the model
coefficients = model.named_steps['classifier'].coef_.flatten()
# Get the feature names
feature_names = model.named_steps['vectorizer'].vectorizer.get_feature_names_out()

# Create a DataFrame with the coefficients and feature names
coefficients_df = pd.DataFrame({
    'coefficient': coefficients,
    'word': feature_names
})
# Sort the DataFrame by the coefficients
coefficients_df = coefficients_df.sort_values('coefficient')

# Display the 10 most negative and 10 most positive words
print("10 most negative words")
print(coefficients_df.head(10))
print("\n10 most positive words")
print(coefficients_df.tail(10))


10 most negative words
      coefficient            word
9695    -2.546595           waste
2641    -1.797144  disappointment
768     -1.785317           awful
9913    -1.708402           worst
2863    -1.602802            dull
2640    -1.591126   disappointing
5083    -1.587477           lacks
6731    -1.520964          poorly
3336    -1.497179           fails
5093    -1.448971            lame

10 most positive words
      coefficient         word
3747     1.164659     funniest
899      1.170954  beautifully
3079     1.173136    enjoyable
9882     1.182645  wonderfully
8754     1.216493    surprised
3810     1.234074          gem
8664     1.342706       subtle
8274     1.365383        solid
8721     1.491892       superb
7260     1.504089   refreshing


In [25]:
coefficients_df[coefficients_df['word']=='wonderful']

Unnamed: 0,coefficient,word
9881,0.831205,wonderful


In [26]:
movies['predictions'] = model.predict(movies['review'])

Transforming: 100%|██████████| 50000/50000 [00:03<00:00, 15328.54it/s]


In [41]:
movies_sorted = movies.sort_values(by='predictions', ascending=False)
pd.set_option('display.max_colwidth', 100)
movies_sorted


Unnamed: 0,review,sentiment,predictions
0,One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked....,positive,positive
20804,"That's right. The movie is better than the book. Don't get me wrong, I love the book. But the mo...",positive,positive
20768,"Based on an actual mining disaster, this early German talkie (with English subtitles) still rema...",positive,positive
20767,This particular episode of Smallville is probably the best episode to air since reunion. This is...,positive,positive
20766,"With the release of Peter Jackson's famed ""Lord of the Rings"" trilogy, it is even easier to dism...",positive,positive
...,...,...,...
22226,The Motion Picture Association of America has seen fit to advise potential viewers and this is p...,negative,negative
22224,When I refer to Malice as a film noir I am not likening it to such masterpieces as Sunset Boulev...,negative,negative
22223,"This film deals with two ex-football players who are Fred Williamson, (Mack Derringer) and Gary ...",negative,negative
22221,"Oh dear Gods, this is awful. Stay away, just stay away. If you think you've seen bad movies, thi...",negative,negative


In [42]:
# Find all rows with false prediction and print the full review
false_negatives = movies[(movies['sentiment'] == 'positive') & (movies['predictions'] == 'negative')]
pd.set_option('display.max_colwidth', None)
print("False negatives:")
false_negatives.head(5)

False negatives:


Unnamed: 0,review,sentiment,predictions
6,"I sure would like to see a resurrection of a up dated Seahunt series with the tech they have today it would bring back the kid excitement in me.I grew up on black and white TV and Seahunt with Gunsmoke were my hero's every week.You have my vote for a comeback of a new sea hunt.We need a change of pace in TV and this would work for a world of under water adventure.Oh by the way thank you for an outlet like this to view many viewpoints about TV and the many movies.So any ole way I believe I've got what I wanna say.Would be nice to read some more plus points about sea hunt.If my rhymes would be 10 lines would you let me submit,or leave me out to be in doubt and have me to quit,If this is so then I must go so lets do it.",positive,negative
16,"Some films just simply should not be remade. This is one of them. In and of itself it is not a bad film. But it fails to capture the flavor and the terror of the 1963 film of the same title. Liam Neeson was excellent as he always is, and most of the cast holds up, with the exception of Owen Wilson, who just did not bring the right feel to the character of Luke. But the major fault with this version is that it strayed too far from the Shirley Jackson story in it's attempts to be grandiose and lost some of the thrill of the earlier film in a trade off for snazzier special effects. Again I will say that in and of itself it is not a bad film. But you will enjoy the friction of terror in the older version much more.",positive,negative
52,"Bela Lugosi appeared in several of these low budget chillers for Monogram Studios in the 1940's and The Corpse Vanishes is one of the better ones.<br /><br />Bela plays a mad scientist who kidnaps young brides and kills them and then extracts fluid from their bodies so he can keep his ageing wife looking young. After a reporter and a doctor stay the night at his home and discover he is responsible for the brides' deaths, the following morning they report these murders to the police and the mad scientist is shot and drops dead shortly afterwards.<br /><br />You have got almost everything in this movie: the scientist's assistants consist of an old hag, a hunchback and dwarf (her sons), a thunderstorm and spooky passages in Bela's house. Bela and his wife find they sleep better in coffins rather than beds in the movie.<br /><br />The Corpse Vanishes is worth a look, especially for Bela Lugosi fans. Great fun.<br /><br />Rating: 3 stars out of 5.",positive,negative
72,"I thought that Mukhsin has been wonderfully written. Its not just about entertainment. There's tonnes of subtle messages that i think Yasmin was trying to bring across. And yes, it might be confusing to some of you(especially if you didn't watch Sepet and/or Gubra for 76 times).<br /><br />I bet u noticed how they use characters from the two movies before right? Its really ironic how the characters relate. Like the bossy neighbour is that prostitute from Gubra. And the chick at the snooker pad turns out to be the religious and wife of the pious man in the future. <br /><br />And i absolutely love the voice-overs. Its crude yet awakeningly fresh. Like, when they took a shot of the Rumah Tumpangan Gamin signboard, then there was suddenly Mukhsin's voice saying 'Bismillahhirrahmannirrahim..' (the scene when he climbed the tree).<br /><br />It captured Malaysian's attitude(and in some mild way, sniggering at how pathetic it is) portrayed in the character. For example, even the kids can be really sharp tongued(complete with the shrill annoying voice) and simply bad mouth ppl all movie long. And how you can be such a busybody and talk about ppl, when ur own life isn't sorted out. <br /><br />All i can say is, this movie totally reached my expectation if not exceeded it. <br /><br />It kept me glued to the screen, i couldn't even take my eyes off it. Not even to make out in the cinema. Ha ha.",positive,negative
103,"No, this hilariously horrible 70's made-for-TV horror clinker isn't about a deadly demonically possessed dessert cake. Still, this exceptionally awful, yet undeniably amusing and thus enjoyable cathode ray refuse reaches a breathtaking apex of absolute, unremitting silliness and atrociousness that's quite tasty in a so-execrable-it's-downright-awesome sort of way. Richard Crenna, looking haggard and possibly inebriated, and Yvette Mimieux, who acts as if she never got over the brutal rape she endured in ""Jackson County Jail,"" sluggishly portray a disgustingly nice and respectable suburbanite couple whose quaint, dull, sleepy small town existence gets ripped asunder when the cute German Shepard they take in as the family pet turns out to be some ancient lethal evil spirit. Pretty soon Mimieux and her two repellently cutesy kids Kim Richards and Ike Eisenmann (the psychic alien moppets from the Disney ""Witch Mountain"" pictures) are worshiping a crude crayon drawing of the nasty, ugly canine entity in the den. Boy, now doesn't that sound really scary and disturbing? Well, scary and disturbing this laughably ludicrous claptrap sure ain't, but it sure is funny, thanks to Curtis (""Night Tide"") Harrington's hopelessly weak direction, cartoonish (not so) special effects, an almost painfully risible'n'ridiculous plot, and a game cast that struggles valiantly with the absurd story (besides the leads, both Martine Beswicke and R.G. Armstrong briefly pop up as members of a Satanic cult and Victor Jory has a nice cameo as a helpful Native American shaman). Favorite scene: the malicious Mephestophelion mutt puts the whammy on Crenna, practically forcing him to stick his hand into a wildly spinning lawnmower blade. While stuck-up snobby fright film fans may hold their noses at the perfectly putrid stench of this admittedly smelly schlock, devout TV trash lovers should deem this endearingly abominable offal the boob tube equivalent to Alpo.",positive,negative


In [43]:
# Find all rows with false prediction and print the full review
false_positives = movies[(movies['sentiment'] == 'negative') & (movies['predictions'] == 'positive')]
pd.set_option('display.max_colwidth', None)
print("False positives:")
false_positives.head(5)

False positives:


Unnamed: 0,review,sentiment,predictions
13,"The cast played Shakespeare.<br /><br />Shakespeare lost.<br /><br />I appreciate that this is trying to bring Shakespeare to the masses, but why ruin something so good.<br /><br />Is it because 'The Scottish Play' is my favorite Shakespeare? I do not know. What I do know is that a certain Rev Bowdler (hence bowdlerization) tried to do something similar in the Victorian era.<br /><br />In other words, you cannot improve perfection.<br /><br />I have no more to write but as I have to write at least ten lines of text (and English composition was never my forte I will just have to keep going and say that this movie, as the saying goes, just does not cut it.",negative,positive
49,"Average (and surprisingly tame) Fulci giallo which means it's still quite bad by normal standards, but redeemed by its solid build-up and some nice touches such as a neat time twist on the issues of visions and clairvoyance.<br /><br />The genre's well-known weaknesses are in full gear: banal dialogue, wooden acting, illogical plot points. And the finale goes on much too long, while the denouement proves to be a rather lame or shall I say: limp affair.<br /><br />Fulci's ironic handling of giallo norms is amusing, though. Yellow clues wherever you look.<br /><br />3 out of 10 limping killers",negative,positive
67,"I really like Salman Kahn so I was really disappointed when I seen this movie. It didn't have much of a plot and what they did have was not that appealing. Salman however did look good in the movie looked young and refreshed but was worth the price of this DVD. The music was not bad it was quite nice. Usually Indian movies are at least two to three hours long but this was a very short movie for an Indian film. The American actress that played in the movie is from the television hit series Heroes, Ali Larter. Her acting had a lot to be desired. However she did look good in the Indian dresses that she wore. All the movie had not a lot to be desired and I hope Salman does a lot better on his next movie. Thank you.",negative,positive
112,It's terrific when a funny movie doesn't make smile you. What a pity!! This film is very boring and so long. It's simply painfull. The story is staggering without goal and no fun.<br /><br />You feel better when it's finished.,negative,positive
139,"I caught this film on AZN on cable. It sounded like it would be a good film, a Japanese ""Green Card"". I can't say I've ever disliked an Asian film, quite the contrary. Some of the most incredible horror films of all time are Japanese and Korean, and I am a HUGE fan of John Woo's Hong Kong films. I an not adverse to a light hearted films, like Tampopo or Chung King Express (two of my favourites), so I thought I would like this. Well, I would rather slit my wrists and drink my own blood than watch this laborious, badly acted film ever again.<br /><br />I think the director Steven Okazaki must have spiked the water with Quaalude, because no one in this film had a personality. And when any of the characters DID try to act, as opposed to mumbling a line or two, their performance came across as forced and incredibly fake. I honestly did not think that anyone had ever acted before...the ONLY person who sounded genuine was Brenda Aoki.. I find it amazing that this is promoted as a comedy, because I didn't laugh once. Even MORE surprising is that CBS morning news called this ""a refreshing breath of comedy"". It was neither refreshing, nor a breath of comedy. And the ending was very predictable, the previous reviewer must be an idiot to think such things.<br /><br />AVOID this film unless you want to see a boring predictable plot line and wooden acting. I actually think that ""Spike of Bensonhurst"" is a better acted film than this...and I walked out half way through that film!",negative,positive


# We have not reduced the words in this Model to a bag of words