Sentiment Analysis on IMDB dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Loading

In [3]:
import os
import urllib.request
import zipfile
import tarfile
import sys

import pandas as pd
import numpy as np

In [4]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_path = "/content/aclImdb_v1.tar.gz"
extract_path = "/content/aclImdb"

# Download the dataset
print("Downloading IMDB dataset...")
urllib.request.urlretrieve(url, dataset_path)
print("Download complete.")

# Extract the dataset
print("Extracting Dataset")
with tarfile.open(dataset_path, 'r:gz') as tar:
    tar.extractall(extract_path)
print("Extraction Complete")

# Verify extraction
print("Dataset files:", os.listdir(extract_path))

Downloading IMDB dataset...
Download complete.
Extracting Dataset
Extraction Complete
Dataset files: ['aclImdb']


## Preprocessing

In [5]:
# Define the paths
base_path = "/content/aclImdb/aclImdb"

# Function to read reviews from a folder
def load_reviews(folder):
    reviews = []
    labels = []
    for label in ["pos", "neg"]:
        path = os.path.join(base_path, folder, label)
        for file in os.listdir(path):
            with open(os.path.join(path, file), "r", encoding="utf-8") as f:
                reviews.append(f.read())
                labels.append(label)
    return pd.DataFrame({"review": reviews, "label": labels})

# Load train and test datasets
train_df = load_reviews("train")
test_df = load_reviews("test")

# Combine for full analysis
df = pd.concat([train_df, test_df]).reset_index(drop=True)

# Convert labels to binary (pos = 1, neg = 0)
df["label"] = df["label"].map({"pos": 1, "neg": 0})

# Display first few rows
df.head()

Unnamed: 0,review,label
0,Gung Ho tries to express many ideas and entert...,1
1,This movie is simply wonderful! It's got it al...,1
2,If anyone is wondering why no one makes movies...,1
3,Recap: Zandalee is a young woman that feels mo...,1
4,<br /><br />In anticipation of Ang Lee's new m...,1


In [6]:
df.shape

(50000, 2)

In [25]:
train_df.shape

(25000, 2)

In [7]:
print(df.loc[4, 'review'][:])

<br /><br />In anticipation of Ang Lee's new movie "Crouching Tiger, Hidden Dragon," I saw this at blockbuster and figured I'd give it a try. A civil war movie is not the typical movie I watch. Luckily though, I had a good feeling about this director. This movie was wonderfully written. The dialogue is in the old southern style, yet doesn't sound cornily out of place and outdated. The spectacular acting helped that aspect of the movie. Toby Maguire was awesome. I thought he was good (but nothing special) in Pleasantville, but here he shines. I have always thought of Skeet Ulrich as a good actor (but nothing special), but here he is excellent as well. The big shocker for me was Jewel. She was amazingly good. Jeffrey Wright, who I had never heard of before, is also excellent in this movie. It seems to me that great acting and great writing and directing go hand in hand. A movie with bad writing makes the actors look bad and visa versa. This movie had the perfect combination. The actors l

In [8]:
import re

def preprocess(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+',' ', text.lower())+ ' '.join(emoticons).replace('-',''))
    return text

In [38]:
preprocess(df.loc[4, 'review'][:])

'in anticipation of ang lee s new movie crouching tiger hidden dragon i saw this at blockbuster and figured i d give it a try a civil war movie is not the typical movie i watch luckily though i had a good feeling about this director this movie was wonderfully written the dialogue is in the old southern style yet doesn t sound cornily out of place and outdated the spectacular acting helped that aspect of the movie toby maguire was awesome i thought he was good but nothing special in pleasantville but here he shines i have always thought of skeet ulrich as a good actor but nothing special but here he is excellent as well the big shocker for me was jewel she was amazingly good jeffrey wright who i had never heard of before is also excellent in this movie it seems to me that great acting and great writing and directing go hand in hand a movie with bad writing makes the actors look bad and visa versa this movie had the perfect combination the actors look brilliant and the character developm

In [49]:
preprocess("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [10]:
df['review'] = df['review'].apply(preprocess)

In [14]:
def tokenizer(text):
    return text.split()

sentence = 'big brown fox runs behind the running lazy ox'
tokenizer(sentence)

['big', 'brown', 'fox', 'runs', 'behind', 'the', 'running', 'lazy', 'ox']

In [15]:
import nltk
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer_porter(text):
    return [ porter.stem(word) for word in text.split()]

tokenizer_porter(sentence)

['big', 'brown', 'fox', 'run', 'behind', 'the', 'run', 'lazi', 'ox']

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter(sentence) if w not in stop]

['big', 'brown', 'fox', 'run', 'behind', 'run', 'lazi', 'ox']

## A logistic regression model for document classification

In [29]:
X_train = train_df.loc[:25000, 'review'].values
y_train = train_df.loc[:25000, 'label'].values

X_test = test_df.loc[:25000, 'review'].values
y_test = test_df.loc[:25000, 'label'].values

do not re run - its time consuming

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

small_param_grid = [
    {
        'vect__ngram_range': [(1,1)],
        'vect__stop_words': [None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0]
    },
        {
        'vect__ngram_range': [(1,1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'vect__use_idf':[False],
        'vect__norm':[None],
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0]
    }
]

lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0, solver='liblinear'))
])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid=small_param_grid, scoring='accuracy', cv=5, verbose=2, n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




In [26]:
print(f"Best paramete set: {gs_lr_tfidf.best_params_}")


Best paramete set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x798af4ecca40>}


In [27]:
print(f"CV accuracy: {gs_lr_tfidf.best_score_:.3f}")

CV accuracy: 0.892


In [30]:
clf = gs_lr_tfidf.best_estimator_
print(f"Test accuracy: {clf.score(X_test, y_test):.3f}")

Test accuracy: 0.884


## Baseline with Naive bayes

In [35]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

nb_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', MultinomialNB())
])

nb_tfidf.fit(X_train, y_train)

In [36]:
nb_tfidf.score(X_test, y_test)

0.82956

In [40]:
# Perform 5-fold cross-validation
cv_scores = cross_val_score(nb_tfidf, df['review'], df['label'], cv=5, scoring="accuracy")


print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")

Cross-Validation Accuracy Scores: [0.8577 0.8557 0.8654 0.8595 0.8657]
Mean Accuracy: 0.8608


## Working with bigger data - online algorithms