In [61]:
# Standard library
import os
import re
import warnings

# Third-party
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# List the files available in the data directory.
print(os.listdir("../data"))

# From this point on, suppress every warning raised through the warnings module.
warnings.filterwarnings('ignore')

['IMDB Dataset.csv']


In [62]:
# Read the IMDB review CSV from the data directory into a DataFrame.
df = pd.read_csv('../data/IMDB Dataset.csv')

# df.shape is (rows, columns); the row count is the total number of reviews.
print("Data shape: ", df.shape)

# Keep the frame as the last expression so the notebook renders it.
df

Data shape:  (50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [63]:
# Start the working text column from the raw reviews; only 'clean_review'
# is written here, the 'review' column itself is not modified.
df['clean_review'] = df.loc[:, 'review']
df['clean_review'].head(5)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: clean_review, dtype: str

In [64]:
# Patterns are compiled once, when this cell runs, and reused on every call.
# `[^>]*` (unlike `.*?`) also matches newlines, so a tag split across lines is removed.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
# Semicolon-terminated HTML entities such as &amp; &quot; &#39; &#x27;
# (matched after lowercasing, hence lowercase-only classes).
_HTML_ENTITY_RE = re.compile(r'&(?:[a-z][a-z0-9]*|#[0-9]+|#x[0-9a-f]+);')
# Anything that is not an ASCII lowercase letter (input is lowercased first).
_NON_LETTER_RE = re.compile(r'[^a-z]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(s: str) -> str:
    """
    Cleans text using regex.

    Steps, in order: lowercase, replace HTML tags and HTML entities with a
    space, replace every character that is not an ASCII letter with a space,
    collapse runs of whitespace, and trim both ends.

    :param s: string to clean; a non-string value (e.g. None or a float NaN
        standing in for a missing review) is treated as empty text
    :return: text containing only lowercase ASCII letters separated by
        single spaces ("" when nothing is left)
    """
    if not isinstance(s, str):
        # Without this guard a missing value fails on `.lower()`.
        return ""

    s = s.lower()
    s = _HTML_TAG_RE.sub(' ', s)  # Remove HTML tags
    # Remove entities before the letter filter; otherwise "&amp;" would
    # survive as the junk token "amp".
    s = _HTML_ENTITY_RE.sub(' ', s)
    s = _NON_LETTER_RE.sub(' ', s)  # Remove punctuation & numbers
    s = _WHITESPACE_RE.sub(' ', s)  # Normalize whitespace
    return s.strip()

In [65]:
# Run every review through clean_text, store the result back in the
# working column, then preview the first rows.
df["clean_review"] = df["clean_review"].map(clean_text)
df["clean_review"].head(5)

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically there s a family where a little boy ...
4    petter mattei s love in the time of money is a...
Name: clean_review, dtype: str

In [66]:
# Model inputs are the cleaned review texts; targets are the sentiment labels.
X, y = df['clean_review'], df['sentiment']

In [67]:
# Hold out 20% of the reviews for testing. Stratifying on y keeps the label
# proportions aligned between the two splits; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [68]:
# TF-IDF recap
# - TF (term frequency): how often a term occurs within a single document;
#   the more often it occurs, the higher its TF.
# - IDF (inverse document frequency): how rare a term is across the whole
#   corpus; terms found in fewer documents get a higher IDF, i.e. they carry
#   more specific information.

# Unigram vectorizer: single words only, skipping terms that occur in fewer
# than 2 documents or in more than 95% of them, capped at 30,000 features.
tfidf_uni = TfidfVectorizer(ngram_range=(1, 1), min_df=2, max_df=0.95, max_features=30000)

# Fit on the training split only, then reuse the fitted vectorizer on the
# test split.
X_train_tfidf = tfidf_uni.fit_transform(X_train)
X_test_tfidf = tfidf_uni.transform(X_test)

In [69]:
# First 20 terms of the fitted unigram vocabulary.
tfidf_uni.get_feature_names_out()[0:20]

array(['aa', 'aaa', 'aag', 'aaliyah', 'aames', 'aamir', 'aardman',
       'aaron', 'ab', 'aback', 'abandon', 'abandoned', 'abandoning',
       'abandonment', 'abandons', 'abba', 'abbas', 'abbey', 'abbie',
       'abbot'], dtype=object)

In [70]:
# Unigram + bigram vectorizer: same document-frequency filters as the
# unigram one (min_df=2, max_df=0.95), with the feature cap at 50,000.
tfidf_bi = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.95, max_features=50000)

# Fit on the training split only, then transform the test split.
X_train_bi = tfidf_bi.fit_transform(X_train)
X_test_bi = tfidf_bi.transform(X_test)

In [71]:
# First 20 terms of the fitted unigram + bigram vocabulary.
tfidf_bi.get_feature_names_out()[0:20]

array(['aamir', 'aaron', 'abandon', 'abandoned', 'abandons', 'abbey',
       'abbot', 'abbott', 'abbott and', 'abby', 'abc', 'abducted',
       'abduction', 'abe', 'abel', 'abigail', 'abilities', 'ability',
       'ability and', 'ability of'], dtype=object)

In [72]:
# Number of terms kept by the unigram vectorizer.
len(tfidf_uni.get_feature_names_out())

30000

In [73]:
# Number of terms kept by the unigram + bigram vectorizer.
len(tfidf_bi.get_feature_names_out())

50000

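The bigram vectorizer has more features than the unigram one (50,000 vs. 30,000). Note that both counts are exactly the `max_features` caps, so they reflect the configured limits rather than the natural vocabulary sizes.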

In [74]:
# Unigram vectorizer with scikit-learn's built-in English stop-word list,
# capped at 50,000 features. No min_df / max_df filter is set here.
tfidf_nostop = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_features=50000)

# Fit on the training split only, then transform the test split.
X_train_nostop = tfidf_nostop.fit_transform(X_train)
X_test_nostop = tfidf_nostop.transform(X_test)

In [75]:
# First 20 terms of the fitted stop-word-filtered vocabulary.
tfidf_nostop.get_feature_names_out()[0:20]

array(['aa', 'aaa', 'aaargh', 'aag', 'aage', 'aaja', 'aaker', 'aaliyah',
       'aames', 'aamir', 'aankhen', 'aapke', 'aardman', 'aargh', 'aaron',
       'aawip', 'ab', 'aback', 'abandon', 'abandoned'], dtype=object)

In [76]:
# Number of terms kept by the stop-word-filtered vectorizer.
len(tfidf_nostop.get_feature_names_out())

50000

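This vocabulary contains more useless words: stop words are removed, but with no `min_df` filter many rare, noisy tokens (e.g. 'aaargh', 'aawip') are kept.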

## Day 3 – TF-IDF Summary

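- Bigrams increase representational capacity; whether that helps will be evaluated on Day 4
- Adding bigrams increases the vocabulary size significantly (here limited by the `max_features` cap: 50,000 vs. 30,000)
- The final decision between unigrams and bigrams will be based on model performance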
