Use the following dataset - https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
# Problem 1

# Apply all the preprocessing techniques that you think are necessary

In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("/content/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing

1. Lowercasing

In [5]:
df["review"] = df["review"].str.lower()
df["review"]

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production. <br /><br />the...
2,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is..."
...,...
49995,i thought this movie did a down right good job...
49996,"bad plot, bad dialogue, bad acting, idiotic di..."
49997,i am a catholic taught in parochial elementary...
49998,i'm going to have to disagree with the previou...


2. Removing html tags

In [6]:
from typing import Pattern
import re
def remove_html_tags(text):
  Pattern = re.compile("<.*?>")
  return Pattern.sub(r"", text)

In [8]:
df["review"] = df["review"].apply(remove_html_tags)
df["review"]

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production. the filming tec...
2,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is..."
...,...
49995,i thought this movie did a down right good job...
49996,"bad plot, bad dialogue, bad acting, idiotic di..."
49997,i am a catholic taught in parochial elementary...
49998,i'm going to have to disagree with the previou...


Removing url

In [9]:
def remove_url(text):
  Pattern = re.compile(r"https?://\S+|www\.\S+")
  return Pattern.sub(r"", text)

In [10]:
df["review"] = df["review"].apply(remove_url)
df["review"]

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production. the filming tec...
2,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is..."
...,...
49995,i thought this movie did a down right good job...
49996,"bad plot, bad dialogue, bad acting, idiotic di..."
49997,i am a catholic taught in parochial elementary...
49998,i'm going to have to disagree with the previou...


Removing Punctuation

In [13]:
import string
exclude = string.punctuation
def remove_punc(text):
  return text.translate(str.maketrans("", "", exclude))

In [14]:
df["review"] = df["review"].apply(remove_punc)
df["review"]

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production the filming tech...
2,i thought this was a wonderful way to spend ti...
3,basically theres a family where a little boy j...
4,petter matteis love in the time of money is a ...
...,...
49995,i thought this movie did a down right good job...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,i am a catholic taught in parochial elementary...
49998,im going to have to disagree with the previous...


chat word removing

In [16]:
chat_words = {
    "afk": "away from keyboard",
    "asap": "as soon as possible",
    "brb": "be right back",
    "btw": "by the way",
    "diy": "do it yourself",
    "eta": "estimated time of arrival",
    "fyi": "for your information",
    "idk": "i don’t know",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "jk": "just kidding",
    "lol": "laugh out loud",
    "np": "no problem",
    "omg": "oh my god",
    "tba": "to be announced",
    "tbc": "to be confirmed",
    "tbd": "to be determined",
    "ttyl": "talk to you later",
    "yolo": "you only live once",

    "ama": "ask me anything",
    "bff": "best friends forever",
    "dm": "direct message",
    "fomo": "fear of missing out",
    "icymi": "in case you missed it",
    "lmk": "let me know",
    "nsfw": "not safe for work",
    "ootd": "outfit of the day",
    "tbh": "to be honest",
    "til": "today i learned",
    "wcw": "woman crush wednesday",
    "mcm": "man crush monday",
    "wfh": "work from home",

    "fps": "first-person shooter",
    "gg": "good game",
    "glhf": "good luck, have fun",
    "hp": "hit points or health points",
    "npc": "non-player character",
    "op": "overpowered",
    "pvp": "player vs. player",
    "rpg": "role-playing game",
    "xp": "experience points",

    "bae": "before anyone else",
    "fam": "family or close friends",
    "flex": "to show off",
    "goat": "greatest of all time",
    "hbu": "how about you?",
    "irl": "in real life",
    "lmao": "laughing my *** off",
    "nvm": "never mind",
    "smh": "shaking my head",
    "tfw": "that feeling when",
    "tl;dr": "too long; didn’t read",
    "wyd": "what you doing?",

    "dns": "domain name system",
    "html": "hypertext markup language",
    "ip": "internet protocol",
    "isp": "internet service provider",
    "os": "operating system",
    "vpn": "virtual private network",
    "url": "uniform resource locator"
}


In [17]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [18]:
df["review"] = df["review"].apply(chat_conversion)
df["review"]

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production the filming tech...
2,i thought this was a wonderful way to spend ti...
3,basically theres a family where a little boy j...
4,petter matteis love in the time of money is a ...
...,...
49995,i thought this movie did a down right good job...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,i am a catholic taught in parochial elementary...
49998,im going to have to disagree with the previous...


# Spelling correctness

In [19]:
from textblob import TextBlob
def correct_spelling(text):
    return str(TextBlob(text).correct())

In [21]:
df["review"] = df["review"].apply(correct_spelling)
df["review"]

KeyboardInterrupt: 

Removing stopwords

In [27]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [28]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [33]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [34]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = word_tokenize(text)  # Tokenize the text into words
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Filter out stopwords
    return ' '.join(filtered_words)

In [35]:
df["review"] = df["review"].apply(remove_stopwords)
df["review"]

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...
...,...
49995,thought movie right good job wasnt creative or...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,catholic taught parochial elementary schools n...
49998,im going disagree previous comment side maltin...


Tokenization

In [36]:
from nltk.tokenize import word_tokenize
df['review'] = df['review'].apply(word_tokenize)


In [37]:
df["review"]

Unnamed: 0,review
0,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,"[wonderful, little, production, filming, techn..."
2,"[thought, wonderful, way, spend, time, hot, su..."
3,"[basically, theres, family, little, boy, jake,..."
4,"[petter, matteis, love, time, money, visually,..."
...,...
49995,"[thought, movie, right, good, job, wasnt, crea..."
49996,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,"[catholic, taught, parochial, elementary, scho..."
49998,"[im, going, disagree, previous, comment, side,..."


stemming or Lemmatization

In [43]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_review(review):
    # Check if review is already tokenized (a list)
    if isinstance(review, list):
        words = review
    else:
        # Tokenize the review into words if it's a string
        from nltk.tokenize import word_tokenize # import inside the function to avoid conflicts
        words = word_tokenize(review)

    # Stem each word
    stemmed_words = [stemmer.stem(word) for word in words]
    # Join the stemmed words back into a single string
    return ' '.join(stemmed_words)


In [44]:
df["review"] = df["review"].apply(stem_review)
df["review"]

Unnamed: 0,review
0,one review mention watch 1 oz episod youll hoo...
1,wonder littl product film techniqu unassum old...
2,thought wonder way spend time hot summer weeke...
3,basic there famili littl boy jake think there ...
4,petter mattei love time money visual stun film...
...,...
49995,thought movi right good job wasnt creativ orig...
49996,bad plot bad dialogu bad act idiot direct anno...
49997,cathol taught parochi elementari school nun ta...
49998,im go disagre previou comment side maltin one ...


 Lemmatization

In [45]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def lemmatize_review(review):
    # Tokenize the review into words
    words = word_tokenize(review)
    # Lemmatize each word
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    # Join the lemmatized words back into a single string
    return ' '.join(lemmatized_words)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [46]:
df['review'] = df['review'].apply(lemmatize_review)


Handling Negations(optional)
Removing Numbers (Optional)

Removing Extrawhite space

In [47]:
df['review'] = df['review'].str.strip()

# Combine code of all the above

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk

# Download necessary NLTK datasets
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load the dataset
df = pd.DataFrame({"review": ["I loved this movie! Absolutely fantastic.", "Terrible movie. I wouldn’t recommend it to anyone."]})

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocess function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stopwords and apply lemmatization
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the review column
df['review'] = df['review'].apply(preprocess_text)

print(df)


In [None]:
# Problem 2

# Find out the number of words in the entire corpus and also the total number of unique words(vocabulary) using just python

To find the total number of words in the entire corpus and the total number of unique words (vocabulary) from the DataFrame containing movie reviews, you can follow these steps:

1.Combine all the reviews into a single string.
2.Split the string into individual words.
3 Count the total number of words and the number of unique words.

In [48]:
# Combine all reviews into a single string
all_reviews = ' '.join(df['review'])

# Tokenize the combined reviews
words = word_tokenize(all_reviews)

# Calculate the total number of words
total_words = len(words)

# Calculate the total number of unique words (vocabulary)
unique_words = set(words)
total_unique_words = len(unique_words)

# Display the results
print(f"Total number of words in the corpus: {total_words}")
print(f"Total number of unique words (vocabulary): {total_unique_words}")

Total number of words in the corpus: 5991900
Total number of unique words (vocabulary): 181959


In [None]:
# Problem 3

# Apply One Hot Encoding

In [49]:
# Apply One Hot Encoding on the 'review' column
df_encoded = pd.get_dummies(df['review'], prefix='review')

# Combine the one-hot encoded columns back to the original DataFrame
df_combined = pd.concat([df, df_encoded], axis=1)

# Display the DataFrame after One Hot Encoding
print("\nDataFrame after One Hot Encoding:")
print(df_combined)


DataFrame after One Hot Encoding:
                                                  review sentiment  \
0      one review mention watch 1 oz episod youll hoo...  positive   
1      wonder littl product film techniqu unassum old...  positive   
2      thought wonder way spend time hot summer weeke...  positive   
3      basic there famili littl boy jake think there ...  negative   
4      petter mattei love time money visual stun film...  positive   
...                                                  ...       ...   
49995  thought movi right good job wasnt creativ orig...  positive   
49996  bad plot bad dialogu bad act idiot direct anno...  negative   
49997  cathol taught parochi elementari school nun ta...  negative   
49998  im go disagre previou comment side maltin one ...  negative   
49999  one expect star trek movi high art fan expect ...  negative   

       review_a turkish bath sequenc film noir locat new york 50 must hint someth someth curious previou comment one poi

In [None]:
# Problem 4

# Apply bag words and find the vocabulary also find the times each word has occured

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['review'])
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
vocabulary = vectorizer.vocabulary_  # This gives a dict of words and their indices
word_counts = bow_df.sum(axis=0)  # Sum occurrences for each word

# Display the Bag of Words DataFrame
print("Bag of Words DataFrame:")
print(bow_df)

# Display vocabulary and word occurrences
print("\nVocabulary:")
print(vocabulary)

print("\nWord Occurrences:")
print(word_counts)

In [None]:
# Problem 5

# Apply bag of bi-gram and bag of tri-gram and write down your observation about the dimensionality of the vocabulary

In [None]:
# Problem 6

# Apply tf-idf and find out the idf scores of words, also find out the vocabulary.