In [1]:
import pandas as pd
df=pd.read_csv("IMDB Dataset.csv")

In [2]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.shape

(50000, 2)

In [4]:
df=df.head(100)

In [5]:
df.shape

(100, 2)

# Converting texts into lowercase to treat all as same entity:

In [7]:
df["review"]=df["review"].str.lower()
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
95,daniel day-lewis is the most versatile actor a...,positive
96,my guess would be this was originally going to...,negative
97,"well, i like to watch bad horror b-movies, cau...",negative
98,"this is the worst movie i have ever seen, as w...",negative


In [8]:
df["review"][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

# Removal & Replacing:

In [14]:
import re                                                   #import regular expression
def remove_html_tags(text):
    pattern = re.compile('<.*?>')                           #creates a regex pattern object for reuse.
    return pattern.sub(r'', text)                           #sub is about substitute and r is to treat string as raw not to treat as escape character\.

In [15]:
sample_tags = "<html><body><p>My name is Chetan N Revankar</p></body></html>"
remove_html_tags(sample)

'My name is Chetan N Revankar'

In [16]:
df['review']=df['review'].apply(remove_html_tags)

In [18]:
df['review'][9]

'if you like original gut wrenching laughter you will like this movie. if you are young or old then you will love this movie, hell even my mom liked it.great camp!!!'

## Removal of URL:

In [29]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+')                 # https?:// matches http:// or https:// links \S+ Matches one or more non space characters
    return pattern.sub(r'', text)

sample_url = "My LinkedIN:https://www.linkedin.com/in/chetannrevankar"
remove_url(sample_url)


'My LinkedIN:'

## Punctuation Handling:

In [33]:
import string, time
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [35]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text

sample_pt = "My name is $Chetan N Revankar"
remove_punc(sample_pt)

'My name is Chetan N Revankar'

In [41]:
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))       #Fastest method to replace

remove_punc1(sample_pt)

'My name is Chetan N Revankar'

In [44]:
df['review'] = remove_punc(df['review'])
df['review']

0     one of the other reviewers has mentioned that ...
1     a wonderful little production the filming tech...
2     i thought this was a wonderful way to spend ti...
3     basically theres a family where a little boy j...
4     petter matteis love in the time of money is a ...
                            ...                        
95    daniel daylewis is the most versatile actor al...
96    my guess would be this was originally going to...
97    well i like to watch bad horror bmovies cause ...
98    this is the worst movie i have ever seen as we...
99    i have been a mario fan for as long as i can r...
Name: review, Length: 100, dtype: object

## Chat words Conversion:

In [59]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "FYI": "For Your Information",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "IMHO": "In My Humble Opinion",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "ICYMI": "In Case You Missed It",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "GTG": "Got To Go",
    "TTYL": "Talk To You Later",
    "TTYT": "Talk To You Tomorrow",
    "LOL": "Laugh Out Loud",
    "TBH": "To Be Honest",
    "NGL": "Not Gonna Lie",
    "IRL": "In Real Life",
    "ETA": "Estimated Time of Arrival",
    "LMK": "Let Me Know",
    "FYR": "For Your Reference",
    "ROFL": "Rolling On The Floor Laughing",
    "LMAO": "Laughing My Ass Off",
    "TTYS": "Talk To You Soon",
    "SMH": "Shaking My Head",
    "IDC": "I Don't Care",
    "IIRC": "If I Recall Correctly",
    "JK": "Just Kidding",
    "BRB": "Be Right Back",
}

def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

chat_conversion("I'll be attending the interview tomorrow ASAP")

"I'll be attending the interview tomorrow As Soon As Possible"

## Spelling mistake handling:

In [78]:
from textblob import TextBlob                     #Library to handle the spelling mistake
incorrect_text = "I am Grduated frm RNSIT, in the banch of CSE."
textblb = TextBlob(incorrect_text)
textblb.correct().string

'I am Graduated from RNSIT, in the branch of CSE.'

## Stopwords:

In [81]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [83]:
len(stopwords.words('english'))

198

Stopwords are common filler words (like the, is, at, of, and) that don’t add much meaning. They are usually removed in NLP to focus on important words for analysis. It also increases the dimension of the dataset so if we remove it helps to reduce the computation.

In [84]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [85]:
remove_stopwords('My name is Chetan N Revankar, am going to be Data Engineer as soon as possible.')

'My name  Chetan N Revankar,  going   Data Engineer  soon  possible.'

In [86]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [87]:
df['review'] = df['review'].apply(remove_stopwords)

In [88]:
df['review']

0     one    reviewers  mentioned   watching  1 oz e...
1      wonderful little production  filming techniqu...
2      thought    wonderful way  spend time    hot s...
3     basically theres  family   little boy jake thi...
4     petter matteis love   time  money   visually s...
                            ...                        
95    daniel daylewis    versatile actor alive engli...
96     guess would    originally going    least two ...
97    well  like  watch bad horror bmovies cause  th...
98       worst movie   ever seen  well   worst    pr...
99        mario fan   long    remember    fond memor...
Name: review, Length: 100, dtype: object

## Remove Emoji's:

In [89]:
import re

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"  # other symbols
                           u"\U000024C2-\U0001F251"  # enclosed characters
                           "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', text)   # replace emojis with empty string


In [91]:
remove_emoji("Am a data engineer 💻📊🚀")

'Am a data engineer '

In [92]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
   ---------------------------------------- 0.0/590.6 kB ? eta -:--:--
   ---------------------------------------- 0.0/590.6 kB ? eta -:--:--
   ---------------------------------------- 0.0/590.6 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/590.6 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/590.6 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/590.6 kB ? eta -:--:--
   -------------------------------------- 590.6/590.6 kB 446.9 kB/s eta 0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.14.1



[notice] A new release of pip is available: 25.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [94]:
import emoji
print(emoji.demojize('Data speaks truth💡🗣️'))   #demojize used to know the meaning of the emoji

Data speaks truth:light_bulb::speaking_head:


# Tokenization:

### 1. Using the split function:

In [95]:
sentence = "I am Chetan N Revankar"
sentence.split()                        #word tokenization

['I', 'am', 'Chetan', 'N', 'Revankar']

In [96]:
sentence1 = "I am Chetan N Revankar. Future Data Engineer."
sentence1.split('.')                                              #sentence tokenization

['I am Chetan N Revankar', ' Future Data Engineer', '']

### 2. Using Regular Expression:

In [97]:
import re
sentence2 = "I am going to Dubai"
tokens = re.findall("[\w]+", sentence2)
tokens

  tokens = re.findall("[\w]+", sentence2)


['I', 'am', 'going', 'to', 'Dubai']

In [98]:
text = "Generative AI (GenAI) is a branch of artificial intelligence that can create new content such as text, images, code, or music by learning patterns from existing data. It uses advanced models like Large Language Models (LLMs) and Generative Adversarial Networks (GANs) to mimic human-like creativity. GenAI is widely applied in chatbots, content creation, design, and automation."
sentences = re.compile('[.!?] ').split(text)
sentences

['Generative AI (GenAI) is a branch of artificial intelligence that can create new content such as text, images, code, or music by learning patterns from existing data',
 'It uses advanced models like Large Language Models (LLMs) and Generative Adversarial Networks (GANs) to mimic human-like creativity',
 'GenAI is widely applied in chatbots, content creation, design, and automation.']

### 3. NLTK:

In [99]:
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

`word_tokenize` splits a sentence into individual words/tokens.

`sent_tokenize` splits a paragraph/text into sentences.

`Punkt` is a pre-trained, unsupervised tokenizer model in NLTK used for splitting text into sentences and words.

In [102]:
sent1 = "I am Chetan N Revankar. Future Data Engineer."
word_tokenize(sent1)

['I', 'am', 'Chetan', 'N', 'Revankar', '.', 'Future', 'Data', 'Engineer', '.']

In [103]:
sent_tokenize(sent1)

['I am Chetan N Revankar.', 'Future Data Engineer.']

### 4. Spacy:

In [104]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [105]:
doc1 = nlp(sent1)
doc1

I am Chetan N Revankar. Future Data Engineer.

In [106]:
for token in doc1:
    print(token)

I
am
Chetan
N
Revankar
.
Future
Data
Engineer
.


# Stemmer:
Stemming is about reducing words to their root form (often crude), mainly to normalize text for NLP tasks like search or classification.

In [1]:
from nltk.stem.porter import PorterStemmer

`nltk` Python library for NLP tasks like tokenization, stemming, lemmatization, stopword removal.

`stem` Submodule in NLTK with stemming algorithms (reduce words to root).

`porter` Porter algorithm removes common endings (-ing, -ed, -s, -ly).

`PorterStemmer` Class in NLTK that applies Porter algorithm for stemming.

In [3]:
from nltk.stem.porter import PorterStemmer  

ps = PorterStemmer()  

def stem_words(text):
    words = text.split()                              # Step 1: split text into words
    stemmed_words = [ps.stem(w) for w in words]       # Step 2: stem each word
    return " ".join(stemmed_words)                    # Step 3: join back into sentence

print(stem_words("running runner runs easily"))

run runner run easili


In [4]:
text = "My name is Chetan N Revankar from Bangalore, Rajajinagar. I recently completed my Bachelor's Degree from RNSIT with an aggregate of 8.23CGPA."
stem_words(text)

"my name is chetan n revankar from bangalore, rajajinagar. i recent complet my bachelor' degre from rnsit with an aggreg of 8.23cgpa."

# Lemmatization:
Process of reducing words to their base or dictionary form (lemma) using vocabulary and grammar rules, ensuring meaningful words.

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')     # WordNet database for lemmatization
nltk.download('omw-1.4')     # additional language support

wordnet_lemmatizer = WordNetLemmatizer()

text = "My name is Chetan N Revankar from Bangalore, Rajajinagar. I recently completed my Bachelor's Degree from RNSIT with an aggregate of 8.23CGPA."

punctuations = "?:!.,;"

# Step 1: Tokenize text (split into words + punctuation)
sentence_words = nltk.word_tokenize(text)

# Step 2: Remove punctuation
filtered_words = [word for word in sentence_words if word not in punctuations]

# Step 3: Print original word and its lemmatized form (as verb)
print("{0:20}{1:20}".format("Word","Lemma"))
for word in filtered_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Word                Lemma               
My                  My                  
name                name                
is                  be                  
Chetan              Chetan              
N                   N                   
Revankar            Revankar            
from                from                
Bangalore           Bangalore           
Rajajinagar         Rajajinagar         
I                   I                   
recently            recently            
completed           complete            
my                  my                  
Bachelor            Bachelor            
's                  's                  
Degree              Degree              
from                from                
RNSIT               RNSIT               
with                with                
an                  an                  
aggregate           aggregate           
of                  of                  
8.23CGPA            8.23CGPA            
