# Text Preprocessing

In [1]:
import pandas as pd 
import numpy as np

In [2]:
pd = pd.read_csv("imdb_movies.csv")

In [3]:
pd.shape

(10178, 12)

In [4]:
pd.head()

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03-02-2023,73,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04-05-2023,76,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01-05-2023,70,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61,Action,<h>Good-hearted teenager William always lived ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [5]:
pd["overview"][4]

'<h>Good-hearted teenager William always lived in hope of following in his late father’s footsteps and becoming a storm chaser. His father’s legacy has now been turned into a storm-chasing tourist business, managed by the greedy and reckless Zane Rogers, who is now using William as the main attraction to lead a group of unsuspecting adventurers deep into the eye of the most dangerous supercell ever seen.</h>'

#### Lower Casing 

In [6]:
pd["overview"][3].lower()

'through a series of unfortunate events, three mummies end up in present-day london and embark on a wacky and hilarious journey in search of an old ring belonging to the royal family, stolen by ambitious archaeologist lord carnaby.'

#### All the values in lower case of overview 

In [7]:
pd["overview"] = pd["overview"].str.lower()

In [8]:
pd

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03-02-2023,73,"Drama, Action","after dominating the boxing world, adonis cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,2.716167e+08,AU
1,Avatar: The Way of Water,12/15/2022,78,"Science Fiction, Adventure, Action",set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2.316795e+09,AU
2,The Super Mario Bros. Movie,04-05-2023,76,"Animation, Adventure, Family, Fantasy, Comedy","while working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,7.244590e+08,AU
3,Mummies,01-05-2023,70,"Animation, Comedy, Family, Adventure, Fantasy","through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,3.420000e+07,AU
4,Supercell,03/17/2023,61,Action,<h>good-hearted teenager william always lived ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,3.409420e+08,US
...,...,...,...,...,...,...,...,...,...,...,...,...
10173,20th Century Women,12/28/2016,73,Drama,"in 1979 santa barbara, california, dorothea fi...","Annette Bening, Dorothea Fields, Lucas Jade Zu...",20th Century Women,Released,English,7000000.0,9.353729e+06,US
10174,Delta Force 2: The Colombian Connection,08/24/1990,54,Action,when dea agents are taken captive by a ruthles...,"Chuck Norris, Col. Scott McCoy, Billy Drago, R...",Delta Force 2: The Colombian Connection,Released,English,9145817.8,6.698361e+06,US
10175,The Russia House,12/21/1990,61,"Drama, Thriller, Romance","barley scott blair, a lisbon-based editor of r...","Sean Connery, Bartholomew 'Barley' Scott Blair...",The Russia House,Released,English,21800000.0,2.299799e+07,US
10176,Darkman II: The Return of Durant,07-11-1995,55,"Action, Adventure, Science Fiction, Thriller, ...",darkman and durant return and they hate each o...,"Larry Drake, Robert G. Durant, Arnold Vosloo, ...",Darkman II: The Return of Durant,Released,English,116000000.0,4.756613e+08,US


#### Remove the HTML Tags 

In [9]:
import re 
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',text)

In [10]:
text = "<h> good-hearted teenager william always lived in hope of following in his late father’s footsteps and becoming a storm chaser. his father’s legacy has now been turned into a storm-chasing tourist business, managed by the greedy and reckless zane rogers, who is now using william as the main attraction to lead a group of unsuspecting adventurers deep into the eye of the most dangerous supercell ever seen. </h>"

In [11]:
remove_html_tags(text)

' good-hearted teenager william always lived in hope of following in his late father’s footsteps and becoming a storm chaser. his father’s legacy has now been turned into a storm-chasing tourist business, managed by the greedy and reckless zane rogers, who is now using william as the main attraction to lead a group of unsuspecting adventurers deep into the eye of the most dangerous supercell ever seen. '

#### Apply on the dataset

In [12]:
pd["overview"].apply(remove_html_tags)

0        after dominating the boxing world, adonis cree...
1        set more than a decade after the events of the...
2        while working underground to fix a water main,...
3        through a series of unfortunate events, three ...
4        good-hearted teenager william always lived in ...
                               ...                        
10173    in 1979 santa barbara, california, dorothea fi...
10174    when dea agents are taken captive by a ruthles...
10175    barley scott blair, a lisbon-based editor of r...
10176    darkman and durant return and they hate each o...
10177    princess odette and prince derek are going to ...
Name: overview, Length: 10178, dtype: object

#### Remove URL's From Dataset

In [13]:
text1 = "This is the datset https://www.kaggle.com/datasets/ashpalsingh1525/imdb-movies-dataset"
text2 = "Hello sir this is me www.google.com"


In [14]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)

In [15]:
remove_url(text2)

'Hello sir this is me '

#### Remove Puntuaction

In [16]:
import time , string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
exclude = string.punctuation

In [18]:
def remove_puntuation(text):
    for char in exclude:
        text = text.replace(char ," ")
    return text

In [19]:
text = "string. With. Puntuation?"

In [20]:
start = time.time()
print(remove_puntuation(text))
time1 = time.time()-start
print(time1*50000)

string  With  Puntuation 
0.0


In [21]:
def remove_punc(text):
    return text.translate(str.maketrans("","",exclude))


In [22]:
start = time.time()
print(remove_punc(text))
time2 = time.time() - start
print(time2*50000)

string With Puntuation
0.0


In [23]:
time1/time2

ZeroDivisionError: float division by zero

#### Chat Word Treatment
##### Did not find any dataset but the code is here 

In [24]:
def chat_conversion(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_word[w.upper()])
        else:
            new_text.append(w)
    return "".join(new_text)

#### Spelling Checker

In [25]:
from textblob import TextBlob

In [26]:
incorrect_box = "I went to teh libary to borow some bukks on histry and geogrophy."
textblob = TextBlob(incorrect_box)
textblob.correct().string

'I went to the library to brow some bunks on history and geography.'

#### Stop Words

In [27]:
import nltk
from nltk.corpus import stopwords
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


KeyboardInterrupt: 

In [None]:
stopwords.words('english')

#### Remove Emojis

In [None]:
import re 
def remove_emojis(text):
    emojis_pattern = re.compile("["
                                u"\U0001F600 - \U0001F64F"  # emotions
                                u"\U0001F300 - \U0001F5FF"  # Symbols & Pictographs
                                u"\U0001F680 - \U0001F6FF"  # transport & map symbol
                                u"\U0001F1E0 - \U0001F1FF"  # flags (ios)
                                u"\U00002720 - \U000027B0"
                                u"\U000024C2 - \U0001F251"
                                "]+", flags=re.UNICODE)
    return emojis_pattern.sub(r"",text)

In [None]:
text = "What does 😂 or ☺ mean? "

In [None]:
remove_emojis(text)

'Whatdoes😂or☺mean?'

In [None]:
import emoji
print(emoji.demojize("Python is 😂"))

Python is :face_with_tears_of_joy:


## Tokenization

In [28]:
# Word Tokenization
sent1 = " I am going to delhi"
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [None]:
# Sentence Tokenization 
sent2 = " I am going to delhi. I will stay there for 3 days. Let's hope the trip is good"
sent2.split(".")

[' I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip is good"]

: 

In [1]:
# Problem with the split function
sent1 = "I am going to delhi!"
sent1.split()

['I', 'am', 'going', 'to', 'delhi!']

#### To solve Problem using 
#### 1. Regular Expression
#### 2. NLTK
#### 3. Spacy

In [2]:
## Regular Expression 

import re
sent1 = "I am going to delhi!"
tokens = re.findall("[\w']+",sent1)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [4]:
## Using NLTK
from nltk import word_tokenize , sent_tokenize

In [5]:
sent1 = "I am going to delhi!"
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'delhi', '!']

#### Using Spacy

In [6]:
import spacy
nlp = spacy.load('en_code_web_sm')

OSError: [E050] Can't find model 'en_code_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
doc1 = nlp(sent1)
doc2 = nlp(sent2)

In [None]:
for token in  doc1:
    print(token)

## Steeming 

In [11]:
from nltk.stem.porter import PorterStemmer

In [13]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [14]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [15]:
text = "Today unfolds with fresh possibilities, weaving moments of calm and bursts of energy. Morning light brings clarity, midday delivers productivity, and evening invites reflection. Whether tackling tasks, connecting with others, or simply pausing to breathe, each instant holds potential. Cherish today’s rhythm—it’s a gift, unique and unrepeatable."
stem_words(text) 

'today unfold with fresh possibilities, weav moment of calm and burst of energy. morn light bring clarity, midday deliv productivity, and even invit reflection. whether tackl tasks, connect with others, or simpli paus to breathe, each instant hold potential. cherish today’ rhythm—it’ a gift, uniqu and unrepeatable.'

## Lemmatization

In [24]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
sentence = "He was runnning and eating at same time. He has bad habit of swimming after playing long hours in the sun."
punctuations = "?:!.,:"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word","lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))

Word                lemma               


LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\aksha/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\aksha\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
