In [25]:
from google.colab import drive
drive.mount('/content/drive')  # Firstly, mount my Google Drive. Mount means I will be connecting with my Google Drive. Should use the default mount point, otherwise it will get failed.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
%cd /content/drive/My Drive/Elon/GenAI/dataset
!pwd
!ls

/content/drive/My Drive/Elon/GenAI/dataset
/content/drive/My Drive/Elon/GenAI/dataset
'IMDB Dataset.csv'


In [31]:
!pwd
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("GenAI").getOrCreate()
data_path = '/content/drive/MyDrive/Elon/GenAI/dataset/IMDB Dataset.csv'
df = spark.read.csv(data_path, header=True, inferSchema=True)
df.show()  # Dataframe.display() method is not supported in PySpark. This method is available in Databricks notebooks, but not in regular pySpark sessions or Google colab.

df.count()  #50000

/content/drive/MyDrive/Elon/GenAI/dataset
+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|            negative|
|"Petter Mattei's ...| power and succes...|
|"Probably my all-...| but that only ma...|
|I sure would like...|            positive|
|This show was an ...|            negative|
|Encouraged by the...|            negative|
|If you like origi...|            positive|
|"Phil the Alien i...|            negative|
|I saw this movie ...|            negative|
|"So im not a big ...| meaning most of ...|
|The cast played S...|            negative|
|This a fantastic ...|            positive|
|Kind of drawn in ...|            negative|
|Some films just s...|            positive|
|This movie made i...|            negative|
|I remember this f...|            

50000

Lower Case

In [32]:
## Data Preprocessing - Lower Case
from pyspark.sql.functions import lower
df_lower = df.withColumn("review", lower(df.review))
df_lower.show()

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|one of the other ...|            positive|
|"a wonderful litt...| not only is it w...|
|"i thought this w...| but spirited you...|
|basically there's...|            negative|
|"petter mattei's ...| power and succes...|
|"probably my all-...| but that only ma...|
|i sure would like...|            positive|
|this show was an ...|            negative|
|encouraged by the...|            negative|
|if you like origi...|            positive|
|"phil the alien i...|            negative|
|i saw this movie ...|            negative|
|"so im not a big ...| meaning most of ...|
|the cast played s...|            negative|
|this a fantastic ...|            positive|
|kind of drawn in ...|            negative|
|some films just s...|            positive|
|this movie made i...|            negative|
|i remember this f...|            positive|
|an awful film! it...|          

remove_html_tags

In [None]:
## Data preprocessing : remove html tags

In [33]:
import re           # re is Python's built-in Regular Expression module.
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

# validate the udf using dummy text
text = "<html><body><p> Movie 1</p><p> Actor - Jack Cheng</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"
remove_html_tags(text)

remove_html_tags_udf = udf(remove_html_tags, StringType())
df_lower = df_lower.withColumn("review", remove_html_tags_udf(df_lower["review"]))

df_lower.take(8)[7]['review']

"this show was an amazing, fresh & innovative idea in the 70's when it first aired. the first 7 or 8 years were brilliant, but things dropped off after that. by 1990, the show was not really funny anymore, and it's continued its decline further to the complete waste of time it is today.it's truly disgraceful how far this show has fallen. the writing is painfully bad, the performances are almost as bad - if not for the mildly entertaining respite of the guest-hosts, this show probably wouldn't still be on the air. i find it so hard to believe that the same creator that hand-selected the original cast also chose the band of hacks that followed. how can one recognize such brilliance and then see fit to replace it with such mediocrity? i felt i must give 2 stars out of respect for the original cast that made this show such a huge success. as it is now, the show is just awful. i can't believe it's still on the air."

remove URL

In [None]:
## Data Preprocessing - remove URL
def remove_url(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

## validate the udf using below dummy text
text1 = 'Check out my youtube https://www.youtube.com/'
text2 = 'Check out my linkedin https://www.linkedin.com/'
text3 = 'Google search here www.google.com'
text4 = 'For data click https://www.kaggle.com/'
remove_url(text1)

remove_url_udf = udf(remove_url, StringType())
df_lower = df_lower.withColumn("review", remove_url_udf(df_lower["review"]))

df_lower.take(8)[7]['review']

"this show was an amazing, fresh & innovative idea in the 70's when it first aired. the first 7 or 8 years were brilliant, but things dropped off after that. by 1990, the show was not really funny anymore, and it's continued its decline further to the complete waste of time it is today.it's truly disgraceful how far this show has fallen. the writing is painfully bad, the performances are almost as bad - if not for the mildly entertaining respite of the guest-hosts, this show probably wouldn't still be on the air. i find it so hard to believe that the same creator that hand-selected the original cast also chose the band of hacks that followed. how can one recognize such brilliance and then see fit to replace it with such mediocrity? i felt i must give 2 stars out of respect for the original cast that made this show such a huge success. as it is now, the show is just awful. i can't believe it's still on the air."

Punctuation handling

In [None]:
## Data Preprocessing - Punctuation handling
import string
pun = string.punctuation
def remove_pun(text):
    return text.translate(str.maketrans('', '', pun))

# validate the udf using below dummy text
text = 'string. With. Punctuation?'
remove_pun(text)

remove_pun_udf = udf(remove_pun, StringType())
df_lower = df_lower.withColumn("review", remove_pun_udf(df_lower["review"]))

df_lower.take(6)[5]['review']

'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children'

Chat Conversation handling

In [None]:
## Data Preprocessing - chat conversation handling

chat_words = {
    "AFAIK":"As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP":"As Soon As Possible",
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It"
}

def chat_conversation(text):
    # Split the text into words and replace abbreviations if they exist in chat_words
    updated_words = [
        chat_words[word.upper()] if word.upper() in chat_words else word
        for word in text.split()
    ]
    # Join the words back into a single string
    return " ".join(updated_words)

text = 'Do this work ASAP'
chat_conversation(text)

'Do this work As Soon As Possible'

Incorect text handling

In [1]:
#Data Preprocessing - Incorrect text handling
from textblob import TextBlob

def correct_text(text):
    return str(TextBlob(text).correct())

# validate the udf using below dummy text
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'
textBlb = TextBlob(incorrect_text)
textBlb.correct().string



'certain conditions during several generations are modified in the same manner.'

Stop Words

In [2]:
# Data Preprocessing - Stop Words

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [34]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

stopwords_en = stopwords.words('english')

def remove_stopwords(text):
    filtered_words = [word for word in text.split() if word not in stopwords_en]
    return ' '.join(filtered_words)

remove_stopwords('This is a sample sentence with some stop words in English.')
remove_stopwords('probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times')

UDF_remove_stopwords = udf(remove_stopwords, StringType())
df_lower = df_lower.withColumn("review", UDF_remove_stopwords(df_lower["review"]))

df_lower.take(6)[5]['review']

'"probably all-time favorite movie, story selflessness, sacrifice dedication noble cause, preachy boring. never gets old, despite seen 15 times last 25 years. paul lukas\' performance brings tears eyes, bette davis, one truly sympathetic roles, delight. kids are, grandma says, like ""dressed-up midgets"" children'

Remove emoji

In [37]:
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji("Loved the movie. It was 😘😘")

'Loved the movie. It was '

In [38]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/590.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m583.7/590.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [39]:
import emoji
print(emoji.demojize('Python is 🔥'))

Python is :fire:


# Tokenization

In [42]:
# word tokenization
sent1 = 'I am going to China'
sent1.split()

# sentence tokenization
sent2 = 'I am going to China. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

['I am going to China',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

# Stemmer

In [43]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [44]:
text = 'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'
stem_words(text)

'probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl caus but it not preachi or bore it just never get old despit my have seen it some 15 or more time in the last 25 year paul luka perform bring tear to my eye and bett davi in one of her veri few truli sympathet role is a delight the kid are as grandma say more like dressedup midget than children but that onli make them more fun to watch and the mother slow awaken to what happen in the world and under her own roof is believ and startl if i had a dozen thumb theyd all be up for thi movi'

# Lemmatization

## NOTE: Stemming & lamatization are same to retrieve root words but lamatization is worked good. Lamatization is slow & stemming is fast

In [48]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos='v')))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
