In [None]:
from google.colab import drive
drive.mount('/content/drive')  # Firstly, mount my Google Drive. Mount means I will be connecting with my Google Drive. Should use the default mount point, otherwise it will get failed.


In [12]:
%cd /content/drive/My Drive/Elon/GenAI/dataset   # relocating the folder where my data located
!pwd
!ls

/content/drive/My Drive/Elon/GenAI/dataset
/content/drive/My Drive/Elon/GenAI/dataset
'IMDB Dataset.csv'


In [26]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("GenAI").getOrCreate()
data_path = 'IMDB Dataset.csv'
df = spark.read.csv(data_path, header=True, inferSchema=True)
df.show()  # Dataframe.display() method is not supported in PySpark. This method is available in Databricks notebooks, but not in regular pySpark sessions or Google colab.

df.count()  #50000

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|            negative|
|"Petter Mattei's ...| power and succes...|
|"Probably my all-...| but that only ma...|
|I sure would like...|            positive|
|This show was an ...|            negative|
|Encouraged by the...|            negative|
|If you like origi...|            positive|
|"Phil the Alien i...|            negative|
|I saw this movie ...|            negative|
|"So im not a big ...| meaning most of ...|
|The cast played S...|            negative|
|This a fantastic ...|            positive|
|Kind of drawn in ...|            negative|
|Some films just s...|            positive|
|This movie made i...|            negative|
|I remember this f...|            positive|
|An awful film! It...|          

50000

Lower Case

In [19]:
## Data Preprocessing - Lower Case
from pyspark.sql.functions import lower
df_lower = df.withColumn("review", lower(df.review))
df_lower.show()

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|one of the other ...|            positive|
|"a wonderful litt...| not only is it w...|
|"i thought this w...| but spirited you...|
|basically there's...|            negative|
|"petter mattei's ...| power and succes...|
|"probably my all-...| but that only ma...|
|i sure would like...|            positive|
|this show was an ...|            negative|
|encouraged by the...|            negative|
|if you like origi...|            positive|
|"phil the alien i...|            negative|
|i saw this movie ...|            negative|
|"so im not a big ...| meaning most of ...|
|the cast played s...|            negative|
|this a fantastic ...|            positive|
|kind of drawn in ...|            negative|
|some films just s...|            positive|
|this movie made i...|            negative|
|i remember this f...|            positive|
|an awful film! it...|          

remove_html_tags

In [None]:
## Data preprocessing : remove html tags

In [32]:
import re           # re is Python's built-in Regular Expression module.
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

# validate the udf using dummy text
text = "<html><body><p> Movie 1</p><p> Actor - Jack Cheng</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"
remove_html_tags(text)

remove_html_tags_udf = udf(remove_html_tags, StringType())
df_lower = df_lower.withColumn("review", remove_html_tags_udf(df_lower["review"]))

df_lower.take(8)[7]['review']

"this show was an amazing, fresh & innovative idea in the 70's when it first aired. the first 7 or 8 years were brilliant, but things dropped off after that. by 1990, the show was not really funny anymore, and it's continued its decline further to the complete waste of time it is today.it's truly disgraceful how far this show has fallen. the writing is painfully bad, the performances are almost as bad - if not for the mildly entertaining respite of the guest-hosts, this show probably wouldn't still be on the air. i find it so hard to believe that the same creator that hand-selected the original cast also chose the band of hacks that followed. how can one recognize such brilliance and then see fit to replace it with such mediocrity? i felt i must give 2 stars out of respect for the original cast that made this show such a huge success. as it is now, the show is just awful. i can't believe it's still on the air."

remove URL

In [37]:
## Data Preprocessing - remove URL
def remove_url(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

## validate the udf using below dummy text
text1 = 'Check out my youtube https://www.youtube.com/'
text2 = 'Check out my linkedin https://www.linkedin.com/'
text3 = 'Google search here www.google.com'
text4 = 'For data click https://www.kaggle.com/'
remove_url(text1)

remove_url_udf = udf(remove_url, StringType())
df_lower = df_lower.withColumn("review", remove_url_udf(df_lower["review"]))

df_lower.take(8)[7]['review']

"this show was an amazing, fresh & innovative idea in the 70's when it first aired. the first 7 or 8 years were brilliant, but things dropped off after that. by 1990, the show was not really funny anymore, and it's continued its decline further to the complete waste of time it is today.it's truly disgraceful how far this show has fallen. the writing is painfully bad, the performances are almost as bad - if not for the mildly entertaining respite of the guest-hosts, this show probably wouldn't still be on the air. i find it so hard to believe that the same creator that hand-selected the original cast also chose the band of hacks that followed. how can one recognize such brilliance and then see fit to replace it with such mediocrity? i felt i must give 2 stars out of respect for the original cast that made this show such a huge success. as it is now, the show is just awful. i can't believe it's still on the air."

Punctuation handling

In [44]:
## Data Preprocessing - Punctuation handling
import string
pun = string.punctuation
def remove_pun(text):
    return text.translate(str.maketrans('', '', pun))

# validate the udf using below dummy text
text = 'string. With. Punctuation?'
remove_pun(text)

remove_pun_udf = udf(remove_pun, StringType())
df_lower = df_lower.withColumn("review", remove_pun_udf(df_lower["review"]))

df_lower.take(6)[5]['review']

'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children'

Chat Conversation handling

In [46]:
## Data Preprocessing - chat conversation handling

chat_words = {
    "AFAIK":"As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP":"As Soon As Possible",
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It"
}

def chat_conversation(text):
    # Split the text into words and replace abbreviations if they exist in chat_words
    updated_words = [
        chat_words[word.upper()] if word.upper() in chat_words else word
        for word in text.split()
    ]
    # Join the words back into a single string
    return " ".join(updated_words)

text = 'Do this work ASAP'
chat_conversation(text)

'Do this work As Soon As Possible'