In [20]:
import pandas as pd
import os
import ast
import re

In [21]:
# Locate the raw quotes CSV inside ./dataset (relative to the current
# working directory) and load it into a DataFrame.
dataset_dir = os.path.join(os.getcwd(), "dataset")
raw_data_path = os.path.join(dataset_dir, "raw_quotes_data.csv")
data = pd.read_csv(raw_data_path)

In [22]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,quote,author,tags
0,“Be yourself; everyone else is already taken.”,Oscar Wilde,"['be-yourself', 'gilbert-perreira', 'honesty',..."
1,"“I'm selfish, impatient and a little insecure....",Marilyn Monroe,"['best', 'life', 'love', 'mistakes', 'out-of-c..."
2,“Two things are infinite: the universe and hum...,Albert Einstein,"['human-nature', 'humor', 'infinity', 'philoso..."
3,"“So many books, so little time.”",Frank Zappa,"['books', 'humor']"
4,“A room without books is like a body without a...,Marcus Tullius Cicero,"['books', 'simile', 'soul']"


In [23]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(2508, 3)

In [24]:
# Number of distinct values in the author column (a missing value, if any,
# is counted as its own entry, matching len(unique())).
data["author"].nunique(dropna=False)

880

* The dataset contains quotes from 880 different authors

In [25]:
# Summarise column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2508 entries, 0 to 2507
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   quote   2508 non-null   object
 1   author  2508 non-null   object
 2   tags    2508 non-null   object
dtypes: object(3)
memory usage: 58.9+ KB


In [26]:
# Show rows that are exact repeats of an earlier row.
# NOTE(review): if "tags" already holds lists (i.e. the cell that parses it
# has been run in this kernel), duplicated() cannot hash them - confirm this
# cell runs before the parsing step.
duplicate_mask = data.duplicated()
data[duplicate_mask]

Unnamed: 0,quote,author,tags


* No duplicate rows in the dataset

In [27]:
# Count missing values per column.
data.isna().sum()

quote     0
author    0
tags      0
dtype: int64

* No missing values

In [28]:
# Inspect the Python type of the first entry in the tags column
# (prints nothing if the frame is empty).
for first_tags in data["tags"].head(1):
    print(type(first_tags))

<class 'str'>


In [29]:
# read_csv loads each tag list as its string representation
# (e.g. "['books', 'humor']"); parse those strings into real Python lists.
# Only str values are parsed so the cell is safe to re-run: calling
# ast.literal_eval on an already-parsed list raises ValueError.
data["tags"] = data["tags"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [30]:
# Re-check the Python type of the first tags entry
# (prints nothing if the frame is empty).
for first_tags in data["tags"].head(1):
    print(type(first_tags))

<class 'list'>


In [31]:
# Print every tag list that has no elements, one line per untagged quote.
untagged = data["tags"][data["tags"].map(len) == 0]
for tag_list in untagged:
    print(tag_list)

[]
[]
[]
[]
[]
[]
[]
[]
[]


* We have 9 quotes with no tags

In [32]:
## Defensive preprocessing to guard against unexpected values

# Text columns: replace missing values with a placeholder, then lower-case
# and trim surrounding whitespace. Note that the 'Unknown' placeholder is
# lower-cased along with everything else.
text_defaults = {'quote': '', 'author': 'Unknown'}
for column, placeholder in text_defaults.items():
    data[column] = data[column].fillna(placeholder).str.lower().str.strip()

# Tags: anything that is not already a list becomes an empty list.
data['tags'] = data['tags'].apply(
    lambda tags: tags if isinstance(tags, list) else []
)

# Keep only the first occurrence of each (quote, author) pair.
data = data.drop_duplicates(subset=['quote', 'author'])

In [33]:
# Rows whose quote is the empty string after normalisation.
empty_quote_mask = data["quote"].eq("")
data[empty_quote_mask]

Unnamed: 0,quote,author,tags


In [34]:
# Rows whose author was missing. The preprocessing cell fills missing authors
# with 'Unknown' and then lower-cases the column, so the stored placeholder
# is 'unknown'; comparing against 'Unknown' could never match.
data[data["author"] == "unknown"]

Unnamed: 0,quote,author,tags


In [35]:
def clean_text(text):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Punctuation and symbols are deleted, not replaced, so words joined by
    them are merged and whitespace around them is left as-is. Non-ASCII
    letters (e.g. accented characters) are removed as well.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    without_symbols = disallowed.sub('', text)
    return without_symbols.strip()

In [36]:
def clean_tags(tags_str):
    """Normalise a collection of tags into a list of cleaned tag strings.

    Each tag has every character other than ASCII letters, digits and
    whitespace removed (so 'be-yourself' becomes 'beyourself') and is then
    stripped; tags that end up empty are dropped.

    Args:
        tags_str: Either the string representation of a list of tags
            (e.g. "['books', 'humor']") or an already-parsed list/tuple/set
            of tags.

    Returns:
        A list of cleaned, non-empty tag strings. An empty list is returned
        when the input cannot be parsed or is not a collection of tags.
        Non-string entries inside the collection are skipped.
    """
    if isinstance(tags_str, str):
        try:
            tags_list = ast.literal_eval(tags_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed literal: best-effort cleaning, so treat as "no tags"
            # without swallowing unrelated errors such as KeyboardInterrupt.
            return []
    else:
        # Already-parsed input needs no round trip through str().
        tags_list = tags_str

    if not isinstance(tags_list, (list, tuple, set)):
        return []

    cleaned_tags = (
        re.sub(r'[^a-zA-Z0-9\s]', '', tag).strip()
        for tag in tags_list
        if isinstance(tag, str)
    )
    return [tag for tag in cleaned_tags if tag]

In [37]:
# Strip punctuation and symbols from the two free-text columns.
for text_column in ('quote', 'author'):
    data[text_column] = data[text_column].astype(str).apply(clean_text)

# clean_tags parses a string, so each tag list is stringified first.
data['tags'] = data['tags'].astype(str).apply(clean_tags)

In [39]:
# Display the cleaned frame.
data

Unnamed: 0,quote,author,tags
0,be yourself everyone else is already taken,oscar wilde,"[beyourself, gilbertperreira, honesty, inspira..."
1,im selfish impatient and a little insecure i m...,marilyn monroe,"[best, life, love, mistakes, outofcontrol, tru..."
2,two things are infinite the universe and human...,albert einstein,"[humannature, humor, infinity, philosophy, sci..."
3,so many books so little time,frank zappa,"[books, humor]"
4,a room without books is like a body without a ...,marcus tullius cicero,"[books, simile, soul]"
...,...,...,...
2503,morality is simply the attitude we adopt towar...,oscar wilde,"[morality, philosophy]"
2504,dont aim at success the more you aim at it and...,viktor e frankl,"[happiness, success]"
2505,in life finding a voice is speaking and living...,john grisham,[inspirationallife]
2506,winter is the time for comfort for good food a...,edith sitwell,"[comfort, home, winter]"


In [38]:
# Write the cleaned frame into ./dataset (relative to the current working
# directory); the row index is not written to the file.
output_dir = os.path.join(os.getcwd(), 'dataset')
processed_dataset_path = os.path.join(output_dir, 'processed_quotes_data_1.csv')
data.to_csv(processed_dataset_path, index=False)