In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("C:\\Users\\ssc\\Documents\\audible_uncleaned.csv")

In [3]:
df.head(2)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0


## A. Structural & schema tasks

In [4]:
# 1.	Which columns have the wrong data type (e.g., time and releasedate as strings) 
#       and which should be converted?
df.dtypes

name           object
author         object
narrator       object
time           object
releasedate    object
language       object
stars          object
price          object
dtype: object

In [5]:
# 2.	Are there any rows or columns that are entirely empty or nearly empty — 
#       should any be dropped?
df.isnull().sum()

name           0
author         0
narrator       0
time           0
releasedate    0
language       0
stars          0
price          0
dtype: int64

In [6]:
df.rename(columns={
    'releasedate': 'release_date',
    'stars': 'rating_text'
}).head(2)

Unnamed: 0,name,author,narrator,time,release_date,language,rating_text,price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0


## B. Parsing / splitting fields

In [7]:
import re

In [8]:
#4.	How can you remove the prefixes Writtenby: 
#   and Narratedby: from author and narrator and store only the names?
df['author'] = df['author'].str.replace('Writtenby:', '', regex=False)
df['narrator'] = df['narrator'].str.replace('Narratedby:','', regex=False)
df.head(2)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,GeronimoStilton,BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,RickRiordan,RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0


In [9]:
#5.	Can you split name into parts
#(e.g., series name vs. book numbers, or title vs subtitle) where appropriate?
df['author'] = df['author'].str.replace(r'(?=[A-Z])', ' ', regex=True)
df['narrator'] = df['narrator'].str.replace(r'(?=[A-Z])', ' ', regex=True)
df.head(2)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Geronimo Stilton,Bill Lobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Rick Riordan,Robbie Daymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0


In [12]:
#6.	How can you parse time into consistent numeric fields 
#like duration_minutes (or duration_hours + duration_minutes)?
df['time'] = df['time'].str.lower().str.replace(' and ', ':')
df['time'] = df['time'].str.replace(r'\b(hrs?|hours?|mins?|minutes?)\b', '', regex=True).str.replace(r'\s+', '', regex=True)
df['time'] = pd.to_datetime( df['time'], format ="%H:%M", errors="coerce").dt.time
df

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Geronimo Stilton,Bill Lobely,02:20:00,04-08-08,English,5 out of 5 stars34 ratings,468.00
1,The Burning Maze,Rick Riordan,Robbie Daymond,13:08:00,01-05-18,English,4.5 out of 5 stars41 ratings,820.00
2,The Deep End,Jeff Kinney,Dan Russell,02:03:00,06-11-20,English,4.5 out of 5 stars38 ratings,410.00
3,Daughter of the Deep,Rick Riordan,Soneela Nankani,11:16:00,05-10-21,English,4.5 out of 5 stars12 ratings,615.00
4,"The Lightning Thief: Percy Jackson, Book 1",Rick Riordan,Jesse Bernstein,NaT,13-01-10,English,4.5 out of 5 stars181 ratings,820.00
...,...,...,...,...,...,...,...,...
87484,Last Days of the Bus Club,Chris Stewart,Chris Stewart,07:34:00,09-03-17,English,Not rated yet,596.00
87485,The Alps,Stephen O' Shea,Robert Fass,10:07:00,21-02-17,English,Not rated yet,820.00
87486,The Innocents Abroad,Mark Twain,Flo Gibson,19:04:00,30-12-16,English,Not rated yet,938.00
87487,A Sentimental Journey,Laurence Sterne,Anton Lesser,04:08:00,23-02-11,English,Not rated yet,680.00


In [14]:
# 7.	How can you parse stars to extract a numeric rating (e.g., 4.5) and the number of ratings (e.g., 34)?
#       What about when spacing or words are missing?

# Extract rating (e.g., 4.5 or 3.0)
df['rating'] = df['stars'].str.extract(r'([\d\.]+) out of')
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')  # convert safely to float

# Extract number of ratings (digits after 'stars')
df['num_ratings'] = df['stars'].str.extract(r'stars(\d+) ratings')
df['num_ratings'] = pd.to_numeric(df['num_ratings'], errors='coerce')  # convert safely to int

# Replace NaN with 0 for unrated products
df['Rating'] = df['rating'].fillna(0)
df['num_ratings'] = df['num_ratings'].fillna(0).astype(int)


df.head(2)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating,num_ratings,Rating
0,Geronimo Stilton #11 & #12,Geronimo Stilton,Bill Lobely,02:20:00,04-08-08,English,5 out of 5 stars34 ratings,468.0,5.0,34,5.0
1,The Burning Maze,Rick Riordan,Robbie Daymond,13:08:00,01-05-18,English,4.5 out of 5 stars41 ratings,820.0,4.5,41,4.5


## C. Dates & timezones

In [17]:
#8.	Are releasedate values in a consistent format (mm/dd/yyyy vs dd/mm/yyyy)? How can you programmatically detect the format 
#and convert to datetime64[ns]?

df['releasedate'] = pd.to_datetime(df['releasedate'], dayfirst=True, errors='coerce')
df.head(5)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating,num_ratings,Rating
0,Geronimo Stilton #11 & #12,Geronimo Stilton,Bill Lobely,02:20:00,2008-08-04,English,5 out of 5 stars34 ratings,468.0,5.0,34,5.0
1,The Burning Maze,Rick Riordan,Robbie Daymond,13:08:00,2018-05-01,English,4.5 out of 5 stars41 ratings,820.0,4.5,41,4.5
2,The Deep End,Jeff Kinney,Dan Russell,02:03:00,2020-11-06,English,4.5 out of 5 stars38 ratings,410.0,4.5,38,4.5
3,Daughter of the Deep,Rick Riordan,Soneela Nankani,11:16:00,2021-10-05,English,4.5 out of 5 stars12 ratings,615.0,4.5,12,4.5
4,"The Lightning Thief: Percy Jackson, Book 1",Rick Riordan,Jesse Bernstein,NaT,2010-01-13,English,4.5 out of 5 stars181 ratings,820.0,4.5,181,4.5


In [18]:
# 9.	After converting, can you extract useful features like release_year, release_month, release_day, or release_weekday?
# Extract features
df['release_year']   = df['releasedate'].dt.year
df['release_month']  = df['releasedate'].dt.month
df['release_day']    = df['releasedate'].dt.day
df['release_weekday'] = df['releasedate'].dt.day_name()   # full weekday name
df['release_week']   = df['releasedate'].dt.isocalendar().week

df.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating,num_ratings,Rating,release_year,release_month,release_day,release_weekday,release_week
0,Geronimo Stilton #11 & #12,Geronimo Stilton,Bill Lobely,02:20:00,2008-08-04,English,5 out of 5 stars34 ratings,468.0,5.0,34,5.0,2008,8,4,Monday,32
1,The Burning Maze,Rick Riordan,Robbie Daymond,13:08:00,2018-05-01,English,4.5 out of 5 stars41 ratings,820.0,4.5,41,4.5,2018,5,1,Tuesday,18
2,The Deep End,Jeff Kinney,Dan Russell,02:03:00,2020-11-06,English,4.5 out of 5 stars38 ratings,410.0,4.5,38,4.5,2020,11,6,Friday,45
3,Daughter of the Deep,Rick Riordan,Soneela Nankani,11:16:00,2021-10-05,English,4.5 out of 5 stars12 ratings,615.0,4.5,12,4.5,2021,10,5,Tuesday,40
4,"The Lightning Thief: Percy Jackson, Book 1",Rick Riordan,Jesse Bernstein,NaT,2010-01-13,English,4.5 out of 5 stars181 ratings,820.0,4.5,181,4.5,2010,1,13,Wednesday,2


In [23]:
# 10.Are there impossible or invalid dates (e.g., month > 12 if format misinterpreted)? How will you detect and fix them?
invalid_rows = df[df['releasedate'].isna()]
print("Invalid count:", len(invalid_rows))
print(invalid_rows)
#If they existed, we would detect them with isna() after pd.to_datetime conversion, and either drop or replace them depending on business needs.


Invalid count: 0
Empty DataFrame
Columns: [name, author, narrator, time, releasedate, language, stars, price, rating, num_ratings, Rating, release_year, release_month, release_day, release_weekday, release_week]
Index: []


## D. Missing values and special null indicators

In [25]:
import numpy as np

In [27]:
#11.	Which values should be considered "missing" (e.g., "", "NA", "None", "-", "n/a") and how will you standardize them to np.nan?
df = df.replace(r'^\s*$|^(NA|None|-|n/a)$', np.nan, regex=True)
df.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating,num_ratings,Rating,release_year,release_month,release_day,release_weekday,release_week
0,Geronimo Stilton #11 & #12,Geronimo Stilton,Bill Lobely,02:20:00,2008-08-04,English,5 out of 5 stars34 ratings,468.0,5.0,34,5.0,2008,8,4,Monday,32
1,The Burning Maze,Rick Riordan,Robbie Daymond,13:08:00,2018-05-01,English,4.5 out of 5 stars41 ratings,820.0,4.5,41,4.5,2018,5,1,Tuesday,18
2,The Deep End,Jeff Kinney,Dan Russell,02:03:00,2020-11-06,English,4.5 out of 5 stars38 ratings,410.0,4.5,38,4.5,2020,11,6,Friday,45
3,Daughter of the Deep,Rick Riordan,Soneela Nankani,11:16:00,2021-10-05,English,4.5 out of 5 stars12 ratings,615.0,4.5,12,4.5,2021,10,5,Tuesday,40
4,"The Lightning Thief: Percy Jackson, Book 1",Rick Riordan,Jesse Bernstein,NaT,2010-01-13,English,4.5 out of 5 stars181 ratings,820.0,4.5,181,4.5,2010,1,13,Wednesday,2


In [30]:
# 12.	For each column, how many missing values exist and are there patterns (missing by author, by year, etc.)?
print("Total no of rows:",df.shape)
print("Null rows in each column:")
df.isna().sum()


Total no of rows: (87489, 16)
Null rows in each column:


name                   0
author                 0
narrator               0
time               16098
releasedate            0
language               0
stars                  0
price                  0
rating             72417
num_ratings            0
Rating                 0
release_year           0
release_month          0
release_day            0
release_weekday        0
release_week           0
dtype: int64

In [33]:
# 13.	For columns with missing values, which imputation strategy is appropriate (leave as missing, fill with mode/median, forward/backfill,
# or domain default)?

#  for numeric values median and for objects take mode
df = df.fillna({col: df[col].median() if df[col].dtype != 'object' else df[col].mode().iloc[0] for col in df.columns})
df.sample(10)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating,num_ratings,Rating,release_year,release_month,release_day,release_weekday,release_week
51323,The Complete Fiction of H.P. Lovecraft,H. P. Lovecraft,"Andrew Leman, Sean Branney",01:02:00,2019-04-25,English,4.5 out of 5 stars6 ratings,1172.0,4.5,6,4.5,2019,4,25,Thursday,17
71535,The Naughty Princess,Claire Contreras,"Mia Madison, Will Watt",04:46:00,2021-09-07,English,Not rated yet,680.0,4.5,0,0.0,2021,9,7,Tuesday,36
58071,Creating a Portfolio like Warren Buffett,Jeeva Ramaswamy,Kevin Young,07:37:00,2020-07-09,English,4.5 out of 5 stars6 ratings,668.0,4.5,6,4.5,2020,7,9,Thursday,28
30274,Franchise Your Business,"Mark Siebert, John Leonesio-foreword",Al Kessel,12:47:00,2018-11-13,English,5 out of 5 stars1 rating,703.0,5.0,0,5.0,2018,11,13,Tuesday,46
64115,I racconti più belli della saggezza zen,"David Santoro, Lorenzo Casadei",Federico Melis,04:53:00,2021-10-19,italian,Not rated yet,267.0,4.5,0,0.0,2021,10,19,Tuesday,42
42778,The Marriage Act,Liza Monroy,Liza Monroy,07:21:00,2014-02-19,English,Not rated yet,657.0,4.5,0,0.0,2014,2,19,Wednesday,8
7835,The Return to Phantasmagoria,El Holly,Rebecca Rogers,01:02:00,2022-01-14,English,Not rated yet,234.0,4.5,0,0.0,2022,1,14,Friday,2
989,The Drowned Vault,N. D. Wilson,Thomas Vincent Kelly,12:28:00,2012-09-11,English,Not rated yet,1206.0,4.5,0,0.0,2012,9,11,Tuesday,37
37568,The Apache Diaspora,Paul Conrad,Jonathan Yen,15:52:00,2022-03-08,English,Not rated yet,703.0,4.5,0,0.0,2022,3,8,Tuesday,10
46579,A Lesson Before Dying: CliffsNotes,Durthy A. Washington,Luke Daniels,03:12:00,2012-04-25,English,Not rated yet,352.0,4.5,0,0.0,2012,4,25,Wednesday,17


In [34]:
print("Null rows in each column:")
df.isna().sum()

Null rows in each column:


name               0
author             0
narrator           0
time               0
releasedate        0
language           0
stars              0
price              0
rating             0
num_ratings        0
Rating             0
release_year       0
release_month      0
release_day        0
release_weekday    0
release_week       0
dtype: int64

## E. Text cleaning & normalization

In [39]:
# 14.Are there leading/trailing spaces or extra internal whitespace in any text fields? How will you trim them?
df[df.select_dtypes('object').columns] = df.select_dtypes('object').apply(lambda x: x.str.strip().replace(r'\s+', ' ', regex=True))
df.sample(5)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating,num_ratings,Rating,release_year,release_month,release_day,release_weekday,release_week
50004,Civil War Chronicles - Komplett,Alfred Wallon,Thorsten Jost,20:42:00,2022-04-01,german,Not rated yet,669.0,4.5,0,0.0,2022,4,1,Friday,13
50202,Et familieanliggende,Rohinton Mistry,Martin Johannes Møller,19:25:00,2021-12-02,danish,Not rated yet,267.0,4.5,0,0.0,2021,12,2,Thursday,48
2795,Die schönsten griechischen Sagen,Dimiter Inkiow,Peter Kaempfe,01:18:00,2014-01-20,german,Not rated yet,233.0,4.5,0,0.0,2014,1,20,Monday,4
50461,Quichotte,Salman Rushdie,Henrik Zangenberg,13:44:00,2021-03-11,danish,Not rated yet,703.0,4.5,0,0.0,2021,3,11,Thursday,10
35773,Die Demenz und Ich - Herz über Kopf,Miriam Sonnenberg,Miriam Gronau,03:09:00,2020-11-06,german,Not rated yet,501.0,4.5,0,0.0,2020,11,6,Friday,45


In [41]:
# 15.Does casing matter? Should you standardize language, author, and narrator to title case or lowercase?
cols = ['name', 'author', 'narrator', 'language','stars','release_weekday']
df[cols] = df[cols].apply(lambda x: x.str.lower())
df.head(3)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price,rating,num_ratings,Rating,release_year,release_month,release_day,release_weekday,release_week
0,geronimo stilton #11 & #12,geronimo stilton,bill lobely,02:20:00,2008-08-04,english,5 out of 5 stars34 ratings,468.0,5.0,34,5.0,2008,8,4,monday,32
1,the burning maze,rick riordan,robbie daymond,13:08:00,2018-05-01,english,4.5 out of 5 stars41 ratings,820.0,4.5,41,4.5,2018,5,1,tuesday,18
2,the deep end,jeff kinney,dan russell,02:03:00,2020-11-06,english,4.5 out of 5 stars38 ratings,410.0,4.5,38,4.5,2020,11,6,friday,45


In [None]:
# 16.Are there punctuation or special characters to remove (e.g., stray colons, HTML entities)?

In [None]:
# 17.Do you need to correct common misspellings or inconsistent variants (e.g., Englis, english, ENGLISH)?

## F. Numeric parsing & cleaning

In [None]:
#18.	Is price stored as numeric or string? If string, how will you strip currency symbols and thousands separators then convert to float?


In [None]:
#19.	Are there non-numeric characters in numeric fields (e.g., ,, $, %)? How will you remove them safely?

In [None]:
#20.	After extracting rating (numeric) and rating_count (integer), are there any non-numeric or malformed values to handle?

## G. Consistency & canonicalization

In [None]:
#21.	Should author and narrator names be normalized to a consistent format (e.g., First Last)? How to handle initials, middle names, etc.?

## H. Duplicates & near-duplicates

In [50]:
#23.	Are there exact duplicate rows — how many and which ones should be removed?
print("no of duplicated rows",df.duplicated().sum())
df = df.drop_duplicates(keep='first')
print("no of duplicated rows",df.duplicated().sum())

no of duplicated rows 0
no of duplicated rows 0


In [None]:
#24.	Are there near-duplicates (same title + author but slightly different punctuation or whitespace)? How will you detect and consolidate them?