**Problem:**

You are given the following dataset:
1. **Audible Data** : https://1drv.ms/u/s!AiqdXCxPTydhoog8ckLN-6Cw55fzIg?e=EWgZ5d

Your task is to:
- Find the problems with the datasets.
- Define the Data Quality Dimensions.
- Try to clean the datasets.

* `name` column has differant format
*  Data Redundancy such `author`, `narrator` columns
* `time`,  `releasedate`  column format issue, sometimes time in hours only, column only, no combination hours and minutes format
* `language` column has case issue
* `stars` column avg star rating and total rating together
* `price` column, there is not a sigle row that has used natural digits after decimal, only .00, e.g. memory consumption

In [114]:
import pandas as pd
books_df = pd.read_csv('audible_uncleaned.csv')
books_df.head()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,Writtenby:JeffKinney,Narratedby:DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0
3,Daughter of the Deep,Writtenby:RickRiordan,Narratedby:SoneelaNankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0
4,"The Lightning Thief: Percy Jackson, Book 1",Writtenby:RickRiordan,Narratedby:JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0


In [115]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         87489 non-null  object
 1   author       87489 non-null  object
 2   narrator     87489 non-null  object
 3   time         87489 non-null  object
 4   releasedate  87489 non-null  object
 5   language     87489 non-null  object
 6   stars        87489 non-null  object
 7   price        87489 non-null  object
dtypes: object(8)
memory usage: 5.3+ MB


In [116]:
import re

In [117]:
# 1. Handling Strange Characters
books_df['author'] = books_df['author'].str.replace(r'Ã', 'a').str.replace(r'â€™', '\'')
books_df['narrator'] = books_df['narrator'].str.replace(r'Ã', 'a').str.replace(r'â€™', '\'')
books_df['name'] = books_df['name'].str.replace(r'â€™', '\'')

In [118]:
# 2. Separating Authors and Narrators
books_df['author'] = books_df['author'].str.replace(r'Writtenby:', '').str.split(',')
books_df['narrator'] = books_df['narrator'].str.replace(r'Narratedby:', '')

In [119]:
def add_space_to_author(author_list):
    # Join the authors into a single string
    author_str = ', '.join(author_list)
    # Insert space before each capital letter (except the first letter)
    formatted_author = re.sub(r'(?<!^)(?=[A-Z])', ' ', author_str)
    return formatted_author

books_df['author'] = books_df['author'].apply(add_space_to_author)

In [120]:
def add_space_to_narrator(narrator_name):
    # Insert space before each capital letter (except the first letter)
    formatted_narrator = re.sub(r'(?<!^)(?=[A-Z])', ' ', narrator_name)
    return formatted_narrator

# Clean the narrator names
books_df['narrator'] = books_df['narrator'].apply(add_space_to_narrator)

In [121]:
# 3. Cleaning Name Column
books_df['name'] = books_df['name'].str.replace(r'#\d+', '').str.strip()

In [122]:
# 4. Parsing Time
def convert_time_to_minutes(time_str):
    hours = re.search(r'(\d+)\s*hrs', time_str)
    minutes = re.search(r'(\d+)\s*mins?', time_str)
    total_minutes = (int(hours.group(1)) * 60) if hours else 0
    total_minutes += int(minutes.group(1)) if minutes else 0
    return total_minutes

books_df['time_minutes'] = books_df['time'].apply(convert_time_to_minutes)

In [123]:
books_df.drop('time', axis=1, inplace=True)

In [124]:
# 5. Standardizing Release Dates
books_df['releasedate'] = pd.to_datetime(books_df['releasedate'], errors='coerce')

  books_df['releasedate'] = pd.to_datetime(books_df['releasedate'], errors='coerce')


In [125]:
# 6. Standardizing Language
books_df['language'] = books_df['language'].str.lower()

In [126]:
# 7. Cleaning Star Ratings
def clean_star_ratings(star_string):
    if "Not rated yet" in star_string:
        return 0, 0  # Assuming 0 stars and 0 ratings
    match = re.search(r'(\d+(?:\.\d+)?)\s*out of 5 stars(\d+)', star_string)
    if match:
        avg_stars = float(match.group(1))
        total_ratings = int(match.group(2))
        return avg_stars, total_ratings
    return np.nan, np.nan

books_df[['avg_stars', 'total_ratings']] = books_df['stars'].apply(clean_star_ratings).apply(pd.Series)
books_df['total_ratings'] = books_df['total_ratings'].astype(int)
books_df.drop('stars', axis=1, inplace=True)

In [127]:
# 8. Handling Price Values
books_df['price'] = pd.to_numeric(books_df['price'], errors='coerce')  # Convert prices to numeric
books_df['price'] = books_df['price'].fillna(0)  # Assume 'Free' translates to 0

In [128]:
books_df.isna().sum()

Unnamed: 0,0
name,0
author,0
narrator,0
releasedate,0
language,0
price,0
time_minutes,0
avg_stars,0
total_ratings,0


In [129]:
books_df

Unnamed: 0,name,author,narrator,releasedate,language,price,time_minutes,avg_stars,total_ratings
0,Geronimo Stilton #11 & #12,Geronimo Stilton,Bill Lobely,2008-04-08,english,468.0,140,5.0,34
1,The Burning Maze,Rick Riordan,Robbie Daymond,2018-01-05,english,820.0,788,4.5,41
2,The Deep End,Jeff Kinney,Dan Russell,2020-06-11,english,410.0,123,4.5,38
3,Daughter of the Deep,Rick Riordan,Soneela Nankani,2021-05-10,english,615.0,676,4.5,12
4,"The Lightning Thief: Percy Jackson, Book 1",Rick Riordan,Jesse Bernstein,2010-01-13,english,820.0,600,4.5,181
...,...,...,...,...,...,...,...,...,...
87484,Last Days of the Bus Club,Chris Stewart,Chris Stewart,2017-09-03,english,596.0,454,0.0,0
87485,The Alps,Stephen O' Shea,Robert Fass,2017-02-21,english,820.0,607,0.0,0
87486,The Innocents Abroad,Mark Twain,Flo Gibson,2016-12-30,english,938.0,1144,0.0,0
87487,A Sentimental Journey,Laurence Sterne,Anton Lesser,2011-02-23,english,680.0,248,0.0,0
