In [131]:
import pandas as pd
import numpy as np
import random

In [132]:
#increase number of rows displayed by default in jupyter notebook
pd.options.display.max_rows = 50000

In [133]:
#read data
audible_data = pd.read_csv("data/audible_dataset/audible_uncleaned.csv")

In [134]:
#preview of the data
audible_data.sample(4)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
6211,Skies Over Sweetwater,Writtenby:JuliaMoberg,Narratedby:SuzyMyers,5 hrs and 15 mins,17-09-08,English,Not rated yet,434.0
75938,Beauty and the Baller,Writtenby:IlsaMadden-Mills,"Narratedby:SebastianYork,SavannahPeachwood",10 hrs and 10 mins,29-03-22,English,Not rated yet,1008.0
38793,人をあるく　北条氏五代と小田原城,Writtenby:山口博,Narratedby:岡ゆかり,5 hrs and 48 mins,01-12-21,japanese,Not rated yet,837.0
50060,Void Moon,Writtenby:MichaelConnelly,Narratedby:L.J.Ganser,10 hrs and 59 mins,01-09-09,English,Not rated yet,500.0


In [135]:
#information about the data
audible_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         87489 non-null  object
 1   author       87489 non-null  object
 2   narrator     87489 non-null  object
 3   time         87489 non-null  object
 4   releasedate  87489 non-null  object
 5   language     87489 non-null  object
 6   stars        87489 non-null  object
 7   price        87489 non-null  object
dtypes: object(8)
memory usage: 5.3+ MB


In [136]:
#stats
audible_data.describe()

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
count,87489,87489,87489,87489,87489,87489,87489,87489.0
unique,82767,48374,29717,2284,5058,36,665,1011.0
top,The Art of War,"Writtenby:矢島雅弘,石橋遊",Narratedby:anonymous,2 mins,16-05-18,English,Not rated yet,586.0
freq,20,874,1034,372,773,61884,72417,5533.0


### 1. How to check total count of each category?

In [137]:
#checking number of occurances of books
book_counts = audible_data['name'].value_counts()
book_counts[book_counts >= 10]

The Art of War                 20
Sterling Biographies           19
The Odyssey                    16
Sterling Point Books           16
Hamlet                         15
The Prophet                    14
Pride and Prejudice            14
A Christmas Carol              14
The Iliad                      13
As a Man Thinketh              13
The Science of Getting Rich    13
The Picture of Dorian Gray     12
Abraham Lincoln                12
Meditations                    11
The Richest Man in Babylon     11
The Raven                      11
The Prince                     11
Unstoppable                    10
Name: name, dtype: int64

### 2. How to check number of duplicate rows?

In [138]:
#check for duplicate rows
audible_data.duplicated().sum()

0

In [None]:
#filter out all book names containing any special characters
#booknames_special_chars = audible_data[audible_data.name.str.contains(r'[@#\$%\+\*]')].drop_duplicates()
#number of books that contain special characters- 592 books
#booknames_special_chars.shape

#### Column 1: Author

### 3. How to remove a string pattern?

In [139]:
#Author column
#remove the phrase "Writtenby:"
audible_data['author'] = audible_data['author'].str.replace(pat = "Writtenby:", repl = "")

In [140]:
#after removing "Written By", author column looks like:
audible_data.author.sample(5)

26967                         FrancesMayes
38034          PhilipMatyszak,MichaelBayer
53866                        GasparaStampa
29384    JasonFried,DavidHeinemeierHansson
32503                                 OSHO
Name: author, dtype: object

### 4. How to add space between 2 or more word blocks? (here first, middle, last name)

In [141]:
#Add space between the first, middle and last names of Authors.
#e.g. JaneAustin becomes Jane Austin
audible_data['author'] = audible_data['author'].str.replace(pat = r"(\w)([A-Z])", repl = r"\1 \2", regex = True)

In [142]:
audible_data.author.sample(5)

30747               Brynne Conroy
27685    Douglas Max,Robert Bacal
81438            Erin Mc Kittrick
67782                Russell Targ
41880             Arrigo Cipriani
Name: author, dtype: object

### 5. How to split a value into multiple columns and assign column name with prefix/suffix?

In [143]:
audible_data2 = pd.concat( [audible_data['name'], 
                            audible_data['author'].str.split(',', expand = True).add_prefix('author'),
                            audible_data.loc[:,['narrator', 'time', 'releasedate', 'language', 'stars', 'price']]], 
                            axis = 1)

### 6. How to find total number of NA in some specific columns?

In [144]:
#number of missing values in author columns
audible_data2.loc[:,['author0','author1','author2','author3']].isnull().sum()

author0        0
author1    73762
author2    85135
author3    86713
dtype: int64

In [145]:
#remove "Narratedby:" from Narrator column
audible_data2['narrator'] = audible_data2['narrator'].str.replace(pat = "Narratedby:", repl = "")

In [146]:
#add space between first and last name of the Narrator
#e.g. JaneAustin becomes Jane Austin
audible_data2['narrator'] = audible_data2['narrator'].str.replace(pat = r"(\w)([A-Z])", repl = r"\1 \2", regex = True)

In [147]:
audible_data2.sample(5)

Unnamed: 0,name,author0,author1,author2,author3,narrator,time,releasedate,language,stars,price
57733,Über Leben,Dirk Steffens,Fritz Habekuß,,,"Dirk Steffens,Fritz Habekuß",6 hrs and 25 mins,11-05-20,german,Not rated yet,468.0
45308,The Winds of Darkover [International Edition],Marion Zimmer Bradley,,,,Pete Bradbury,5 hrs and 29 mins,16-11-21,English,Not rated yet,703.0
78074,The Hardest Route,A.S.Teague,,,,"Elizabeth Hart,Alex Kydd",8 hrs and 58 mins,16-07-19,English,Not rated yet,586.0
28137,Shine,Carley Hauck,,,,Carley Hauck,8 hrs and 58 mins,23-02-21,English,Not rated yet,703.0
4536,Maxima von Abundancia,Viola Sonntag,,,,Viola Sonntag,2 hrs and 23 mins,29-03-22,german,Not rated yet,367.0


In [148]:
#make a copy of time column to understand all kind of formats in which data is present
time_column = audible_data2['time']
#time_column_copy = time_column.copy()

In [149]:
#replace all numbers with blanks
time_column = time_column.str.replace(pat = r'[0-9]', repl = '', regex = True)
#keep only unique patterns
time_column.drop_duplicates()

0           hrs and  mins
4                     hrs
12           hrs and  min
29           hr and  mins
53                   mins
227                    hr
255           hr and  min
1203                  min
1401    Less than  minute
Name: time, dtype: object

In [150]:
#find rows with "less than x minute" values in audiobook duration
less_than_duration = audible_data2[audible_data2.time.str.contains(pat = 'less than', case = False)].index
#check all unique values with 'less than' pattern
audible_data2.time[less_than_duration].drop_duplicates()

1401    Less than 1 minute
Name: time, dtype: object

In [151]:
#create 2 columns- 1 for hour component and the other for minute component.
audible_data2['hour_component'] = 0
audible_data2['min_component'] = 0

### 7. How to extract digits before a specifc word?

In [152]:
#Extract the hour component of audiobook duration into hour column and minute into min_component column
audible_data2['hour_component'] = audible_data2.time.str.extract(pat = r'^(\d+) hr')
audible_data2['min_component'] = audible_data2.time.str.extract(pat = r'(\d+) min')

In [153]:
random.seed(10)
audible_data2.loc[:,['time', 'hour_component', 'min_component']].sample(6)

Unnamed: 0,time,hour_component,min_component
65382,2 hrs and 8 mins,2.0,8
54447,11 hrs and 33 mins,11.0,33
16823,1 hr and 16 mins,1.0,16
85144,8 hrs and 35 mins,8.0,35
10221,4 mins,,4
18810,5 hrs and 58 mins,5.0,58


### 8. How to convert NaN values to 0?

In [154]:
#convert all NaN values in hour and minute component column to 0
audible_data2['hour_component'] = audible_data2['hour_component'].fillna(0)
audible_data2['min_component'] = audible_data2['min_component'].fillna(0)

In [155]:
#sample preview
audible_data2.loc[[328,5532,1583], ['time', 'hour_component', 'min_component']]

Unnamed: 0,time,hour_component,min_component
328,11 mins,0,11
5532,12 mins,0,12
1583,5 mins,0,5


### 9. How to change the datatype of multiple columns?

In [156]:
#convert the hour and min column to int type
audible_data2 = audible_data2.astype({'hour_component':'int','min_component':'int'})

In [158]:
audible_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            87489 non-null  object
 1   author0         87489 non-null  object
 2   author1         13727 non-null  object
 3   author2         2354 non-null   object
 4   author3         776 non-null    object
 5   narrator        87489 non-null  object
 6   time            87489 non-null  object
 7   releasedate     87489 non-null  object
 8   language        87489 non-null  object
 9   stars           87489 non-null  object
 10  price           87489 non-null  object
 11  hour_component  87489 non-null  int32 
 12  min_component   87489 non-null  int32 
dtypes: int32(2), object(11)
memory usage: 8.0+ MB


In [159]:
audible_data2.releasedate

0        04-08-08
1        01-05-18
2        06-11-20
3        05-10-21
4        13-01-10
           ...   
87484    09-03-17
87485    21-02-17
87486    30-12-16
87487    23-02-11
87488    07-03-17
Name: releasedate, Length: 87489, dtype: object

### 10. How to check for inconsistencies in date column stored as a string?

In [164]:
#check for inconsistencies in month of release date
#Check to see whether middle values (months) contain any number > 12
#all unique months. 
audible_data2.releasedate.str.extract(pat = r'-(\d+)-').drop_duplicates()

Unnamed: 0,0
0,8
1,5
2,11
3,10
4,1
9,9
14,6
15,2
16,12
21,4


In [165]:
#check inconsistencies in year value of release date
#future release date is also present, hence values 25 for year 2025, 24 for year 2024...
audible_data2.releasedate.str.extract(pat = r'-(\d+)$').drop_duplicates()

Unnamed: 0,0
0,8
1,18
2,20
3,21
4,10
6,14
7,17
9,19
11,11
16,4


In [166]:
#in day value
#check if any value in day part > 31.
any(audible_data2.releasedate.str.extract(pat = r'^(\d+)-').drop_duplicates().astype(int) > 31)

False

### 11. How to convert date stored as string to 'date' type? 

Note: Check the separator used between day, month and year.  

- If date is like- 02/10/2023, use the format- **%d/%m/%Y**.  
- If date is like- 02-10-2023, use the format- **%d-%m-%Y**.  
- If year has 2 digits as in 02-10-23, use format- **%d-%m-%y**.  
- If year has 4 digits as in 02-10-2023, use Y in capital- **%d-%m-%Y**.

In [167]:
#convert release-date from string to date type
audible_data2.releasedate = pd.to_datetime(audible_data2.releasedate, format = '%d-%m-%y')

In [168]:
audible_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   name            87489 non-null  object        
 1   author0         87489 non-null  object        
 2   author1         13727 non-null  object        
 3   author2         2354 non-null   object        
 4   author3         776 non-null    object        
 5   narrator        87489 non-null  object        
 6   time            87489 non-null  object        
 7   releasedate     87489 non-null  datetime64[ns]
 8   language        87489 non-null  object        
 9   stars           87489 non-null  object        
 10  price           87489 non-null  object        
 11  hour_component  87489 non-null  int32         
 12  min_component   87489 non-null  int32         
dtypes: datetime64[ns](1), int32(2), object(10)
memory usage: 8.0+ MB


### 12. How to convert text to one uniform case? 

In [171]:
#language column
print(audible_data2.language.drop_duplicates().tolist())

['English', 'Hindi', 'Spanish', 'German', 'French', 'Catalan', 'Swedish', 'Italian', 'Danish', 'Finnish', 'Dutch', 'Hebrew', 'Russian', 'Polish', 'Galician', 'Afrikaans', 'Icelandic', 'Romanian', 'Japanese', 'Tamil', 'Portuguese', 'Urdu', 'Hungarian', 'Czech', 'Bulgarian', 'Mandarin_Chinese', 'Basque', 'Korean', 'Arabic', 'Greek', 'Turkish', 'Ukrainian', 'Slovene', 'Norwegian', 'Telugu', 'Lithuanian']


In [170]:
#make language names uniform by capitalizing the first letter.
audible_data2.language = audible_data2.language.str.title()

### 13. How to split a text column with a specific word acting as separator?

The objective is to create 2 columns out of the stars column:  
- **`stars_out_of_5`-** will contain numeric values like 2, 4.5 etc.  
- **`total_ratings`-** total number of reviews received on the particular audiobook.   



- This column will be split after the word 'stars'.  
- Then the first number before 'out' will be extracted and converted to int type.  
- In the `total_ratings` column, "ratings" will be removed and values will be converted to int type.  

In [172]:
#unique stars
audible_data2.stars.sample(5)

25955               Not rated yet
60409               Not rated yet
44213               Not rated yet
85258    5 out of 5 stars1 rating
40042               Not rated yet
Name: stars, dtype: object

In [173]:
#split stars column into 2
audible_data2[['stars_out_of_5', 'total_ratings']] = audible_data2.stars.str.split('stars', expand = True)

In [174]:
audible_data2.sample(4)

Unnamed: 0,name,author0,author1,author2,author3,narrator,time,releasedate,language,stars,price,hour_component,min_component,stars_out_of_5,total_ratings
13602,Serbian - Hindi. A complete method,J.M.Gardner,,,,"Dunja,Tulika",4 hrs and 19 mins,2020-11-13,English,Not rated yet,376.0,4,19,Not rated yet,
23190,Reminiscences of a Stock Operator (Wiley Tradi...,Edwin Lefevre,,,,uncredited,2 hrs and 28 mins,2013-10-29,English,4.5 out of 5 stars36 ratings,233.0,2,28,4.5 out of 5,36 ratings
34242,Unsavory Truth,Marion Nestle,,,,Norah Tocci,9 hrs and 39 mins,2018-11-07,English,5 out of 5 stars1 rating,500.0,9,39,5 out of 5,1 rating
1577,P'tit Loup visite une ferme,Orianne Lallemand,,,,Will Production,3 mins,2022-01-28,French,Not rated yet,74.0,0,3,Not rated yet,


In [175]:
audible_data2_copy = audible_data2.copy()

In [178]:
#replace 'Not rated yet' rows with NA

#rows with not yet rated value
not_rated_rows = audible_data2[audible_data2.stars_out_of_5.str.contains(pat = 'Not rated yet', case = False)].index
#replace 'Not yet rated' and 'None' with NA
audible_data2.loc[not_rated_rows,['stars_out_of_5', 'total_ratings']] = np.nan

In [179]:
#remove 'out of 5' part
audible_data2.stars_out_of_5 = audible_data2.stars.str.extract(pat = r'^(\d+)')

In [182]:
temp = audible_data2.copy()

In [187]:
temp['out_of_5'] = temp.stars_out_of_5.str.extract(pat = r'^(\d+|\d{1}\.\d{1})')
temp.loc[:,['stars_out_of_5', 'out_of_5']].head(10)

Unnamed: 0,stars_out_of_5,out_of_5
0,5 out of 5,5
1,4.5 out of 5,4
2,4.5 out of 5,4
3,4.5 out of 5,4
4,4.5 out of 5,4
5,5 out of 5,5
6,5 out of 5,5
7,5 out of 5,5
8,5 out of 5,5
9,5 out of 5,5
