# Data cleaning

### Import libraries

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load scraped data

In [103]:
# Read the scraped, encoded chord table; the first CSV column holds the row
# labels and is used as the DataFrame index.
raw_data = pd.read_csv(
    filepath_or_buffer='../Datasets/Chords_encoded.csv',
    index_col=0,
)
raw_data

Unnamed: 0,Artists,Songs,Rating,Votes,Level,A,A#m,A(maj),A5,A6,...,G,G#dim,G#dim7,G5,G9,Gadd9,Gb,Gm,Gm/Bb,Gm6
0,jeff buckley,hallelujah,4.87,38995,intermediate,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,john legend,all of me,4.83,19759,expert,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed sheeran,perfect,4.85,30528,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,passenger,let her go,4.84,16650,intermediate,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,jason mraz,im yours,4.74,11920,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,lessons guitar,all the chords,4.92,2670,novice,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,the marshall tucker band,cant you see,4.80,1964,novice,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,creedence clearwater revival,fortunate son,4.65,457,intermediate,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
998,no doubt,dont speak,4.82,1145,expert,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
# Summarise dtypes and non-null counts for the five descriptive columns only;
# the chord columns that follow them are left out to keep the report short.
raw_data.loc[:, ["Artists", "Songs", "Rating", "Votes", "Level"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Artists  1000 non-null   object
 1   Songs    1000 non-null   object
 2   Rating   1000 non-null   object
 3   Votes    1000 non-null   int64 
 4   Level    999 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.9+ KB


## Data copy & transformation

### Copy

In [106]:
# Work on an independent (deep) copy so `raw_data` stays exactly as loaded.
data = raw_data.copy(deep=True)

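### Drop invalid rows with index labels 488, 676 & 995

Row 488 has no `Level`, row 676 has an unparseable `Rating` (`0,&q`) with 0 votes, and row 995 (`lessons guitar` – `all the chords`) looks like a chord-lesson page rather than a song.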

In [107]:
# Inspect the rows that are about to be removed. The lookup is label-based
# (`.loc`) rather than positional (`.iloc`) so that it selects rows the same way
# `DataFrame.drop` does: by index label.
data.loc[[995, 676, 488]]

Unnamed: 0,Artists,Songs,Rating,Votes,Level,A,A#m,A(maj),A5,A6,...,G,G#dim,G#dim7,G5,G9,Gadd9,Gb,Gm,Gm/Bb,Gm6
995,lessons guitar,all the chords,4.92,2670,novice,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
676,xtc,ball and chain,"0,&q",0,expert,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
488,jack johnson,better together,4.73,510,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
# Remove the three unusable rows by index label:
#   995 -> artist "lessons guitar", song "all the chords" (presumably a lesson
#          page rather than a song -- confirm against the scraped source)
#   676 -> Rating is the non-numeric string "0,&q" and Votes is 0
#   488 -> Level is missing
# The result is derived from `raw_data` instead of from `data` itself so the
# cell is idempotent: re-running it does not raise a KeyError for labels that
# were already dropped.
data = raw_data.drop(index=[995, 676, 488])

In [110]:
# Re-check row count and null counts of the descriptive columns after the drop.
data.loc[:, ["Artists", "Songs", "Rating", "Votes", "Level"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 997 entries, 0 to 999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Artists  997 non-null    object
 1   Songs    997 non-null    object
 2   Rating   997 non-null    object
 3   Votes    997 non-null    int64 
 4   Level    997 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.7+ KB


### Convert rating to float

In [111]:
# Rating was loaded as `object`; cast it to a 64-bit float column so it can be
# used in numeric summaries.
data["Rating"] = data["Rating"].astype(float)

In [113]:
# Confirm that Rating is now reported with a float dtype.
data.loc[:, ["Artists", "Songs", "Rating", "Votes", "Level"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 997 entries, 0 to 999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Artists  997 non-null    object 
 1   Songs    997 non-null    object 
 2   Rating   997 non-null    float64
 3   Votes    997 non-null    int64  
 4   Level    997 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 46.7+ KB


In [115]:
# Summary of the text columns: count, number of unique values, most frequent
# value and its frequency.
data.loc[:, ["Artists", "Songs", "Level"]].describe()

Unnamed: 0,Artists,Songs,Level
count,997,997,997
unique,409,786,4
top,taylor swift,hallelujah,advanced
freq,33,6,313


In [116]:
# Summary statistics (count, mean, std, min, quartiles, max) of the two
# numeric descriptive columns.
data.loc[:, ["Rating", "Votes"]].describe()

Unnamed: 0,Rating,Votes
count,997.0,997.0
mean,4.773661,1938.692076
std,0.136883,2869.634242
min,3.54,3.0
25%,4.74,497.0
50%,4.81,1127.0
75%,4.85,2221.0
max,4.93,38995.0


### Check duplicates & re-index

In [117]:
# Count fully duplicated rows explicitly so the result does not have to be
# inferred by comparing row counts in the displayed frame.
print(f"Duplicated rows: {data.duplicated().sum()}")

# Preview of the de-duplicated frame with a fresh 0..n-1 index. The result is
# not assigned, so `data` itself keeps its current rows and index labels.
data.drop_duplicates(ignore_index=True)

Unnamed: 0,Artists,Songs,Rating,Votes,Level,A,A#m,A(maj),A5,A6,...,G,G#dim,G#dim7,G5,G9,Gadd9,Gb,Gm,Gm/Bb,Gm6
0,jeff buckley,hallelujah,4.87,38995,intermediate,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,john legend,all of me,4.83,19759,expert,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed sheeran,perfect,4.85,30528,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,passenger,let her go,4.84,16650,intermediate,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,jason mraz,im yours,4.74,11920,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,tracy chapman,fast car,4.81,583,expert,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
993,the marshall tucker band,cant you see,4.80,1964,novice,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
994,creedence clearwater revival,fortunate son,4.65,457,intermediate,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
995,no doubt,dont speak,4.82,1145,expert,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<font color="blue">No duplicates found.</font>

### Re-index dataframe

In [119]:
# List the column labels that come after the five descriptive columns
# (Artists, Songs, Rating, Votes, Level), i.e. the chord columns.
print(data.iloc[:, 5:].columns)

Index(['A', 'A#m', 'A(maj)', 'A5', 'A6', 'A7', 'Aadd9', 'Ab', 'Am', 'Am/B',
       'Am/C', 'Am7', 'Am7/G', 'Amaj7/B', 'Asus2', 'Asus2/E', 'Asus4', 'B',
       'B5', 'Baug', 'Bb', 'Bb/A', 'Bbdim', 'Bm', 'Bm/G', 'Bm7', 'Bm9',
       'Bmaj7', 'Bsus2', 'C', 'C#', 'C#5', 'C#7', 'C#M', 'C#dim', 'C#m',
       'C#m7', 'C/B', 'C2', 'C7/E', 'CM7', 'CM9', 'Cm/D#', 'Cm/F', 'Cm6/D',
       'Cmaj7', 'Cmaj9', 'D', 'D#5', 'D#7', 'D/A', 'D5', 'D6', 'D7/F#',
       'D7sus4/G', 'D9', 'DM7', 'Dadd9', 'Dm', 'Dm7', 'Dmaj9', 'Dsus2',
       'Dsus4', 'E', 'E/B', 'E6', 'E7/G#', 'Eb/G', 'EbM7', 'Em/B', 'Em/C#',
       'Em/D', 'Em7', 'Esus', 'Esus2', 'F', 'F#7', 'F#m/E', 'F#sus4', 'F/E',
       'F/G', 'F6/9', 'F7', 'FM7', 'Fm', 'Fm7', 'Fmaj', 'Fmaj7#11', 'Fsus2',
       'G', 'G#dim', 'G#dim7', 'G5', 'G9', 'Gadd9', 'Gb', 'Gm', 'Gm/Bb',
       'Gm6'],
      dtype='object')
