# Data cleaning

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load scraped data

In [2]:
# Load the encoded chords table; the first CSV column is used as the row index.
raw_data = pd.read_csv(
    filepath_or_buffer='../Datasets/Chords_encoded.csv',
    index_col=0,
)
raw_data

Unnamed: 0,Artists,Songs,Rating,Votes,Level,A,A#,A#dim,A#m,A(maj),...,Gmaj7/D,Go,Gsus,Gsus2,Gsus2/A,Gsus2/C,Gsus4,Gsus4/B,H,Ho
0,jeff buckley,hallelujah,4.87,39010,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,john legend,all of me,4.83,19766,advanced,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,ed sheeran,perfect,4.85,30548,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,passenger,let her go,4.84,16657,intermediate,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,jason mraz,im yours,4.74,11924,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,the marshall tucker band,cant you see,4.80,1966,novice,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,lessons guitar,all the chords,4.92,2670,intermediate,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,creedence clearwater revival,fortunate son,4.65,457,intermediate,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,no doubt,dont speak,4.82,1147,advanced,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Dtypes and non-null counts for the five descriptive (non-chord) columns.
raw_data.loc[:, ["Artists", "Songs", "Rating", "Votes", "Level"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Artists  1000 non-null   object
 1   Songs    1000 non-null   object
 2   Rating   1000 non-null   object
 3   Votes    1000 non-null   int64 
 4   Level    999 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.9+ KB


## Data copy & transformation

### Copy

In [4]:
# Work on an independent (deep) copy so that raw_data stays untouched.
data = raw_data.copy(deep=True)

### Drop outliers indexed 488, 676 & 996

In [5]:
# Look the rows up by index *label* (.loc), not by position (.iloc): the
# following drop() call is label-based, so this shows exactly the rows that
# will be removed even if the index is not a contiguous 0..n-1 range.
data.loc[[996, 676, 488]]

Unnamed: 0,Artists,Songs,Rating,Votes,Level,A,A#,A#dim,A#m,A(maj),...,Gmaj7/D,Go,Gsus,Gsus2,Gsus2/A,Gsus2/C,Gsus4,Gsus4/B,H,Ho
996,lessons guitar,all the chords,4.92,2670,intermediate,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
676,xtc,ball and chain,"0,&q",0,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
488,jack johnson,better together,4.73,510,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Remove the three rows inspected above, by index label:
#   996 - "all the chords" by "lessons guitar" (presumably a lesson page, not a song)
#   676 - Rating holds the non-numeric value "0,&q" and Votes is 0
#   488 - Level is missing
data = data.drop(index=[996, 676, 488])

In [7]:
# Re-check non-null counts of the descriptive columns after dropping the rows.
data.loc[:, ["Artists", "Songs", "Rating", "Votes", "Level"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 997 entries, 0 to 999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Artists  997 non-null    object
 1   Songs    997 non-null    object
 2   Rating   997 non-null    object
 3   Votes    997 non-null    int64 
 4   Level    997 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.7+ KB


### Drop H & Ho columns

In [8]:
# Remove the "H" and "Ho" chord columns by label.
data = data.drop(columns=["H", "Ho"])
data

Unnamed: 0,Artists,Songs,Rating,Votes,Level,A,A#,A#dim,A#m,A(maj),...,Gmaj,Gmaj7,Gmaj7/D,Go,Gsus,Gsus2,Gsus2/A,Gsus2/C,Gsus4,Gsus4/B
0,jeff buckley,hallelujah,4.87,39010,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,john legend,all of me,4.83,19766,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,ed sheeran,perfect,4.85,30548,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,passenger,let her go,4.84,16657,intermediate,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,jason mraz,im yours,4.74,11924,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,tracy chapman,fast car,4.81,583,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
995,the marshall tucker band,cant you see,4.80,1966,novice,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,creedence clearwater revival,fortunate son,4.65,457,intermediate,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,no doubt,dont speak,4.82,1147,advanced,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Convert Rating to float

In [9]:
# Rating is read in as object dtype; cast it to float64 now that the
# non-numeric entry has been dropped.
data["Rating"] = data["Rating"].astype("float64")

In [10]:
# Confirm that Rating is now float64.
data.loc[:, ["Artists", "Songs", "Rating", "Votes", "Level"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 997 entries, 0 to 999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Artists  997 non-null    object 
 1   Songs    997 non-null    object 
 2   Rating   997 non-null    float64
 3   Votes    997 non-null    int64  
 4   Level    997 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 46.7+ KB


In [11]:
# Summary (count / unique / top / freq) of the text columns.
data.loc[:, ["Artists", "Songs", "Level"]].describe()

Unnamed: 0,Artists,Songs,Level
count,997,997,997
unique,409,785,3
top,taylor swift,hallelujah,advanced
freq,33,6,752


In [12]:
# Summary statistics of the numeric columns.
data.loc[:, ["Rating", "Votes"]].describe()

Unnamed: 0,Rating,Votes
count,997.0,997.0
mean,4.773701,1939.40321
std,0.136882,2871.447681
min,3.54,3.0
25%,4.74,497.0
50%,4.81,1127.0
75%,4.85,2214.0
max,4.93,39010.0


### Check duplicates & re-index

In [13]:
# drop_duplicates returns a new frame, so the result must be assigned back;
# otherwise neither the de-duplication nor the 0..n-1 re-index
# (ignore_index=True) persists and the exported file keeps the gapped index.
data = data.drop_duplicates(ignore_index=True)
data

Unnamed: 0,Artists,Songs,Rating,Votes,Level,A,A#,A#dim,A#m,A(maj),...,Gmaj,Gmaj7,Gmaj7/D,Go,Gsus,Gsus2,Gsus2/A,Gsus2/C,Gsus4,Gsus4/B
0,jeff buckley,hallelujah,4.87,39010,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,john legend,all of me,4.83,19766,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,ed sheeran,perfect,4.85,30548,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,passenger,let her go,4.84,16657,intermediate,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,jason mraz,im yours,4.74,11924,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,tracy chapman,fast car,4.81,583,advanced,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
993,the marshall tucker band,cant you see,4.80,1966,novice,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
994,creedence clearwater revival,fortunate son,4.65,457,intermediate,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
995,no doubt,dont speak,4.82,1147,advanced,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<font color=blue>**No duplicates found**</font>

### Chords frequency

In [14]:
# Columns 0-4 are Artists, Songs, Rating, Votes and Level; everything from
# position 5 onward is a chord column. Sum each chord column over all songs
# and list the chords from most to least frequent.
data.iloc[:, 5:].sum(axis=0).sort_values(ascending=False)

G            789
C            678
D            594
Em           479
Am           447
            ... 
Dm7/G          1
Dm7b5          1
Dmadd4         1
Dmaj9          1
Dadd11/F#      1
Length: 446, dtype: int64

In [15]:
# Five most frequent chords (per-column sums over the chord columns).
data.iloc[:, 5:].sum(axis=0).nlargest(n=5)

G     789
C     678
D     594
Em    479
Am    447
dtype: int64

In [16]:
# Twenty most frequent chords (per-column sums over the chord columns).
data.iloc[:, 5:].sum(axis=0).nlargest(n=20)

G        789
C        678
D        594
Em       479
Am       447
F        374
A        328
E        247
Bm       226
Dm       177
F#m      130
B        114
Em7      102
Bb       101
D/F#     100
G/B       81
A7        80
Cadd9     76
C#m       75
D7        75
dtype: int64

## Export clean data

In [17]:
# Write the cleaned table to disk; the row index is written as the first column.
data.to_csv(path_or_buf='../Datasets/Chords_clean.csv')