In [35]:
import pandas as pd

df = pd.read_csv('dataset.csv',index_col=0)
df.describe(include=['object', 'bool'])

Unnamed: 0,track_id,artists,album_name,track_name,explicit,genre
count,114000,113999,113999,113999,114000,114000
unique,89741,31437,46589,73608,2,114
top,6S3JlDAGk3uu3NtZbPnuhS,The Beatles,Alternative Christmas 2022,Run Rudolph Run,False,acoustic
freq,9,279,195,151,104253,1000


In [36]:
ncols, nrows = df.shape
print(f'Dataset has {ncols} rows and {nrows} columns')
df.drop_duplicates(subset=['track_name', 'artists'], inplace=True)
print(f'After dropping rows with duplicated songs, there are {df.shape[0]} rows left')


Dataset has 114000 rows and 20 columns
After dropping rows with duplicated songs, there are 81344 rows left


In [37]:
# Missing values in each row
missing_values_per_row = df.isnull().sum(axis=1)
count_per_missing_value = missing_values_per_row.value_counts().sort_index()

# Print the results
for missing, rows in count_per_missing_value.items():
    print(f'{rows} row(s) have {missing} missing values')

total_rows_with_missing_values = (df.isnull().any(axis=1)).sum()
print(f'Total number of rows with missing values: {total_rows_with_missing_values}')

row_drop = df[df.isnull().any(axis=1)].index
df.drop(row_drop, inplace=True)

print(f'Row dropped. new shape: {df.shape}')

81343 row(s) have 0 missing values
1 row(s) have 3 missing values
Total number of rows with missing values: 1
Row dropped. new shape: (81343, 20)


'acoustic' 'afrobeat' 'alt-rock' 'ambient' 'black-metal' 'blues'
 'breakbeat' 'cantopop' 'chicago-house' 'chill' 'classical' 'club'
 'comedy' 'country' 'dance' 'dancehall' 'death-metal' 'deep-house'
 'detroit-techno' 'disco' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'guitar' 'hard-rock' 'hardcore'
 'hardstyle' 'heavy-metal' 'hip-hop' 'house' 'indian' 'indie-pop'
 'industrial' 'jazz' 'k-pop' 'metal' 'metalcore' 'minimal-techno'
 'new-age' 'opera' 'party' 'piano' 'pop' 'pop-film' 'power-pop'
 'progressive-house' 'psych-rock' 'punk' 'punk-rock' 'rock' 'rock-n-roll'
 'romance' 'sad' 'salsa' 'samba' 'sertanejo' 'show-tunes'
 'singer-songwriter' 'ska' 'sleep' 'songwriter' 'soul' 'spanish' 'swedish'
 'tango' 'techno' 'trance' 'trip-hop'

In [38]:
# combining similar genres
df['genre'] = df['genre'].replace(['alternative', 'grunge'], 'alt-rock')
df['genre'] = df['genre'].replace(['emo', 'punk', 'garage'], 'punk-rock')
df['genre'] = df['genre'].replace(['hard-rock', 'rock-n-roll', 'heavy metal'], 'rock')
df['genre'] = df['genre'].replace(['dance', 'indie-pop', 'synth-pop'], 'pop')
df['genre'] = df['genre'].replace(['electro', 'trance', 'techno', 'idm', 'house'], 'edm')

In [41]:
# filter our some genres that are unpopular
selected_genres = ['alt-rock', 'classical', 'country', 'edm',
                   'hip-hop', 'jazz', 'latin', 'pop','punk-rock', 'rock']
filtered_df = df[df['genre'].isin(selected_genres)].copy()
print(f'New shape: {filtered_df.shape}')


New shape: (13558, 20)


In [42]:
filtered_df.to_csv('cleaned_data_f.csv', index=False)
print("successfully made new CSV file with cleaned data")

successfully made new CSV file with cleaned data
