# Spotify Recommendation

Before running the following code, ensure you have read the "Usage" section of the README.md file explaining how to use this notebook.

In [4]:
# Import necessary packages
import pandas as pd

In [5]:
# Returns a cleaned version of the given DataFrame (leaving the original unchanged)
def clean_data(df):
    
    # Create a deep copy of the DataFrame to ensure the original is unedited
    clean_df = df.copy()

    # Find duplicate tracks
    print('Finding duplicates...')
    dupe_mask = clean_df['track_id'].duplicated(keep=False)
    dupes = clean_df[dupe_mask].sort_values('track_id')
    print(f' - Total duplicated rows: {len(dupes)}')
    num_duplicated_songs = clean_df['track_id'].duplicated().sum()
    print(f' - Number of songs with duplicates: {num_duplicated_songs}')
    print('Done')

    # Remove duplicate tracks
    num_rows_before = len(clean_df)
    print(f'\nCurrent number of rows in DataFrame: {num_rows_before}')
    print('\nRemoving duplicates...')
    clean_df = clean_df.drop_duplicates(subset='track_id', keep='first')
    print('Done')
    num_rows_after = len(clean_df)
    print(f'\nNew number of rows in DataFrame: {num_rows_after}')
    print(f'Removed {num_rows_before - num_rows_after} duplicates')
    print(f'Number of unique track_ids: {clean_df['track_id'].nunique()}')

    # Drop unnecessary columns
    print('\nDropping the unnecessary columns "track_id" and "Unnamed: 0"...')
    clean_df = clean_df.drop('track_id', axis=1)
    clean_df = clean_df.drop('Unnamed: 0', axis=1)
    print('Done')

    return clean_df

In [6]:
# Create a DataFrame object from the dataset (in CSV format)
unclean_df = pd.read_csv('dataset.csv')

# Clean the data to remove duplicates and unnecessary columns
df = clean_data(unclean_df)

# Show a random sample of 10 tracks to view some of the data
df.sample(10)

Finding duplicates...
 - Total duplicated rows: 40900
 - Number of songs with duplicates: 24259
Done

Current number of rows in DataFrame: 114000

Removing duplicates...
Done

New number of rows in DataFrame: 89741
Removed 24259 duplicates
Number of unique track_ids: 89741

Dropping the unnecessary columns "track_id" and "Unnamed: 0"...
Done


Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
17396,Glee Cast;Gwyneth Paltrow,Glee The 3D Concert Movie (Motion Picture Soun...,Forget You (feat. Gwyneth Paltrow),20,162080,False,0.571,0.947,7,-4.695,1,0.0812,0.176,0.0,0.9,0.579,127.411,4,club
61620,AKB48,UZA<Type-A>【通常盤】,次のSeason(アンダーガールズ),21,264706,False,0.468,0.919,8,-4.926,1,0.0513,0.189,7e-06,0.31,0.507,97.469,4,j-idol
85832,Drei Meter Feldweg,Gewinner,"Die Nacht, der Alkohol und wir",36,175000,False,0.469,0.945,1,-3.986,0,0.0667,0.00159,0.0,0.0816,0.463,97.583,4,punk-rock
23489,Chris Lorenzo;LP Giobbi;High Jinx,California Dreamin',California Dreamin' (feat. High Jinx) (LP Giob...,57,144761,False,0.61,0.966,11,-4.116,1,0.305,0.000645,0.247,0.303,0.631,126.001,4,deep-house
50127,Rata Blanca,Entre el Cielo y el Infierno,Sin Tu Amor Nada Existe,32,354200,False,0.534,0.648,4,-12.992,1,0.028,0.000927,0.0313,0.165,0.428,96.077,4,heavy-metal
83259,Dimitri Vangelis & Wyman,Safe,Safe,64,225230,False,0.36,0.845,0,-4.742,1,0.0344,0.0316,3.4e-05,0.063,0.231,130.066,4,progressive-house
10390,Krafty Kuts;Sporty-O,Let's Ride,Let's Ride,14,261126,False,0.803,0.86,10,-4.065,0,0.143,0.00583,1e-05,0.0453,0.603,129.98,4,breakbeat
45606,Anna Kitkowska,Anna Kitkowska,Love Me Like You Do,42,198208,False,0.635,0.231,11,-12.608,0,0.0419,0.939,0.947,0.0925,0.321,144.229,4,guitar
41503,Within Temptation;Tarja,Hydra,Paradise (What About Us?) (feat. Tarja),12,319688,False,0.461,0.928,4,-3.542,0,0.0699,0.00058,0.0011,0.615,0.433,135.031,4,goth
108822,Domingo Federico,The Roots of Tango - Mi Dolor,Para Que Te Quiero Tanto,16,156544,False,0.651,0.2,1,-8.959,0,0.0819,0.956,0.326,0.262,0.456,125.822,4,tango
