In [1]:
import pandas as pd 
import random
import re

In [2]:
p = 0.02  # fraction of data rows to keep (about 2%)

# Fixed seed so that Restart Kernel -> Run All draws the same sample each time;
# without it every run produces a different frame and different outputs.
random.seed(42)

df = pd.read_csv(
    'spotify_dataset.csv',
    on_bad_lines='skip',  # skip malformed lines instead of raising
    # Line 0 (the header) is always kept; every later line is skipped
    # whenever the random draw exceeds p, i.e. kept with probability p.
    skiprows=lambda i: i > 0 and random.random() > p,
)

In [3]:
# Preview the first 20 rows of the sampled data.
df.head(20)

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,"Crosby, Stills & Nash",Helplessly Hoping,HARD ROCK 2010
1,07f0fc3be95dcd878966b1f9572ff670,C418,Chris,C418
2,07f0fc3be95dcd878966b1f9572ff670,C418,Équinoxe,C418
3,07f0fc3be95dcd878966b1f9572ff670,Pretty Lights,I Know the Truth,Chill out
4,07f0fc3be95dcd878966b1f9572ff670,Ludwig van Beethoven,"Sonata No. 23 In F Minor, Op. 57 Appassionata""...",Classique
5,07f0fc3be95dcd878966b1f9572ff670,Ratatat,Kennedy,Electro
6,07f0fc3be95dcd878966b1f9572ff670,The Prodigy,Take Me To The Hospital,Electro
7,07f0fc3be95dcd878966b1f9572ff670,Skrillex,Bangarang (feat. Sirah),Soirée
8,07f0fc3be95dcd878966b1f9572ff670,Shaka Ponk,Let's Bang,Soirée
9,07f0fc3be95dcd878966b1f9572ff670,Dschinghis Khan,Moskau - Long Version,Soirée


In [4]:
# (rows, columns) of the sampled frame.
df.shape

(258410, 4)

In [5]:
# Tidy the column labels: remove the embedded double quotes, every
# occurrence of the substring 'name', and all spaces.
df.columns = (
    df.columns
    .str.replace('"', '')
    .str.replace('name', '')
    .str.replace(' ', '')
)
df.columns

Index(['user_id', 'artist', 'track', 'playlist'], dtype='object')

In [6]:
# Check the frame after the column labels were cleaned.
df.head()

Unnamed: 0,user_id,artist,track,playlist
0,9cc0cfd4d7d7885102480dd99e7a90d6,"Crosby, Stills & Nash",Helplessly Hoping,HARD ROCK 2010
1,07f0fc3be95dcd878966b1f9572ff670,C418,Chris,C418
2,07f0fc3be95dcd878966b1f9572ff670,C418,Équinoxe,C418
3,07f0fc3be95dcd878966b1f9572ff670,Pretty Lights,I Know the Truth,Chill out
4,07f0fc3be95dcd878966b1f9572ff670,Ludwig van Beethoven,"Sonata No. 23 In F Minor, Op. 57 Appassionata""...",Classique


In [7]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258410 entries, 0 to 258409
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   258410 non-null  object
 1   artist    257732 non-null  object
 2   track     258408 non-null  object
 3   playlist  258396 non-null  object
dtypes: object(4)
memory usage: 7.9+ MB


In [8]:
# Summarise missing values per column: the absolute null count and the
# share of all rows it represents (in percent), largest first.
null_counts = df.isnull().sum()
count = null_counts.sort_values(ascending=False)
percentage = (null_counts / len(df) * 100).sort_values(ascending=False)
missing_data = pd.concat(
    [count, percentage],
    axis=1,
    keys=['Count', 'Percentage'],
)

print(missing_data)

          Count  Percentage
artist      678    0.262374
playlist     14    0.005418
track         2    0.000774
user_id       0    0.000000


In [9]:
# Drop every row that has at least one missing value, then renumber the
# index from 0 (drop=True discards the old index instead of keeping it
# as a column).
df = df.dropna().reset_index(drop=True)

In [10]:
# Shape after dropping the rows with missing values.
df.shape

(257717, 4)

In [11]:
# Remove special characters
def text_clean(text):
    text = re.sub(r"[^a-zA-Z:$-,%.?!]+", ' ', text)
    return text

In [12]:
# Delete every character that is neither a word character nor whitespace
# from the artist and track names. The pattern is a raw string so that
# \w and \s reach the regex engine without being treated as (invalid)
# Python string escapes.
for column in ('artist', 'track'):
    df[column] = df[column].str.replace(r'[^\w\s]', '', regex=True)

In [13]:
# Inspect the frame after punctuation was stripped from artist and track.
df.head(20)

Unnamed: 0,user_id,artist,track,playlist
0,9cc0cfd4d7d7885102480dd99e7a90d6,Crosby Stills Nash,Helplessly Hoping,HARD ROCK 2010
1,07f0fc3be95dcd878966b1f9572ff670,C418,Chris,C418
2,07f0fc3be95dcd878966b1f9572ff670,C418,Équinoxe,C418
3,07f0fc3be95dcd878966b1f9572ff670,Pretty Lights,I Know the Truth,Chill out
4,07f0fc3be95dcd878966b1f9572ff670,Ludwig van Beethoven,Sonata No 23 In F Minor Op 57 Appassionata Ass...,Classique
5,07f0fc3be95dcd878966b1f9572ff670,Ratatat,Kennedy,Electro
6,07f0fc3be95dcd878966b1f9572ff670,The Prodigy,Take Me To The Hospital,Electro
7,07f0fc3be95dcd878966b1f9572ff670,Skrillex,Bangarang feat Sirah,Soirée
8,07f0fc3be95dcd878966b1f9572ff670,Shaka Ponk,Lets Bang,Soirée
9,07f0fc3be95dcd878966b1f9572ff670,Dschinghis Khan,Moskau Long Version,Soirée


In [14]:
def remove_whitespace(text):
    """ This function will remove
        extra whitespaces from the text

    '?' and ')' are first padded with spaces so that a word written directly
    against them is not merged with them into one token. Every run of
    whitespace is then collapsed to a single space and leading/trailing
    whitespace is removed.

    arguments:
        text: "text" of type "String".

    return:
        value: "text" after extra whitespaces removed.

    Example:
    Input : How   are   you   doing   ?
    Output : How are you doing ?

    """
    # Pad before collapsing: padding after the collapse would put back the
    # double and trailing spaces this function is meant to remove.
    padded = text.replace('?', ' ? ').replace(')', ') ')
    return re.sub(r'\s+', ' ', padded).strip()

In [15]:
# Build the "<artist>-<track>-lyrics" slug intended for Genius lyrics links
# (only the slug is stored here, not a full URL): join the parts, normalise
# the whitespace, then turn every remaining space into a hyphen.
df['artist_track'] = (
    (df['artist'] + ' ' + df['track'] + ' lyrics')
    .apply(remove_whitespace)
    .str.strip()
    .str.replace(' ', '-', regex=False)
)

# Remove extra whitespaces from other columns
for column in ('user_id', 'artist', 'track', 'playlist'):
    df[column] = df[column].apply(remove_whitespace).str.strip()

df.head()

Unnamed: 0,user_id,artist,track,playlist,artist_track
0,9cc0cfd4d7d7885102480dd99e7a90d6,Crosby Stills Nash,Helplessly Hoping,HARD ROCK 2010,Crosby-Stills-Nash-Helplessly-Hoping-lyrics
1,07f0fc3be95dcd878966b1f9572ff670,C418,Chris,C418,C418-Chris-lyrics
2,07f0fc3be95dcd878966b1f9572ff670,C418,Équinoxe,C418,C418-Équinoxe-lyrics
3,07f0fc3be95dcd878966b1f9572ff670,Pretty Lights,I Know the Truth,Chill out,Pretty-Lights-I-Know-the-Truth-lyrics
4,07f0fc3be95dcd878966b1f9572ff670,Ludwig van Beethoven,Sonata No 23 In F Minor Op 57 Appassionata Ass...,Classique,Ludwig-van-Beethoven-Sonata-No-23-In-F-Minor-O...


In [16]:
# Persist the cleaned frame; index=False keeps the row index out of the CSV.
df.to_csv('dataframe.csv', index=False)