<h2>Data Preprocessing - Most Streamed Spotify Songs 2024</h2>

<h3>Importing libraries</h3>

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

<h3>Loading dataset</h3>

In [2]:
# Load the raw dataset; the file is decoded as ISO-8859-1 instead of
# pandas' default UTF-8.
df = pd.read_csv(
    "./data/spotify_data.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Overview of the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4595 non-null   object 
 3   Release Date                4600 non-null   object 
 4   ISRC                        4600 non-null   object 
 5   All Time Rank               4600 non-null   object 
 6   Track Score                 4600 non-null   float64
 7   Spotify Streams             4487 non-null   object 
 8   Spotify Playlist Count      4530 non-null   object 
 9   Spotify Playlist Reach      4528 non-null   object 
 10  Spotify Popularity          3796 non-null   float64
 11  YouTube Views               4292 non-null   object 
 12  YouTube Likes               4285 non-null   object 
 13  TikTok Posts                3427 

In [4]:
# Remove three columns from the raw frame before the dtype-based split below.
columns_to_drop = ["TIDAL Popularity", "SiriusXM Spins", "Soundcloud Streams"]
df = df.drop(columns=columns_to_drop)

<h3>Handle Continuous Columns</h3>

In [5]:
# Take an independent copy of the float64 columns. The following cells add and
# overwrite columns on df_continuous; doing that on a bare selection of df can
# raise SettingWithCopyWarning (or share data with df), depending on the
# pandas version.
df_continuous = df.select_dtypes(include=["float64"]).copy()

In [6]:
# Inspect the float64 subset; the non-null counts show which columns contain NaNs.
df_continuous.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track Score                 4600 non-null   float64
 1   Spotify Popularity          3796 non-null   float64
 2   Apple Music Playlist Count  4039 non-null   float64
 3   Deezer Playlist Count       3679 non-null   float64
 4   Amazon Playlist Count       3545 non-null   float64
dtypes: float64(5)
memory usage: 179.8 KB


<h4>Taking care of NaNs (Missing-Value Indicator Columns & Mean Imputation)</h4>

In [7]:
# float64 columns that contain missing values.
COLS = [
    "Spotify Popularity",
    "Apple Music Playlist Count",
    "Deezer Playlist Count",
    "Amazon Playlist Count",
]

# For each of them, add a companion "<column> is NaN" column holding 1 where
# the value is missing and 0 otherwise, so that information is kept once the
# gaps are filled in.
for feature in COLS:
    is_missing = df_continuous[feature].isna()
    df_continuous[f"{feature} is NaN"] = is_missing.astype(int)

In [8]:
from sklearn.impute import SimpleImputer

# Fill the missing values of every column in COLS with that column's mean.
mean_imputer = SimpleImputer(strategy="mean")
df_continuous[COLS] = mean_imputer.fit_transform(df_continuous[COLS])

In [9]:
# Re-inspect after imputation: non-null counts and the added "is NaN" columns.
df_continuous.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 9 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Track Score                        4600 non-null   float64
 1   Spotify Popularity                 4600 non-null   float64
 2   Apple Music Playlist Count         4600 non-null   float64
 3   Deezer Playlist Count              4600 non-null   float64
 4   Amazon Playlist Count              4600 non-null   float64
 5   Spotify Popularity is NaN          4600 non-null   int32  
 6   Apple Music Playlist Count is NaN  4600 non-null   int32  
 7   Deezer Playlist Count is NaN       4600 non-null   int32  
 8   Amazon Playlist Count is NaN       4600 non-null   int32  
dtypes: float64(5), int32(4)
memory usage: 251.7 KB


<h3>Handle Categorical Columns</h3>

In [10]:
# Take an independent copy of the object-dtype columns. The following cells
# add, convert and drop columns on df_categorical; doing that on a bare
# selection of df can raise SettingWithCopyWarning (or share data with df),
# depending on the pandas version.
df_categorical = df.select_dtypes(include="object").copy()

In [11]:
# Inspect the object-dtype subset: column names and non-null counts.
df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Track                   4600 non-null   object
 1   Album Name              4600 non-null   object
 2   Artist                  4595 non-null   object
 3   Release Date            4600 non-null   object
 4   ISRC                    4600 non-null   object
 5   All Time Rank           4600 non-null   object
 6   Spotify Streams         4487 non-null   object
 7   Spotify Playlist Count  4530 non-null   object
 8   Spotify Playlist Reach  4528 non-null   object
 9   YouTube Views           4292 non-null   object
 10  YouTube Likes           4285 non-null   object
 11  TikTok Posts            3427 non-null   object
 12  TikTok Likes            3620 non-null   object
 13  TikTok Views            3619 non-null   object
 14  YouTube Playlist Reach  3591 non-null   object
 15  AirP

<h4>Transforming string numbers into float64 or int64</h4>

In [12]:
# Columns stored as strings with "," separators that are converted to float64.
COLS = [
    'Spotify Streams',
    'Spotify Playlist Count',
    'Spotify Playlist Reach',
    'YouTube Views',
    'YouTube Likes',
    'TikTok Posts',
    'TikTok Likes',
    'TikTok Views',
    'YouTube Playlist Reach',
    'AirPlay Spins',
    'Deezer Playlist Reach',
    'Pandora Streams',
    'Pandora Track Stations',
    'Shazam Counts',
]

for column in COLS:
    raw_values = df_categorical[column]
    # Record which rows were missing before the column is converted.
    df_categorical[f"{column} is NaN"] = raw_values.isna().astype(int)
    # Strip the "," separators, then parse the remaining text as float64.
    df_categorical[column] = raw_values.str.replace(",", "").astype("float64")

In [13]:
# "All Time Rank" has no missing values, so it is parsed straight to int64
# once the "," separators are removed.
rank_text = df_categorical["All Time Rank"].str.replace(",", "")
df_categorical["All Time Rank"] = rank_text.astype("int64")

<h4>Taking care of NaNs</h4>

In [14]:
from sklearn.impute import SimpleImputer

# Fill the missing values of the converted numeric columns with each column's
# mean. COLS must still be the list defined in the conversion cell above.
mean_imputer = SimpleImputer(strategy="mean")
df_categorical[COLS] = mean_imputer.fit_transform(df_categorical[COLS])

In [15]:
# Columns that are still raw strings (they were not converted to numbers
# above); they are removed from the frame rather than encoded.
# A dedicated name is used instead of rebinding COLS, so COLS keeps pointing at
# the numeric columns and the imputation cell above can still be re-run.
TEXT_COLS = ["Track", "Album Name", "Artist", "Release Date", "ISRC"]

# Reassign instead of inplace=True so the cell yields a new frame from its input.
df_categorical = df_categorical.drop(columns=TEXT_COLS)

In [16]:
# Final check of the cleaned frame: dtypes and non-null counts.
df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   All Time Rank                  4600 non-null   int64  
 1   Spotify Streams                4600 non-null   float64
 2   Spotify Playlist Count         4600 non-null   float64
 3   Spotify Playlist Reach         4600 non-null   float64
 4   YouTube Views                  4600 non-null   float64
 5   YouTube Likes                  4600 non-null   float64
 6   TikTok Posts                   4600 non-null   float64
 7   TikTok Likes                   4600 non-null   float64
 8   TikTok Views                   4600 non-null   float64
 9   YouTube Playlist Reach         4600 non-null   float64
 10  AirPlay Spins                  4600 non-null   float64
 11  Deezer Playlist Reach          4600 non-null   float64
 12  Pandora Streams                4600 non-null   f

<h3>Saving the data</h3>

In [17]:
# Put the two cleaned subsets side by side (column-wise, aligned on the index).
# NOTE(review): per the info() outputs, df has 26 columns after the drop but
# only 25 (5 float64 + 20 object) are selected into the two subsets, so one
# column of another dtype never reaches df_spotify; confirm this is intended.
cleaned_parts = [df_categorical, df_continuous]
df_spotify = pd.concat(cleaned_parts, axis="columns")

In [18]:
# Persist the preprocessed frame; index=False leaves the row index out of the file.
df_spotify.to_csv(
    "./data/train_spotify_data.csv",
    index=False,
)