In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import csv

In [2]:
sp_2023 = pd.read_csv("./data/spotify-2023.csv", encoding='latin1')
sp_2024 = pd.read_csv("./data/Most Streamed Spotify Songs 2024.csv", encoding='latin1')

In [3]:
sp_2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4595 non-null   object 
 3   Release Date                4600 non-null   object 
 4   ISRC                        4600 non-null   object 
 5   All Time Rank               4600 non-null   object 
 6   Track Score                 4600 non-null   float64
 7   Spotify Streams             4487 non-null   object 
 8   Spotify Playlist Count      4530 non-null   object 
 9   Spotify Playlist Reach      4528 non-null   object 
 10  Spotify Popularity          3796 non-null   float64
 11  YouTube Views               4292 non-null   object 
 12  YouTube Likes               4285 non-null   object 
 13  TikTok Posts                3427 

I can see that all number columns are objects, not floats or integers. In order to have right approach, they need to be converted into integers. Ive also see that 'TIDAL Popularity' column have all values as null.

In [4]:
sp_2024.head()

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,...,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,...,684,62.0,17598718,114.0,18004655,22931,4818457.0,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,...,3,67.0,10422430,111.0,7780028,28444,6623075.0,1118279,,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538.4,601309283,54331,211607669,...,536,136.0,36321847,172.0,5022621,5639,7208651.0,5285340,,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444.9,2031280633,269802,136569078,...,2182,264.0,24684248,210.0,190260277,203384,,11822942,,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423.3,107034922,7223,151469874,...,1,82.0,17660624,105.0,4493884,7006,207179.0,457017,,1


In [5]:
sp_2024['Track Score']=sp_2024['Track Score'].astype(int)

In [6]:
sp_2024['Spotify Streams'] = sp_2024['Spotify Streams'].str.replace(',','').astype(float)

In [7]:
columns_to_convert = [
    'Spotify Streams', 'Spotify Playlist Count', 'Spotify Playlist Reach',
    'YouTube Views', 'YouTube Likes', 'YouTube Playlist Reach',
    'TikTok Posts', 'TikTok Likes', 'TikTok Views'
]

# Convert the specified columns to integer type
sp_2024[columns_to_convert] = sp_2024[columns_to_convert].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

print(sp_2024.dtypes)

Track                          object
Album Name                     object
Artist                         object
Release Date                   object
ISRC                           object
All Time Rank                  object
Track Score                     int64
Spotify Streams                 int64
Spotify Playlist Count          int64
Spotify Playlist Reach          int64
Spotify Popularity            float64
YouTube Views                   int64
YouTube Likes                   int64
TikTok Posts                    int64
TikTok Likes                    int64
TikTok Views                    int64
YouTube Playlist Reach          int64
Apple Music Playlist Count    float64
AirPlay Spins                  object
SiriusXM Spins                 object
Deezer Playlist Count         float64
Deezer Playlist Reach          object
Amazon Playlist Count         float64
Pandora Streams                object
Pandora Track Stations         object
Soundcloud Streams             object
Shazam Count

In [8]:
sp_2024.columns

Index(['Track', 'Album Name', 'Artist', 'Release Date', 'ISRC',
       'All Time Rank', 'Track Score', 'Spotify Streams',
       'Spotify Playlist Count', 'Spotify Playlist Reach',
       'Spotify Popularity', 'YouTube Views', 'YouTube Likes', 'TikTok Posts',
       'TikTok Likes', 'TikTok Views', 'YouTube Playlist Reach',
       'Apple Music Playlist Count', 'AirPlay Spins', 'SiriusXM Spins',
       'Deezer Playlist Count', 'Deezer Playlist Reach',
       'Amazon Playlist Count', 'Pandora Streams', 'Pandora Track Stations',
       'Soundcloud Streams', 'Shazam Counts', 'TIDAL Popularity',
       'Explicit Track'],
      dtype='object')

In [9]:
#Drop columns that we won't use
columns_to_drop = [
    'Apple Music Playlist Count', 
    'AirPlay Spins', 
    'SiriusXM Spins', 
    'Deezer Playlist Count', 
    'Deezer Playlist Reach',
    'Amazon Playlist Count',
    'Pandora Streams',
    'Pandora Track Stations',
    'Soundcloud Streams',
    'Shazam Counts',
    'TIDAL Popularity'
]

# Drop the specified columns
sp_2024 = sp_2024.drop(columns=columns_to_drop)

# Verify the updated DataFrame
print(sp_2024.head())  

                        Track                    Album Name          Artist  \
0         MILLION DOLLAR BABY  Million Dollar Baby - Single   Tommy Richman   
1                 Not Like Us                   Not Like Us  Kendrick Lamar   
2  i like the way you kiss me    I like the way you kiss me         Artemas   
3                     Flowers              Flowers - Single     Miley Cyrus   
4                     Houdini                       Houdini          Eminem   

  Release Date          ISRC All Time Rank  Track Score  Spotify Streams  \
0    4/26/2024  QM24S2402528             1          725        390470936   
1     5/4/2024  USUG12400910             2          545        323703884   
2    3/19/2024  QZJ842400387             3          538        601309283   
3    1/12/2023  USSM12209777             4          444       2031280633   
4    5/31/2024  USUG12403398             5          423        107034922   

   Spotify Playlist Count  Spotify Playlist Reach  Spotify Popularit

In [10]:
sp_2024.shape

(4600, 18)

In [11]:
#dates are in US format
sp_2024['Release Date'].unique()

array(['4/26/2024', '5/4/2024', '3/19/2024', ..., '10/31/2018',
       '11/8/2016', '4/11/2017'], dtype=object)

In [12]:
df = pd.DataFrame(sp_2024)

# Convert the 'Date' column to datetime format (American MM/DD/YYYY)
df['Release Date'] = pd.to_datetime(df['Release Date'], format='%m/%d/%Y')

# Convert to European format (DD/MM/YYYY)
df['Release Date'] = df['Release Date'].dt.strftime('%d/%m/%Y')

# Display the result
print(df)

                                Track                        Album Name  \
0                 MILLION DOLLAR BABY      Million Dollar Baby - Single   
1                         Not Like Us                       Not Like Us   
2          i like the way you kiss me        I like the way you kiss me   
3                             Flowers                  Flowers - Single   
4                             Houdini                           Houdini   
...                               ...                               ...   
4595                For the Last Time                 For the Last Time   
4596                 Dil Meri Na Sune  Dil Meri Na Sune (From "Genius")   
4597            Grace (feat. 42 Dugg)                           My Turn   
4598              Nashe Si Chadh Gayi             November Top 10 Songs   
4599  Me Acostumbre (feat. Bad Bunny)   Me Acostumbre (feat. Bad Bunny)   

              Artist Release Date          ISRC All Time Rank  Track Score  \
0      Tommy Richman 

In [13]:
sp_2024['Album Name'].unique()

array(['Million Dollar Baby - Single', 'Not Like Us',
       'I like the way you kiss me', ...,
       'Dil Meri Na Sune (From "Genius")', 'November Top 10 Songs',
       'Me Acostumbre (feat. Bad Bunny)'], dtype=object)

In [14]:
sp_2024['Artist'].unique()

array(['Tommy Richman', 'Kendrick Lamar', 'Artemas', ..., 'Kerim Araz',
       'Jaques Raupï¿', 'BUSHIDO ZHO'], dtype=object)

In [15]:
print(f"Unique: {sp_2024['ISRC'].nunique()}",f"Unique: {sp_2024['ISRC'].count()}")

Unique: 4598 Unique: 4600


In [16]:
sp_2024 = sp_2024.drop_duplicates()
sp_2024.shape

(4598, 18)

In [17]:
import pandas as pd

# Convert sp_2024 to a DataFrame if it's a dictionary
if isinstance(sp_2024, dict):
    df = pd.DataFrame(sp_2024)
else:
    df = sp_2024

# Function to format names
def format_name(name):
    # Ensure the name is a string to avoid errors with NaN values
    if isinstance(name, str):
        return name.title()  # Convert to title case
    return name  # Return as-is for non-string values (e.g., NaN)

# Check the column names
print("Column names in the dataset:", df.columns)

# Replace 'Artist' with the actual column name containing the names
column_name = 'Artist'
if column_name in df.columns:
    # Apply formatting if the column exists
    df[column_name] = df[column_name].apply(format_name)
    print(f"Cleaned '{column_name}' column:")
    print(df[column_name])  # Display the cleaned column
else:
    print(f"Column '{column_name}' not found in the dataset.")


Column names in the dataset: Index(['Track', 'Album Name', 'Artist', 'Release Date', 'ISRC',
       'All Time Rank', 'Track Score', 'Spotify Streams',
       'Spotify Playlist Count', 'Spotify Playlist Reach',
       'Spotify Popularity', 'YouTube Views', 'YouTube Likes', 'TikTok Posts',
       'TikTok Likes', 'TikTok Views', 'YouTube Playlist Reach',
       'Explicit Track'],
      dtype='object')
Cleaned 'Artist' column:
0        Tommy Richman
1       Kendrick Lamar
2              Artemas
3          Miley Cyrus
4               Eminem
             ...      
4595       $Uicideboy$
4596        Atif Aslam
4597          Lil Baby
4598      Arijit Singh
4599         Arcï¿½Ï¿½
Name: Artist, Length: 4598, dtype: object


In [18]:
#Change the funny characters name into readible format
sp_2024['Artist'].replace({
    'Arcï¿½ï¿½': 'Arcangel',
    'Bad Bï¿½nny': 'Bad Bunny'
}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sp_2024['Artist'].replace({


In [19]:
sp_2024['Track'].unique()

array(['MILLION DOLLAR BABY', 'Not Like Us', 'i like the way you kiss me',
       ..., 'Grace (feat. 42 Dugg)', 'Nashe Si Chadh Gayi',
       'Me Acostumbre (feat. Bad Bunny)'], dtype=object)

In [20]:
sp_2024['Track Score'].value_counts()

Track Score
21     327
20     285
22     267
23     240
24     211
      ... 
119      1
132      1
145      1
146      1
147      1
Name: count, Length: 205, dtype: int64

In [21]:
sp_2024['All Time Rank'].value_counts()

All Time Rank
355      2
1,103    2
1,807    2
2,192    2
626      2
        ..
4,585    1
4,575    1
4,571    1
4,591    1
4,553    1
Name: count, Length: 4577, dtype: int64

In [22]:
sp_2024['All Time Rank'].unique()

array(['1', '2', '3', ..., '4,571', '4,591', '4,593'], dtype=object)

In [23]:
sp_2024.sort_values('All Time Rank')

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725,390470936,0,0,92.0,0,0,0,0,0,0,0
1010,LEATHER,LEATHER,Makaela,1/27/2023,QZJG52355895,1000,47,1696,0,0,,0,0,0,0,0,0,1
1006,Lean Wit Me,Lean Wit Me,Juice Wrld,5/22/2018,USUG11800945,1001,47,880781313,0,0,72.0,0,0,0,0,0,0,1
1007,HOTEL LOBBY (Unc & Phew),HOTEL LOBBY (Unc and Phew),Quavo,5/20/2022,USUG12202512,1002,47,399707517,0,0,69.0,0,0,0,0,0,0,1
1011,Panini,Panini,Lil Nas X,6/20/2019,USSM11903948,1003,47,741766886,0,0,62.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,PICK UP THE PHONE (feat. Nate Dogg),PICK UP THE PHONE (feat. Nate Dogg),Pawsa,5/31/2024,GBCEN2400112,995,48,2284977,0,0,22.0,0,0,0,0,0,0,1
1001,Tomorrow 2 (with Cardi B),Tomorrow 2 (with Cardi B),Glorilla,9/23/2022,USUM72217666,996,48,157806193,0,0,64.0,0,0,0,0,0,0,1
1002,Se Preparï¿,Se Preparï¿,Ozuna,8/10/2017,USXDR1700552,997,48,846909112,0,0,73.0,0,0,0,0,0,0,0
1003,Lost on You,Lost On You,Lp,3/24/2016,QMRSZ1501406,998,48,837042727,0,0,70.0,0,0,0,0,0,0,0


In [24]:
sp_2024.sort_values('Track Score')

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Explicit Track
4599,Me Acostumbre (feat. Bad Bunny),Me Acostumbre (feat. Bad Bunny),Arcï¿½Ï¿½,4/11/2017,USB271700107,4593,19,255740653,0,0,64.0,0,0,0,0,0,0,1
4563,Pa Mi,Pa Mi,Dalex,9/20/2018,QM9WM1800095,4563,19,50739695,0,0,34.0,0,0,0,0,0,0,1
4564,Tera Fitoor,"Tera Fitoor (From ""Genius"")",Arijit Singh,7/12/2018,INT101800121,4545,19,108748134,0,0,68.0,0,0,114,0,0,0,0
4533,ARRANCARMELO,ARRANCARMELO,Wos,4/6/2022,UYB282206048,4504,19,223426639,0,0,63.0,0,0,0,0,0,0,0
4534,"Mast Malang Jhoom (From ""Bade Miyan Chote Miya...","Mast Malang Jhoom (From ""Bade Miyan Chote Miya...",Vishal Mishra,3/1/2024,INZ031414619,4506,19,0,0,0,,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423,107034922,0,0,88.0,0,0,0,0,0,0,1
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444,2031280633,0,0,85.0,0,0,0,0,0,0,0
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538,601309283,0,0,92.0,0,0,0,0,0,0,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545,323703884,0,0,92.0,0,0,0,0,0,0,1


In [25]:
sp_2024['Explicit Track'].value_counts()

Explicit Track
0    2947
1    1651
Name: count, dtype: int64

In [26]:
# Check NaN values in each column.
for column in sp_2024.columns:
    print(column, sp_2024[column].isnull().sum())

Track 0
Album Name 0
Artist 5
Release Date 0
ISRC 0
All Time Rank 0
Track Score 0
Spotify Streams 0
Spotify Playlist Count 0
Spotify Playlist Reach 0
Spotify Popularity 804
YouTube Views 0
YouTube Likes 0
TikTok Posts 0
TikTok Likes 0
TikTok Views 0
YouTube Playlist Reach 0
Explicit Track 0


In [27]:
# Drop a row if ALL columns in a row are NaN
sp_2024.dropna(how="all")

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725,390470936,0,0,92.0,0,0,0,0,0,0,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545,323703884,0,0,92.0,0,0,0,0,0,0,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538,601309283,0,0,92.0,0,0,0,0,0,0,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444,2031280633,0,0,85.0,0,0,0,0,0,0,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423,107034922,0,0,88.0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,For the Last Time,For the Last Time,$Uicideboy$,9/5/2017,QM8DG1703420,4585,19,305049963,0,0,71.0,0,0,0,0,0,0,1
4596,Dil Meri Na Sune,"Dil Meri Na Sune (From ""Genius"")",Atif Aslam,7/27/2018,INT101800122,4575,19,52282360,0,0,56.0,0,0,0,0,0,0,0
4597,Grace (feat. 42 Dugg),My Turn,Lil Baby,2/28/2020,USUG12000043,4571,19,189972685,0,0,65.0,0,0,0,0,0,0,1
4598,Nashe Si Chadh Gayi,November Top 10 Songs,Arijit Singh,11/8/2016,INY091600067,4591,19,145467020,0,0,66.0,0,0,0,0,0,0,0


In [28]:
sp_2024.drop_duplicates()

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725,390470936,0,0,92.0,0,0,0,0,0,0,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545,323703884,0,0,92.0,0,0,0,0,0,0,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538,601309283,0,0,92.0,0,0,0,0,0,0,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444,2031280633,0,0,85.0,0,0,0,0,0,0,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423,107034922,0,0,88.0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,For the Last Time,For the Last Time,$Uicideboy$,9/5/2017,QM8DG1703420,4585,19,305049963,0,0,71.0,0,0,0,0,0,0,1
4596,Dil Meri Na Sune,"Dil Meri Na Sune (From ""Genius"")",Atif Aslam,7/27/2018,INT101800122,4575,19,52282360,0,0,56.0,0,0,0,0,0,0,0
4597,Grace (feat. 42 Dugg),My Turn,Lil Baby,2/28/2020,USUG12000043,4571,19,189972685,0,0,65.0,0,0,0,0,0,0,1
4598,Nashe Si Chadh Gayi,November Top 10 Songs,Arijit Singh,11/8/2016,INY091600067,4591,19,145467020,0,0,66.0,0,0,0,0,0,0,0


In [35]:
#1. To check if data was correctly cleaned:
#check head and tails to make sure that the data is correct
#sample method-> check random samples of the dataset to check if there are problems:
#methods:
#data.sample(n=XXX) --> n can be equal to the size you want
#data.sample(frac=0.1) --> fraction of original data frame
#run it 5 times to check that your data is correctly cleaned

sp_2024.sample(n=10)


Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Explicit Track
3287,Die For You (Remix Acapella),Die For You (Remix Acapella) - Single,The Weeknd,3/2/2023,USUG12301604,3281,24,0,0,0,,0,0,0,0,0,0,0
1733,ýýýýýýýýýýýýýýýýýý ýýýýýý ýýýýýý ýýýýýýýýýýýýý...,ýýýýýýýýýýýýýýýýýý ýýýýýý ýýýýýý ýýýýýýýýýýýýý...,Md Faisal Ahmed,7/9/2023,QZLQR2199831,1731,35,2274,0,0,,0,0,0,0,0,0,0
271,Moscow Mule,Un Verano Sin Ti,Bad Bunny,5/6/2022,QM6MZ2214875,272,94,1108031580,0,0,77.0,0,0,0,0,0,0,1
4147,No Switch,Colors (Deluxe),Youngboy Never Broke Again,1/21/2022,USAT22200200,4119,20,53849840,0,0,60.0,0,0,0,0,0,0,1
2381,Lisa,Lisa,Young Miko,3/3/2023,QZXD92300001,2365,29,209988103,0,0,67.0,0,0,0,0,0,0,1
2557,Talk talk,BRAT,Charli Xcx,6/7/2024,USAT22401345,2540,28,6446985,830,0,,0,0,0,0,0,0,0
3395,FourFiveSeconds,FourFiveSeconds,Rihanna,1/24/2015,USJMT1500001,3374,23,1134470968,0,0,72.0,0,0,0,0,0,0,0
2275,If I Didn't Love You,If I Didn't Love You,Jason Aldean,7/23/2021,QMRSZ2101546,2268,30,143447350,0,0,60.0,0,0,0,0,0,0,0
195,Whatever,Whatever,Kygo,1/19/2024,USRC12302637,196,109,222354971,0,0,83.0,0,0,0,0,0,0,0
77,Sprinter,Sprinter - Single,Dave,6/1/2023,GBUM72305159,78,168,692512930,0,0,78.0,0,0,0,0,0,0,1


In [36]:
sp_2024.sample(frac=0.1)

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Explicit Track
3771,Los Botones Azules,Mi Vida En Un Cigarro 2,Junior H,2/11/2022,USWL12200130,3758,22,409608018,0,0,70.0,0,0,0,0,0,0,0
1034,Arcade,Arcade,Duncan Laurence,3/7/2019,NL1TK1900001,1029,47,1184536529,0,0,69.0,0,0,0,0,0,0,0
4199,Soltera - Remix,Soltera (Remix),Lunay,5/10/2019,USA2P1927909,4180,20,960753774,0,0,67.0,0,0,0,0,0,0,0
369,Where Are You Now,Where Are You Now,Lost Frequencies,7/30/2021,BEHP42100067,370,79,1223329465,0,0,74.0,0,0,0,0,0,0,0
950,POP!,IM NAYEON,Nayeon,6/24/2022,US5TA2200041,946,49,296828933,0,0,69.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,Holy (feat. Chance The Rapper),Holy,Justin Bieber,9/18/2020,USUM72017013,950,49,787546548,0,0,63.0,0,0,0,0,0,0,0
3538,"Hey Mama (feat. Nicki Minaj, Bebe Rexha & Afro...","Hey Mama (feat. Nicki Minaj, Bebe Rexha & Afro...",Boaz Van De Beatz,6/5/2015,GB28K1500038,3506,23,24407534,0,0,29.0,0,0,0,0,0,0,0
3504,Mi Morenita,Mi Morenita - Single,Grupo Marca Registrada,5/24/2024,USA2P2432334,3494,23,8104357,456,0,68.0,0,0,0,0,0,0,0
4290,Lose It,Lose It,Through Fire,2/14/2023,USYFZ2327704,4273,20,301362,311,0,18.0,0,0,16,0,0,0,0


2.Missing data:
delete if there is no artist, no matter if you have the name of the song. do not replace it manually
do not change special characters, leave it as it is
every change that is done manually, is not worth it.


3. How to know if the variables selected are correct, and not just make it look like a random selection:
you can use a correlation matrix with variables with numbers, be careful bc if numbers mean a categorical answer, then it is going to be a mess up
Use pearson correlation to spotify streams, youtube like, tiktok likes
Use pearson rank correlation and compare spotify streams with all time rank
all time rank vs youtube likes, and all time rank with tiktok likes (3 pearson rank correlations per separate)
this is only recommendedif the data has a normal distribution, in case not, it might fail.

In [29]:
sp_2024.to_csv('cleaned_data_ke.csv', index=False)