In [1]:
import pandas as pd
import numpy as np

In [2]:
sf = pd.read_csv('spotify_songs_dataset.csv')

In [3]:
sf.head(4)

Unnamed: 0,song_id,song_title,artist,album,genre,release_date,duration,popularity,stream,language,explicit_content,label,composer,producer,collaboration
0,SP0001,Space executive series.,Sydney Clark,What.,Electronic,1997-11-08,282.0,42,35055874,English,Yes,Def Jam,Amy Hatfield,Jeffrey Weaver,
1,SP0002,Price last painting.,Connor Peters DDS,Nature politics.,Electronic,2015-05-10,127.0,50,9249527,English,Yes,Universal Music,Jason Gregory,Kenneth White,
2,SP0003,Piece.,Anna Keith,Visit.,Pop,2024-07-08,,10,76669110,English,Yes,Universal Music,Rachel Lopez,Jason Barnes,
3,SP0004,Power industry your.,Zachary Simpson,Behavior evening.,Hip-Hop,2022-08-15,214.0,86,34732016,English,No,Sony Music,Thomas Li,Mrs. Becky Palmer,


In [4]:
sf = sf.drop('collaboration', axis=1)

In [5]:
if 'genre' in sf.columns:
    sf['genre'] = sf['genre'].fillna('Unknown')
else:
    print("Column 'genre' not found in DataFrame.")


In [6]:
if 'track_name' in sf.columns:
    sf.dropna(subset=['track_name'], inplace=True)
else:
    print("Column 'track_name' not found in DataFrame.")


Column 'track_name' not found in DataFrame.


In [7]:
# Standardize numerical fields if necessary
if 'duration_ms' in sf.columns:
    sf['duration_sec'] = sf['duration_ms'] / 1000  # Convert duration from milliseconds to seconds


In [8]:
# Create a new column for song length category
sf['length_category'] = pd.cut(sf['duration'], bins=[0, 180, 300, 600], labels=['Short', 'Medium', 'Long'])

# Create popularity tiers (e.g., Low, Medium, High)
sf['popularity_tier'] = pd.cut(sf['popularity'], bins=[0, 50, 75, 100], labels=['Low', 'Medium', 'High'])


In [9]:
# Calculate the median of the 'duration' column
md = sf['duration'].median()
# Fill NaN values in 'duration' safely using .loc[]
sf.loc[:, 'duration'] = sf['duration'].fillna(md)


In [10]:
# Fill NaN values with the mode of the 'language' column
mode_value = sf['language'].mode()[0]  # Get the mode (most frequent value)
sf['language'] = sf['language'].fillna(mode_value)  # Assign the filled values back

In [11]:
# Fill NaN values with the mode of the 'length_category' column
x = sf['length_category'].mode()[0]  # Get the mode (most frequent value)
sf['length_category'] = sf['length_category'].fillna(x)  # Assign the filled values back

In [12]:
# Convert the 'Date' column to datetime
sf['release_date'] = pd.to_datetime(sf['release_date'], format='%Y-%m-%d')

# Extract year, month (as full name), and day into separate columns
sf['release_Year'] = sf['release_date'].dt.year
sf['release_Month'] = sf['release_date'].dt.strftime('%B')  # Full month name
sf['release_Day'] = sf['release_date'].dt.day

# Drop the original Date column if not needed
sf = sf.drop('release_date', axis=1)

In [13]:
sf.head()

Unnamed: 0,song_id,song_title,artist,album,genre,duration,popularity,stream,language,explicit_content,label,composer,producer,length_category,popularity_tier,release_Year,release_Month,release_Day
0,SP0001,Space executive series.,Sydney Clark,What.,Electronic,282.0,42,35055874,English,Yes,Def Jam,Amy Hatfield,Jeffrey Weaver,Medium,Low,1997,November,8
1,SP0002,Price last painting.,Connor Peters DDS,Nature politics.,Electronic,127.0,50,9249527,English,Yes,Universal Music,Jason Gregory,Kenneth White,Short,Low,2015,May,10
2,SP0003,Piece.,Anna Keith,Visit.,Pop,240.0,10,76669110,English,Yes,Universal Music,Rachel Lopez,Jason Barnes,Medium,Low,2024,July,8
3,SP0004,Power industry your.,Zachary Simpson,Behavior evening.,Hip-Hop,214.0,86,34732016,English,No,Sony Music,Thomas Li,Mrs. Becky Palmer,Medium,High,2022,August,15
4,SP0005,Food animal second.,Christopher Mcgee,Front.,Pop,273.0,63,96649372,English,Yes,Def Jam,Adam Wagner,Beverly Baker,Medium,Medium,2023,March,5


In [14]:
# Display basic info and check for missing values
print(sf.info())
print(sf.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   song_id           50000 non-null  object  
 1   song_title        50000 non-null  object  
 2   artist            50000 non-null  object  
 3   album             50000 non-null  object  
 4   genre             50000 non-null  object  
 5   duration          50000 non-null  float64 
 6   popularity        50000 non-null  int64   
 7   stream            50000 non-null  int64   
 8   language          50000 non-null  object  
 9   explicit_content  50000 non-null  object  
 10  label             50000 non-null  object  
 11  composer          50000 non-null  object  
 12  producer          50000 non-null  object  
 13  length_category   50000 non-null  category
 14  popularity_tier   50000 non-null  category
 15  release_Year      50000 non-null  int32   
 16  release_Month     5000

In [15]:
# Save the cleaned data to a new CSV
# sf.to_csv('cleaned_spotify_data.csv', index=False)