In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:

# Edit options
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)

In [None]:
# Path of the Spotify 2023 CSV file.
dataset_path = "dataset/dataset-spotify-2023.csv"

# The file is decoded as latin-1 (presumably it is not valid UTF-8 - TODO confirm).
data = pd.read_csv(filepath_or_buffer=dataset_path, encoding="latin-1")

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Summarise every column (dtype and non-null count) to spot missing values.
data.info(verbose=True, show_counts=True)

In [None]:
# Audio-feature columns whose names carry a "_%" suffix in the raw data.
columns = [
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%",
]

# Drop the "_%" marker from the selected columns only; every other column
# keeps its name.
data = data.rename(
    columns=lambda name: name.replace("_%", "") if name in columns else name
)

In [None]:
# Count the missing values in `key` before filling them in
# (the original note reports 95 for this dataset).
key_None_count = data["key"].isna().sum()
print("Key None count: ", key_None_count)

# Label missing keys as "Unspecified"; it may be useful later on.
# Only `key` is filled: replacing NaN across the whole frame would also write
# this string into any numeric column that has gaps and turn that column into text.
data["key"] = data["key"].fillna("Unspecified")

In [None]:
# Some counters are stored as text containing "," characters, so the commas
# must be removed before the values can be converted to integers.

# `in_deezer_playlists`: strip the commas and convert to a 64-bit integer.
# np.int64 is used explicitly so the dtype does not depend on the platform's
# default `int` width.
data["in_deezer_playlists"] = (
    data["in_deezer_playlists"]
    .replace(",", "", regex=True)
    .astype(np.int64)
)

# `in_shazam_charts`: clean the column from ITS OWN values (it must not be
# derived from `in_deezer_playlists`). Any entry that is not a number (a missing
# value or a placeholder string) becomes <NA> instead of aborting the
# conversion; the nullable "Int64" dtype keeps the column integer-typed.
data["in_shazam_charts"] = pd.to_numeric(
    data["in_shazam_charts"].replace(",", "", regex=True),
    errors="coerce",
).astype("Int64")

# `streams` overflowed with a plain int, so np.int64 is used to fit the whole numbers.
# Known issue with the raw data: one record (576) contained
# "BPM110KeyAModeMajorDanceability53Valence75Energy69Acousticness7Instrumentalness0Liveness17Speechiness3"
# instead of a number. It was noise and was deleted manually from the dataset.
data["streams"] = data["streams"].astype(np.int64)

# We see that `streams` is very large compared to the other data (the next
# largest is `in_spotify_playlists`), so add an extra column with the base-2
# logarithm of the stream count.
data["streams_log"] = np.log2(data["streams"])

In [None]:
# Summary statistics, one row per column, with every value rendered through the
# general ("g") format so no scientific-notation float reprs are shown.
(
    data.describe()
    .apply(lambda stats: stats.map(lambda value: format(value, "g")))
    .transpose()
)

In [None]:
# Select numeric columns: keep every column whose dtype is not `object`.
data_numeric = data.select_dtypes(exclude="object")

## Characterizing data columns

* track_name    (discrete, nominal) 
* artist(s)_name    (discrete, nominal) 
* artist_count  (discrete, ratio) 
* released_year (discrete, ordinal) 
* released_month    (discrete, ordinal) 
* released_day  (discrete, ordinal) 
* in_spotify_playlists  (discrete, ratio) 
* in_spotify_charts (discrete, ratio) 
* streams   (discrete, ratio) 
* in_apple_playlists    (discrete, ratio)
* in_apple_charts   (discrete, ratio) 
* in_deezer_playlists   (discrete, ratio) 
* in_deezer_charts  (discrete, ratio)
* in_shazam_charts  (discrete, ratio) 
* bpm   (continuous, ratio) 
* key   (discrete, nominal) 
* mode  (discrete, nominal) 
* danceability_%    (continuous, ratio) 
* valence_% (continuous, ratio) 
* energy_%  (continuous, ratio) 
* acousticness_%    (continuous, ratio) 
* instrumentalness_%    (continuous, ratio) 
* liveness_%    (continuous, ratio) 
* speechiness_% (continuous, ratio) 

## Useful code from Toolbox

The Toolbox already provides several implemented functions that we can use for data analysis:

* categoric2numeric.py : one-hot encoding
* similarity.py : similarity matrices
* statistics.py : chi-squared tests and other
* ex1_5_4.py : regression problem plot
* ex2_1_2.py : scatterplot
* ex2_1_3.py : PCAs
* ...


In [None]:
# Data Plotting
## Bar chart
## Histogram
## Violin Plots
## Pie charts
## Scatter plot (e.g. danceability_% vs. energy_%)
## Correlation Heatmap
## PCAs

In [None]:
# Correlation heatmap of the audio-feature percentages.
# These columns were renamed earlier in the notebook (the "_%" suffix was
# stripped), so they must be referenced by their new names; the "..._%" labels
# no longer exist in `data` and selecting them would raise a KeyError.
columns = ["danceability", "valence", "energy", "acousticness", "instrumentalness",
           "liveness", "speechiness"]
correlation_matrix = data[columns].corr()

# Draw on an explicit Axes and use seaborn's documented `linewidths` keyword
# for the lines separating the cells.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap for _% attributes")
plt.show()

In [None]:
# Violin plot: distribution of the `bpm` column, drawn vertically.
fig, ax = plt.subplots(figsize=(6, 6))
sns.violinplot(data=data, y="bpm", orient="v", ax=ax)
ax.set_title("Violin Plot of bpm")
plt.show()