In [68]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn

In [69]:
dataset_path = "dataset/dataset-spotify-2023.csv"
data = pd.read_csv(dataset_path, encoding="latin-1")
columns = ["danceability_%", "valence_%", "energy_%", "acousticness_%", "instrumentalness_%",
"liveness_%", "speechiness_%"]
data = data.rename(columns={column: column.replace("_%", "") for column in columns})
key_None_count = data["key"].isna().sum()
in_shazam_charts_None_count = data["in_shazam_charts"].isna().sum()

print("`key` None count: ", key_None_count)
print("`in_shazam_charts` None count: ", in_shazam_charts_None_count)

# Replace NaN values with Unspecified, it may be useful later on
data = data.replace(np.nan, "Unavailable")

# Data is malformed, need to remove comma `,`
data["in_deezer_playlists"] = data["in_deezer_playlists"].replace(",", "", regex=True)
data["in_shazam_charts"] = data["in_deezer_playlists"].replace(",", "", regex=True)

# Convert columns to int64
# streams, in_deezer_playlists, in_shazam_charts
data["in_deezer_playlists"] = data["in_deezer_playlists"].astype(int)
data["in_shazam_charts"] = data["in_shazam_charts"].astype(int)

# Streams overflowed with int, so use np.int64 to fit the whole numbers
data["streams"] = data["streams"].astype(np.int64)

# Wee see that `streams` is very large compared to to other data, next larger is `in_spotify_playlists`
# Add extra column with log value of streams
# data["streams_log"] = np.log2(data["streams"])

`key` None count:  95
`in_shazam_charts` None count:  50


In [70]:
# One-hot encoding values
data = pd.get_dummies(data, columns=["key", "mode"], prefix=["key", "mode"])
data = data.applymap(lambda x: int(x) if isinstance(x, bool) else x)

In [71]:
# Select numeric columns
data_numeric = data.select_dtypes(exclude="object")
data_numeric.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   artist_count          952 non-null    int64
 1   released_year         952 non-null    int64
 2   released_month        952 non-null    int64
 3   released_day          952 non-null    int64
 4   in_spotify_playlists  952 non-null    int64
 5   in_spotify_charts     952 non-null    int64
 6   streams               952 non-null    int64
 7   in_apple_playlists    952 non-null    int64
 8   in_apple_charts       952 non-null    int64
 9   in_deezer_playlists   952 non-null    int64
 10  in_deezer_charts      952 non-null    int64
 11  in_shazam_charts      952 non-null    int64
 12  bpm                   952 non-null    int64
 13  danceability          952 non-null    int64
 14  valence               952 non-null    int64
 15  energy                952 non-null    int64
 16  acoustic

In [72]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_normalized = scaler.fit_transform(data_numeric)
data_numeric = pd.DataFrame(data_normalized, columns=data_numeric.columns)

In [73]:
# Regression
# Target value is `streams`
X = torch.tensor(data_numeric.drop("streams", axis=1).values, dtype=torch.float32)
Y = torch.tensor(data_numeric["streams"].values, dtype=torch.float32).view(-1, 1)

In [74]:

class LinearRegression(nn.Module):
    def __init__(self, input_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear(x)

input_size = X.shape[1] 
model = LinearRegression(input_size=input_size)

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
num_epochs = 1000
max_gradient_norm = 5.0

for epoch in range(num_epochs):
    outputs = model(X)
    loss = criterion(outputs, Y)

    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

predicted = model(X).detach().numpy()

Epoch [100/1000], Loss: 0.2975
Epoch [200/1000], Loss: 0.2689
Epoch [300/1000], Loss: 0.2627
Epoch [400/1000], Loss: 0.2610
Epoch [500/1000], Loss: 0.2604
Epoch [600/1000], Loss: 0.2602
Epoch [700/1000], Loss: 0.2601
Epoch [800/1000], Loss: 0.2601
Epoch [900/1000], Loss: 0.2601
Epoch [1000/1000], Loss: 0.2601


In [75]:
# Classification

# Classify songs based on chart performance, `hit` songs on specific threshold
# Let's define the `hit` threshold
# Percentile analysis
# You could define a hit as songs that fall within the top 10% of stream counts in your dataset.
threshold = np.percentile(data["streams"], 80)
data_numeric["hit"] = (data_numeric["streams"] >= threshold).astype(int)

# Note:
# Lyrical Content (this requires scrapping)

In [76]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [77]:
# No this doesn't make sense

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = RandomForestClassifier(random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(classifier, X_train, y_train, cv=cv, scoring="accuracy")

print("Cross-Validation Scores:", cv_scores)
print(f"Average Cross-Validation Accuracy: {cv_scores.mean():.2f}")

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"Test Set Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Cross-Validation Scores: [1. 1. 1. 1. 1.]
Average Cross-Validation Accuracy: 1.00
Test Set Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       191

    accuracy                           1.00       191
   macro avg       1.00      1.00      1.00       191
weighted avg       1.00      1.00      1.00       191

