In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.linear_model import LinearRegression

In [3]:
dataset_path = "dataset/dataset-spotify-2023.csv"
data = pd.read_csv(dataset_path, encoding="latin-1")
columns = ["danceability_%", "valence_%", "energy_%", "acousticness_%", "instrumentalness_%",
"liveness_%", "speechiness_%"]
data = data.rename(columns={column: column.replace("_%", "") for column in columns})
key_None_count = data["key"].isna().sum()
in_shazam_charts_None_count = data["in_shazam_charts"].isna().sum()

print("`key` None count: ", key_None_count)
print("`in_shazam_charts` None count: ", in_shazam_charts_None_count)

# Replace NaN values with Unspecified, it may be useful later on
data = data.replace(np.nan, "Unavailable")

# Data is malformed, need to remove comma `,`
data["in_deezer_playlists"] = data["in_deezer_playlists"].replace(",", "", regex=True)
data["in_shazam_charts"] = data["in_deezer_playlists"].replace(",", "", regex=True)

# Convert columns to int64
# streams, in_deezer_playlists, in_shazam_charts
data["in_deezer_playlists"] = data["in_deezer_playlists"].astype(int)
data["in_shazam_charts"] = data["in_shazam_charts"].astype(int)

# Streams overflowed with int, so use np.int64 to fit the whole numbers
data["streams"] = data["streams"].astype(np.int64)

# Wee see that `streams` is very large compared to to other data, next larger is `in_spotify_playlists`
# Add extra column with log value of streams
# data["streams_log"] = np.log2(data["streams"])

`key` None count:  95
`in_shazam_charts` None count:  50


In [4]:
# One-hot encoding values
data = pd.get_dummies(data, columns=["key", "mode"], prefix=["key", "mode"])
data = data.applymap(lambda x: int(x) if isinstance(x, bool) else x)

In [5]:
# Select numeric columns
data_numeric = data.select_dtypes(exclude="object")
data_numeric.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   artist_count          952 non-null    int64
 1   released_year         952 non-null    int64
 2   released_month        952 non-null    int64
 3   released_day          952 non-null    int64
 4   in_spotify_playlists  952 non-null    int64
 5   in_spotify_charts     952 non-null    int64
 6   streams               952 non-null    int64
 7   in_apple_playlists    952 non-null    int64
 8   in_apple_charts       952 non-null    int64
 9   in_deezer_playlists   952 non-null    int64
 10  in_deezer_charts      952 non-null    int64
 11  in_shazam_charts      952 non-null    int64
 12  bpm                   952 non-null    int64
 13  danceability          952 non-null    int64
 14  valence               952 non-null    int64
 15  energy                952 non-null    int64
 16  acoustic

In [6]:

scaler = StandardScaler()
data_normalized = scaler.fit_transform(data_numeric)
data_numeric = pd.DataFrame(data_normalized, columns=data_numeric.columns)

In [7]:
# Regression
# Target value is `streams`
X = torch.tensor(data_numeric.drop("streams", axis=1).values, dtype=torch.float32)
Y = torch.tensor(data_numeric["streams"].values, dtype=torch.float32).view(-1, 1)

In [8]:
list(data_numeric.head())

['artist_count',
 'released_year',
 'released_month',
 'released_day',
 'in_spotify_playlists',
 'in_spotify_charts',
 'streams',
 'in_apple_playlists',
 'in_apple_charts',
 'in_deezer_playlists',
 'in_deezer_charts',
 'in_shazam_charts',
 'bpm',
 'danceability',
 'valence',
 'energy',
 'acousticness',
 'instrumentalness',
 'liveness',
 'speechiness',
 'key_A',
 'key_A#',
 'key_B',
 'key_C#',
 'key_D',
 'key_D#',
 'key_E',
 'key_F',
 'key_F#',
 'key_G',
 'key_G#',
 'key_Unavailable',
 'mode_Major',
 'mode_Minor']

In [9]:


# Selecting features (X) and target variable (y)
composition_features = ['bpm', 'mode_Major', 'mode_Minor', 'key_A',
 'key_A#',
 'key_B',
 'key_C#',
 'key_D',
 'key_D#',
 'key_E',
 'key_F',
 'key_F#',
 'key_G',
 'key_G#']
musical_features = ['danceability',
 'valence',
 'energy',
 'acousticness',
 'instrumentalness',
 'liveness',
 'speechiness']
m_X = torch.tensor(
    data_numeric[musical_features].values, dtype=torch.float32)
c_X = torch.tensor(
    data_numeric[composition_features].values, dtype=torch.float32)
Y = torch.tensor(data_numeric["streams"].values,
                 dtype=torch.float32).view(-1, 1)

# Splitting the dataset into training and testing sets
m_X_train, m_X_test, m_y_train, m_y_test = train_test_split(m_X, Y, test_size=0.2, random_state=42)
c_X_train, c_X_test, c_y_train, c_y_test = train_test_split(
    c_X, Y, test_size=0.2, random_state=42)

# Creating a linear regression model
composition_model = LinearRegression()
musical_model = LinearRegression()

# Training the model
musical_model.fit(m_X_train, m_y_train)
composition_model.fit(c_X_train, c_y_train)

# Making predictions on the test set
m_y_pred = musical_model.predict(m_X_test)
c_y_pred = composition_model.predict(c_X_test)

# Evaluating the model
m_mse = mean_squared_error(m_y_test, m_y_pred)
print(f'Musical Mean Squared Error: {m_mse}')
c_mse = mean_squared_error(c_y_test, c_y_pred)
print(f'Composition Mean Squared Error: {c_mse}')


#coefficients and intercept
print('Musical Features Model')
print('Coefficients:', musical_model.coef_)
print('Intercept:', musical_model.intercept_)
print('Composition Features Model')
print('Coefficients:', composition_model.coef_)
print('Intercept:', composition_model.intercept_)

Musical Mean Squared Error: 0.771190881729126
Musical Mean Squared Error: 0.7707972526550293
Coefficients: [[-0.10132175  0.03943747 -0.02285512 -0.04396841 -0.05731165 -0.08007196
  -0.12435336]]
Intercept: [0.01531061]
Coefficients: [[ 0.0047127   0.01319228 -0.01319228 -0.05152893  0.0167727  -0.02853826
   0.04398692  0.01416882  0.01422358 -0.01544834 -0.02877389 -0.01786382
  -0.03688407 -0.02123293]]
Intercept: [0.01893817]


In [10]:

class LinearRegression(nn.Module):
    def __init__(self, input_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear(x)

input_size = X.shape[1] 
model = LinearRegression(input_size=input_size)

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
num_epochs = 1000
max_gradient_norm = 5.0

for epoch in range(num_epochs):
    outputs = model(X)
    loss = criterion(outputs, Y)

    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

predicted = model(X).detach().numpy()

Epoch [100/1000], Loss: 0.3143
Epoch [200/1000], Loss: 0.2787
Epoch [300/1000], Loss: 0.2688
Epoch [400/1000], Loss: 0.2648
Epoch [500/1000], Loss: 0.2629
Epoch [600/1000], Loss: 0.2618
Epoch [700/1000], Loss: 0.2611
Epoch [800/1000], Loss: 0.2607
Epoch [900/1000], Loss: 0.2605
Epoch [1000/1000], Loss: 0.2603


In [11]:
# Classification

# Classify songs based on chart performance, `hit` songs on specific threshold
# Let's define the `hit` threshold
# Percentile analysis
# You could define a hit as songs that fall within the top 10% of stream counts in your dataset.
threshold = np.percentile(data["streams"], 80)
data_numeric["hit"] = (data_numeric["streams"] >= threshold).astype(int)

# Note:
# Lyrical Content (this requires scrapping)

In [12]:
# No this doesn't make sense

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = RandomForestClassifier(random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(classifier, X_train, y_train, cv=cv, scoring="accuracy")

print("Cross-Validation Scores:", cv_scores)
print(f"Average Cross-Validation Accuracy: {cv_scores.mean():.2f}")

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"Test Set Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

NameError: name 'y' is not defined