In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
from torch_geometric.data import HeteroData

In [None]:
# Pick the compute device: CUDA when PyTorch reports one available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Read the dataset from "music.csv" (path is relative to the working directory)
# and preview the first rows.
music = pd.read_csv("music.csv")
music.head()

In [None]:
# Row count first, then the per-column dtype / non-null summary.
print(len(music.index))
music.info()

### Data Cleaning

In [None]:
# Rows with at least one missing value: print how many there are, then show five.
incomplete_rows = music.loc[music.isna().any(axis=1)]
print(len(incomplete_rows))
incomplete_rows.head(5)

In [None]:
# Drop the two columns instead of dropping every row that is missing them.
# errors='ignore' keeps this cell re-runnable: `music` is reassigned in place,
# so on a second execution the columns are already gone and a plain drop()
# would raise KeyError.
music = music.drop(columns=['artist_mbtags', 'song.hotttnesss'], errors='ignore')

# Rows that still contain a missing value in any remaining column.
music[music.isna().any(axis=1)]

In [None]:
# Remove every row that still has a missing value.
music = music.dropna()

# The original cell also evaluated `music[music.isna().any(axis=1)]` here; as a
# mid-cell expression it was never displayed and had no effect, so it is gone.

# Remaining row count, followed by summary statistics of the numeric columns.
print(music.shape[0])
music.describe()

In [None]:
# Re-check dtypes and non-null counts after the cleaning cells above.
music.info()

### Model Selection

In [None]:
# Columns left out of the modelling table built in this section.
drop_cols = [
    'artist.id',
    'artist.name',
    'location',
    'release.id',
    'release.name',
    'similar',
    'song.id',
    'title',
    'terms',
]
music_numeric = music.drop(drop_cols, axis=1)

In [None]:
# Encode the `terms` label as integer class codes stored in a new `genre` column.
# `terms` was removed from `music_numeric` by `drop_cols`, and no `genre` column
# is created anywhere earlier, so the labels are read from `music`.
# `music_numeric` was derived from `music` by dropping columns only, so both
# frames have the same rows in the same order.
label_encoder = LabelEncoder()
music_numeric['genre'] = label_encoder.fit_transform(music['terms'])

# Features are every remaining column except the encoded label.
X = music_numeric.drop(columns=['genre'])
y = music_numeric['genre']

In [None]:
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Learn the scaling statistics from the training split only ...
transformer = StandardScaler()
transformer.fit(X_train)
X_train_norm = pd.DataFrame(transformer.transform(X_train), columns=X_train.columns)

# ... and reuse them unchanged on the test split.
X_test_norm = transformer.transform(X_test)

# Candidate penalties: 100 log-spaced values between 1e-3 and 1e3.
alphas = np.logspace(-3, 3, 100)

# 10-fold cross-validated Lasso over the candidate penalties.
# NOTE(review): `y` holds LabelEncoder integer codes, so this regression treats
# the class codes as an ordered numeric target — confirm that is intended.
lassocv = LassoCV(
    alphas=alphas,
    cv=10,
    max_iter=10000,
    tol=1e-4,
    random_state=404,
)
lassocv.fit(X_train_norm, y_train)

# Average the per-fold MSE for each alpha.
mse_mean = lassocv.mse_path_.mean(axis=1)

# Mean CV error against alpha, with the minimum highlighted.
fig, ax = plt.subplots()
ax.plot(lassocv.alphas_, mse_mean, linestyle='--')
ax.scatter(lassocv.alpha_, mse_mean.min(), label='optimal alpha')
ax.set_xscale('log')
ax.set_xlabel('Alpha')
ax.set_ylabel('Mean Squared Error')
ax.set_title('MSE vs Alpha')
ax.legend()
plt.show()

print('Optimal alpha (Lasso):', lassocv.alpha_)

In [None]:
# Lasso Coefficients
# Take the feature names from the fitted estimator rather than from `X`: a later
# cell reassigns `X` to the selected subset, so re-running this cell afterwards
# would pair `coef_` with a column list of a different length.
# `feature_names_in_` is set by scikit-learn (>= 1.0) when the model is fit on a
# DataFrame with string column names; fall back to `X.columns` otherwise.
lasso_feature_names = getattr(lassocv, 'feature_names_in_', X.columns)

lasso_coefficients = pd.DataFrame({
    'feature': list(lasso_feature_names),
    'coefficient': lassocv.coef_
})

# Features the Lasso penalty did not shrink to exactly zero.
nonzero_features = lasso_coefficients[lasso_coefficients['coefficient'] != 0]
nonzero_features

### Logistic Regression (Baseline)

In [None]:
# Keep only the features with a non-zero Lasso coefficient.
select_col = lasso_coefficients.loc[
    lasso_coefficients['coefficient'] != 0, 'feature'
].to_list()
if not select_col:
    raise ValueError("Lasso kept no features; nothing to fit the logistic regression on.")

X = music_numeric[select_col]
# `terms` was dropped from `music_numeric` (see `drop_cols`), so indexing it there
# raises KeyError. Read the label from `music`, which has the same rows in the
# same order.
y = music['terms']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

# Fit the scaler on the training split only.
transformer = StandardScaler().fit(X_train)
X_train_norm = pd.DataFrame(transformer.transform(X_train), columns=X_train.columns)

# Apply the same transformation to the testing set
X_test_norm = transformer.transform(X_test)

log_reg = LogisticRegression(random_state=0, max_iter=1000).fit(X_train_norm, y_train)

# The model was fit on a DataFrame, so predict with the same column names;
# passing the bare array triggers scikit-learn's "X does not have valid feature
# names" warning.
log_reg.predict_proba(pd.DataFrame(X_test_norm, columns=X_train.columns))

---

#### Graph Neural Networks

##### Graph Structure

<div>
<img src="media/graph.jpg" width="500"/>
</div>

In [None]:
# Lookup tables from raw identifiers to contiguous integer node indices.

# Songs: one index per row of `music` (a repeated id would keep its last position).
song_id_map = {song_key: pos for pos, song_key in enumerate(music['song.id'])}

# Artists and releases: one index per distinct id, in order of first appearance,
# because several songs can share an artist or a release.
artist_id_map = {artist_key: pos for pos, artist_key in enumerate(music['artist.id'].unique())}
release_id_map = {release_key: pos for pos, release_key in enumerate(music['release.id'].unique())}

# Tags: one index per distinct value of `terms`.
tag_id_map = {tag_key: pos for pos, tag_key in enumerate(music['terms'].unique())}

In [None]:
### SONG NODE ###
# Columns used as the song node feature vector.
song_features = ['bars_confidence', 'bars_start', 'beats_confidence', 'beats_start',
    'duration', 'end_of_fade_in', 'key', 'key_confidence', 'loudness',
    'mode', 'mode_confidence', 'start_of_fade_out', 'tatums_confidence',
    'tatums_start', 'tempo', 'time_signature', 'time_signature_confidence']

num_songs = len(music['song.id'])
num_song_feats = len(song_features)

# Row i of the feature matrix holds the features of the first row in `music`
# whose song.id matches the i-th song.id. The original loop re-scanned the whole
# frame once per song (quadratic); an indexed lookup gives the same matrix in
# one pass.
first_song_rows = (
    music.drop_duplicates(subset='song.id', keep='first')
         .set_index('song.id')[song_features]
)
song_x = first_song_rows.loc[music['song.id'].to_numpy()].to_numpy(dtype=np.float32)
assert song_x.shape == (num_songs, num_song_feats)

song_x = torch.tensor(song_x, dtype=torch.float32)

In [None]:
### ARTIST NODE ###
# Columns used as the artist node feature vector.
artist_features = ['artist.hotttnesss', 'artist_mbtags_count', 'familiarity']
artist_ids = music['artist.id'].unique()
num_artists = len(artist_ids)

# One feature row per distinct artist, taken from that artist's first row in
# `music`. drop_duplicates(keep='first') preserves first-appearance order, which
# is the order `unique()` returns, so row i still corresponds to artist_ids[i].
# This replaces a loop that filtered the whole frame once per artist.
artist_x = (
    music.drop_duplicates(subset='artist.id', keep='first')[artist_features]
         .to_numpy(dtype=np.float32)
)
assert artist_x.shape == (num_artists, len(artist_features))

artist_x = torch.tensor(artist_x, dtype=torch.float32)

In [None]:
### RELEASE NODE ###
# Release nodes get a single all-zero feature column.
num_releases = music['release.id'].unique().size
release_x = torch.zeros(num_releases, 1, dtype=torch.float32)

### TAG NODE ###
# Tag nodes (distinct `terms` values) likewise get a single all-zero feature column.
num_tags = music['terms'].unique().size
tag_x = torch.zeros(num_tags, 1, dtype=torch.float32)

In [None]:
### BUILD EDGES ###

# Edge index layout: row 0 is the source index (song),
# row 1 is the destination index (artist).

# Song -----> Artist
# One edge per row of `music`; an id missing from a map raises KeyError.
song_src = [song_id_map[song_key] for song_key in music['song.id']]
artist_dst = [artist_id_map[artist_key] for artist_key in music['artist.id']]

song_artist_edge_index = torch.tensor([song_src, artist_dst], dtype=torch.long)

In [None]:
# Song -----> Release
# One edge per row of `music`: row 0 = song index, row 1 = release index.
song_src = [song_id_map[song_key] for song_key in music['song.id']]
release_dst = [release_id_map[release_key] for release_key in music['release.id']]

song_release_edge_index = torch.tensor([song_src, release_dst], dtype=torch.long)


In [None]:
# Song -----> Tag

# One edge per row of `music`: row 0 = song index, row 1 = tag index.
song_src = [song_id_map[song_key] for song_key in music['song.id']]
tag_dst = [tag_id_map[tag_key] for tag_key in music['terms']]

# Each edge carries that row's `terms_freq` value as its weight.
song_tag_weights = music['terms_freq'].tolist()

song_tag_edge_index = torch.tensor([song_src, tag_dst], dtype=torch.long)
song_tag_edge_attr = torch.tensor(song_tag_weights, dtype=torch.float32)

In [None]:
# Assemble the heterogeneous graph.
data = HeteroData()

# Node feature matrices, keyed by node type.
node_features = {
    'song': song_x,
    'artist': artist_x,
    'release': release_x,
    'tag': tag_x,
}
for node_type, features in node_features.items():
    data[node_type].x = features

# Edge indices, keyed by (source type, relation, destination type).
edge_indices = {
    ('song', 'performed_by', 'artist'): song_artist_edge_index,
    ('song', 'released_on', 'release'): song_release_edge_index,
    ('song', 'has_tag', 'tag'): song_tag_edge_index,
}
for edge_type, edge_index in edge_indices.items():
    data[edge_type].edge_index = edge_index

# Only the song -> tag relation has per-edge attributes (the terms_freq weights).
data['song', 'has_tag', 'tag'].edge_attr = song_tag_edge_attr


In [None]:
# Display the assembled HeteroData object as the cell output.
data