# Dimensionality Reduction
Input: the dataset with all labels.
Output: a dataset with the new (fewer) labels.

Label all instruments and merge the labels that are highly correlated with each other.
Build a t-SNE space for the genres.


In [None]:
import pandas as pd

import importlib
from IPython.display import display, Audio

import preprocess
import dimensionality_reduction
# Reload the local helper modules, in the same order as they were imported,
# so the notebook works with their current on-disk code.
for _local_module in (preprocess, dimensionality_reduction):
    importlib.reload(_local_module)

# Show floats with three decimals whenever pandas renders them.
pd.set_option('display.float_format', lambda x: f'{x:.3f}')

In [None]:
# CSV with the extracted audio features; it is loaded below with track_id as the index.
metadata_path = '../feature_extraction/audio_features/20240616_4698tracks.csv'
# Folder with MP3 previews. Not referenced again in the visible part of this notebook.
mp3_folder = '../user_evaluation_app/static/mp3_previews'
# NOTE(review): "feautures" looks like a typo for "features". The value is not
# referenced again in the visible notebook (the final CSV is written to the
# working directory) — confirm whether a folder with this spelling exists
# before renaming it.
output_folder = 'fewer_labels_audio_feautures'

In [None]:
# Read the feature table from disk; track_id becomes the row index.
metadata = pd.read_csv(metadata_path, index_col='track_id')

# Keep only the columns whose name begins with "MTT".
mtt_metadata = metadata.filter(regex='^MTT', axis='columns')


In [None]:
# Inspect the correlations between the raw MTT label columns before merging.
# NOTE(review): the second argument (50) is presumably the number of pairs to
# list — confirm in dimensionality_reduction.
dimensionality_reduction.list_strongest_correlations(mtt_metadata, 50)

# Label merging

In [None]:
# Merge plan passed to the create_*_columns helpers below. Each tuple holds
# three names: two source names followed by a new name (presumably the name of
# the column produced by merging the two sources — confirm in
# dimensionality_reduction).
# Later tuples use names introduced by earlier ones (e.g. 'Male Vocal' appears
# as the third name of the second tuple and as a source in the fourth), so the
# order of this list matters if the helpers process it sequentially.
pairs_to_merge = [('MTT choir', 'MTT choral', 'Choir'), 
                    ('MTT male', 'MTT male vocal', 'Male Vocal'),
                    ('MTT classical', 'MTT classic', 'Classical'),
                    ('MTT man', 'Male Vocal', 'M Vocal'),
                    ('MTT male voice', 'M Vocal', 'M Voice'),
                    ('MTT vocal', 'MTT vocals', 'Vocals'), 
                    ('MTT singing', 'Vocals', 'VoiceA'),                    
                    ('MTT female', 'MTT female vocal', 'F Vocal'),
                    ('MTT woman', 'F Vocal', 'F Voice'),
                    ('MTT beat', 'MTT beats', 'Beats'),
                    ('MTT female voice', 'F Voice', 'F VoiceA'),
                    ('MTT quiet', 'MTT soft', 'Quiet'),
                    ('MTT strings', 'MTT violin', 'StringsA'),
                    ('MTT no vocals', 'MTT no vocal', 'No Vocals'),
                    ('MTT no voice', 'No Vocals', 'No Voice'),
                    ('MTT voice', 'M Voice', 'VoiceB'),
                    ('VoiceA', 'VoiceB', 'VoiceC'),
                    ('MTT cello', 'StringsA', 'Strings'),
                    ('F VoiceA', 'VoiceC', 'Voice'),
                    ('MTT harpsichord', 'MTT harp', 'Harp and Harpsichord'),
                    ]

In [None]:
# Build two candidate merged tables from the same merge plan: one through
# create_averaged_columns and one through create_summed_columns.
new_df_avg = dimensionality_reduction.create_averaged_columns(mtt_metadata, pairs_to_merge)
new_df_sum = dimensionality_reduction.create_summed_columns(mtt_metadata, pairs_to_merge)
# Inspect the correlations again, this time on the summed table.
dimensionality_reduction.list_strongest_correlations(new_df_sum, 50)

In [None]:
# Largest value of every column of the summed table (the 'max' row of describe()).
max_per_label = new_df_sum.describe().loc['max']
overall_max = max_per_label.max()

# Report what the data actually shows instead of always claiming the maximum
# stays below 1.
if overall_max < 1:
    verdict = "remains below 1"
else:
    verdict = "does NOT remain below 1"
print(f"Max value: {overall_max}, which {verdict}.")

# Per-label maxima, largest first.
max_per_label.sort_values(ascending=False)

Check that merging by summing the columns does not push any prediction above 1. It does not, so summing might be the best way to merge. Check this manually.

# Rename the labels, merge with the untouched metadata


In [None]:
# Mapping from the column names of the merged tables to the short label names
# used from here on.
# NOTE(review): both 'Harp and Harpsichord' and 'MTT harp' map to 'harp'. If
# both columns were present at the same time this would create two 'harp'
# columns — confirm that the merge step removes 'MTT harp'.
new_colnames = {'MTT guitar': 'guitar', 'MTT slow': 'slow', 'MTT techno': 'techno', 
                'MTT drums': 'drums', 'MTT electronic': 'electronic', 'MTT rock': 'rock', 
                'MTT fast': 'fast', 'MTT piano': 'piano', 'MTT ambient': 'ambient', 'MTT synth': 'synth',
                'MTT indian': 'indian', 'MTT opera': 'opera', 'Harp and Harpsichord': 'harp', 'MTT loud': 'loud',
                'MTT flute': 'flute', 'MTT pop': 'pop', 'MTT sitar': 'sitar', 'MTT solo': 'solo', 'MTT new age': 'new age',
                'MTT dance': 'dance', 'MTT harp': 'harp', 'MTT weird': 'weird', 'MTT country': 'country', 'MTT metal': 'metal',
                'Choir': 'choir', 'Classical': 'classical', 'Beats': 'beats', 'Quiet': 'quiet', 'No Voice': 'no voice',
                'Strings': 'strings', 'Voice': 'voice'}

# The merged 'voice' column is divided by this value to lower it.
VOICE_DIVISOR = 2

new_df_sum_renamed_labels = new_df_sum.rename(columns=new_colnames)
new_df_avg_renamed_labels = new_df_avg.rename(columns=new_colnames)

# Continue with the summed variant. assign() returns a new frame, so scaling
# 'voice' here leaves new_df_sum_renamed_labels untouched (previously the two
# names referred to the same object and both were modified). The former
# self-assignment of 'strings' had no effect and has been dropped.
new_df_renamed_labels = new_df_sum_renamed_labels.assign(
    voice=new_df_sum_renamed_labels['voice'] / VOICE_DIVISOR
)

display(new_df_renamed_labels.columns)

Lower the voice channel by dividing it by 2 (i.e. halving it).

In [None]:
# Manual grouping of the renamed labels into three families.
instruments = [
    'voice', 'choir', 'harp', 'strings', 'no voice', 'guitar',
    'drums', 'piano', 'flute', 'sitar', 'synth',
]
genres = [
    'ambient', 'techno', 'electronic', 'rock', 'pop', 'dance', 'classical',
    'opera', 'new age', 'indian', 'weird', 'country', 'metal',
]
other = ['fast', 'slow', 'loud', 'solo', 'beats', 'quiet']

# Print the number of columns next to the number of grouped labels so the two
# counts can be compared by eye.
n_label_columns = len(new_df_renamed_labels.columns)
n_grouped_labels = len(genres) + len(instruments) + len(other)
print(f"{n_label_columns} labels = {n_grouped_labels} labels")

In [None]:
# Give every label a one-letter family prefix: "o " for other, "i " for
# instruments and "g " for genres. The groups are visited in that order, so a
# label listed in more than one group ends up with the prefix of the last one.
rename_dict = {
    col: f'{prefix} {col}'
    for prefix, group in (('o', other), ('i', instruments), ('g', genres))
    for col in group
}
new_df_renamed_labels = new_df_renamed_labels.rename(columns=rename_dict)
new_df_renamed_labels

In [None]:
# Columns of the original table whose name does not start with "MTT".
non_mtt_metadata = metadata.filter(regex='^(?!MTT)', axis='columns')

# Attach the renamed label columns to them, matching rows on 'track_id'.
merged_df = pd.merge(non_mtt_metadata, new_df_renamed_labels, on="track_id")
merged_df

# t-SNE for the genres

In [None]:
# Select the genre columns, i.e. those carrying the "g " prefix.
# The pattern includes the space after "g" so that other columns whose name
# merely begins with the letter g are not fed into the t-SNE embedding.
genre_features = merged_df.filter(regex='^g ', axis=1)
genre_features

In [None]:
import pandas as pd
from sklearn.manifold import TSNE

# Two-component t-SNE with random_state fixed at 1, fitted on the genre columns.
tsne_2d = TSNE(n_components=2, random_state=1)
tsne_components_2d = tsne_2d.fit_transform(genre_features)

# Wrap the embedding in a DataFrame: one column per component, named TSNE1,
# TSNE2, ..., with the same row index as genre_features.
tsne_column_names = [
    f'TSNE{component}' for component in range(1, tsne_components_2d.shape[1] + 1)
]
tsne_2d_genre_df = pd.DataFrame(
    data=tsne_components_2d,
    index=genre_features.index,
    columns=tsne_column_names,
)

In [None]:
import matplotlib.pyplot as plt
# Scatter plot of the two t-SNE components, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    tsne_2d_genre_df['TSNE1'],
    tsne_2d_genre_df['TSNE2'],
    c='lightblue',
    label='Data Points',
)

ax.set_title('t-SNE 2D Scatter Plot')
ax.set_xlabel('TSNE1')
ax.set_ylabel('TSNE2')

ax.legend()
plt.show()

In [None]:
# Linearly scale the 2D t-SNE coordinates to the range 0-1.
def scale_column(col):
    """Min-max scale a column so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    col : pandas.Series (or any object offering min(), max() and arithmetic)
        The values to scale.

    Returns
    -------
    The scaled values, ``(col - min) / (max - min)``. A column whose values
    are all equal is returned as all zeros instead of NaN.
    """
    lowest = col.min()
    span = col.max() - lowest
    if span == 0:
        # Constant column: dividing by the zero range would give 0/0 = NaN.
        # Dividing by 1 instead maps every value to 0.
        span = 1
    return (col - lowest) / span

# Run scale_column over each t-SNE column separately, then summarise the result.
tsne_2d_genre_df_scaled = tsne_2d_genre_df.apply(scale_column, axis='index')
tsne_2d_genre_df_scaled.describe()

In [None]:
# Prefix the scaled t-SNE columns with "g ", the same prefix the genre columns carry.
tsne_2d_genre_df_prefixed = tsne_2d_genre_df_scaled.add_prefix('g ')
# NOTE(review): despite its name, this keeps the columns whose name starts
# with "g" (regex '^g') rather than removing them, and the variable is not
# used anywhere else in the visible notebook.
without_old_genres = merged_df.filter(regex='^g', axis=1)
# Add the t-SNE columns to merged_df, matching rows on 'track_id'; the existing
# genre columns of merged_df are kept.
# NOTE(review): the name says PCA, but the coordinates come from the t-SNE
# embedding computed above.
with_PCA_genres_df = merged_df.merge(tsne_2d_genre_df_prefixed, on='track_id')
with_PCA_genres_df

In [None]:
# Build the output table: drop the per-genre columns (prefixed "g ") and add
# the prefixed, scaled t-SNE columns in their place, matching rows on 'track_id'.
# The pattern includes the space after "g" so that other columns whose name
# merely begins with the letter g are kept in the output.
removed_genres = merged_df.filter(regex='^(?!g )', axis=1)
merged_df_output = pd.merge(removed_genres, tsne_2d_genre_df_prefixed, on='track_id')
merged_df_output

# Save to file

In [None]:
# Write the reduced table to "dimensionality_reduced.csv" (relative to the
# current working directory), labelling the index column 'track_id'.
merged_df_output.to_csv("dimensionality_reduced.csv", index_label='track_id')
# To read the file back with track_id as the index again:
#pd.read_csv("dimensionality_reduced.csv", index_col='track_id')