In [11]:
#Merge two sets of music data
import pandas as pd


def _join_columns_and_save(left, right, destination):
    """Put the columns of `right` after those of `left` and save the result.

    The two frames are combined with ``pd.concat(axis=1)`` (rows are matched
    on the index), written to `destination` without the index column, and the
    combined frame is returned.
    """
    combined = pd.concat([left, right], axis=1)
    combined.to_csv(destination, index=False)
    return combined


# Stage 1: metadata columns followed by the chord annotation columns.
chord_data = pd.read_csv('datasets/chord_data.csv')
meta_data = pd.read_csv('datasets/meta.csv')
result = _join_columns_and_save(
    meta_data, chord_data, 'datasets/meta_with_chords.csv'
)

# Stage 2: reload the stage-1 file from disk and append the key/mode
# annotation columns to it.
key_mode_data = pd.read_csv('datasets/key_mode_data.csv')
meta_with_chords_data = pd.read_csv('datasets/meta_with_chords.csv', low_memory=False)
result = _join_columns_and_save(
    meta_with_chords_data, key_mode_data, 'datasets/meta_with_chords_key_mode.csv'
)


In [16]:
import pandas as pd

# keep_default_na=False keeps blank cells as empty strings instead of NaN,
# which is what the emptiness test below relies on.
merged_data = pd.read_csv('datasets/meta_with_chords_key_mode.csv', keep_default_na=False)

# Normalise ids to stripped strings, then flag the rows whose id is blank.
merged_data['id'] = merged_data['id'].astype(str).str.strip()
empty_id_rows = merged_data['id'].eq('')

# Keep only the rows that precede the first blank id; with no blank id the
# whole frame is kept.
selected_data = merged_data
if empty_id_rows.any():
    first_empty_label = empty_id_rows.idxmax()  # label of the first True
    selected_data = merged_data.loc[:first_empty_label - 1]

selected_data.to_csv('datasets/merged_meta.csv', index=False)

# Reload the cleaned file (this time with default NaN parsing) and preview it.
merged_data = pd.read_csv('datasets/merged_meta.csv',low_memory=False)

print(merged_data.head())

              id                                  title artist release  \
0    billboard_0                       Must Of Got Lost    NaN     NaN   
1    billboard_1                       Baby I'm Burnin'    NaN     NaN   
2   billboard_10  Kokomo (From The Cocktail Soundtrack)    NaN     NaN   
3  billboard_100                     Heart Full of Soul    NaN     NaN   
4  billboard_101          Down And Out In New York City    NaN     NaN   

     duration identifiers   type genre composers         performers  ...  \
0  180.580159         NaN  audio   NaN       NaN  The J. Geils Band  ...   
1  158.484898         NaN  audio   NaN       NaN       Dolly Parton  ...   
2  217.614898         NaN  audio   NaN       NaN     The Beach Boys  ...   
3  149.394286         NaN  audio   NaN       NaN      The Yardbirds  ...   
4  286.380408         NaN  audio   NaN       NaN        James Brown  ...   

  movement_title                         jams_path      chord-file_name  \
0            NaN    dat

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Genre classification baseline: integer-code four metadata columns, then fit
# and score a random forest on a single 80/20 hold-out split.
data = pd.read_csv('datasets/merged_meta.csv', low_memory=False)

# Columns used as model inputs.
feature_columns = ['duration', 'release_year', 'composers', 'performers']

# A single LabelEncoder is refitted for each column in turn; only the integer
# codes are kept, under '<column>_encoded'.
# NOTE(review): 'duration' looks like a continuous numeric column, so the
# codes are effectively value ranks rather than category ids - confirm this
# is intended.
label_encoder = LabelEncoder()
X = pd.DataFrame(
    {
        col + '_encoded': label_encoder.fit_transform(data[col])
        for col in feature_columns
    },
    index=data.index,
)

# Target: the same encoder is refitted on 'genre', so from here on
# `label_encoder` describes the genre classes.
# NOTE(review): rows without a genre are not removed; the label mapping
# printed elsewhere in this notebook shows nan as its own class - confirm
# that predicting "missing" is really part of the task.
y = data['genre']
y_encoded = label_encoder.fit_transform(y)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Scaling statistics come from the training part only and are then applied
# to the test part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the forest and predict the held-out rows.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report overall accuracy and the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.9877394636015325
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        47
           1       0.97      0.98      0.98       670
           2       1.00      1.00      1.00        44
           3       0.99      0.99      0.99      1849

    accuracy                           0.99      2610
   macro avg       0.98      0.99      0.98      2610
weighted avg       0.99      0.99      0.99      2610



In [21]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 5-fold stratified cross-validation of the random forest.
#
# This cell does not load any data itself: it expects the encoded feature
# frame `X` and the raw genre labels `y` to already exist in the kernel
# (they are created by the hold-out training cell), so run that cell first.

# Whole-dataset scaled copy, retained for any cell that expects `X_scaled`.
# It is deliberately NOT used for cross-validation below, because its
# statistics are computed from every row, validation folds included.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Scaling is part of the cross-validated estimator, so each fold fits the
# scaler on its own training portion only and no validation data leaks into
# preprocessing. cross_val_score clones the pipeline for every fold, so
# `model` itself stays unfitted.
pipeline = make_pipeline(StandardScaler(), model)

# Define the K-fold cross-validation strategy, StratifiedKFold maintains class proportions
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation on the unscaled features; the pipeline scales them
scores = cross_val_score(pipeline, X, y_encoded, cv=cv, scoring='accuracy')

# Print the accuracy of each cross-validation fold and the average accuracy
print(f'Accuracy of each cross-validation fold: {scores}')
print(f'Average accuracy: {scores.mean()}')


Accuracy of each cross-validation fold: [0.96666667 0.97739464 0.97471264 0.97355309 0.97048678]
Average accuracy: 0.9725627616752502


In [22]:
# Show how the genre LabelEncoder maps class names to integer codes.
#
# Expects `label_encoder` to already exist in the kernel and to be the
# LabelEncoder that was last fitted on the 'genre' column.

# Dictionary from each class name to its encoded value
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Print the mapping relationship
print(label_mapping)

# Print the category name behind every code. Iterating over classes_ (whose
# position is the code) covers however many classes the encoder has, instead
# of assuming exactly four and raising when a code does not exist.
for code, category in enumerate(label_encoder.classes_):
    print(f"Category {code} represents: {category}")


{'classical': 0, 'jazz': 1, 'rock': 2, nan: 3}
Category 0 represents: classical
Category 1 represents: jazz
Category 2 represents: rock
Category 3 represents: nan


In [1]:
# Calculation of Music Similarity (Cosine Similarity)
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Number of neighbours to report for the selected piece.
top_n = 10

# Read data containing selected features
data = pd.read_csv('datasets/merged_meta.csv', low_memory=False)
selected_features = ['duration', 'chord-start_time', 'chord-end_time',
                     'key_mode-start_time', 'key_mode-end_time']
# Select all required numerical features
data_selected = data[selected_features]

# Fill missing values with the per-column mode (first mode when several tie)
data_selected = data_selected.fillna(data_selected.mode().iloc[0])

# Pairwise cosine similarity between every two rows
similarity_matrix = cosine_similarity(data_selected)

# Select a piece of music (for example, select the first piece of music)
selected_music_index = 0
selected_music = data.iloc[selected_music_index]['title']

# Rank all rows from most to least similar to the selected piece
similar_music_indices = similarity_matrix[selected_music_index].argsort()[::-1]

# Remove the selected piece by its index rather than by assuming it is ranked
# first: rows with tied similarity (for example duplicated or proportional
# feature vectors) can sort ahead of it, in which case dropping position 0
# would discard a real neighbour and list the piece as similar to itself.
neighbour_indices = similar_music_indices[similar_music_indices != selected_music_index]
top_similar_music_indices = neighbour_indices[:top_n]

# Print the most similar music
similar_music_data = data.iloc[top_similar_music_indices][['title']]
print("Selected Music:", selected_music)
print("\nMost Similar Music:")
print(similar_music_data)


Selected Music: Must Of Got Lost

Most Similar Music:
                                   title
422                       Hello Stranger
8922       The Night Has A Thousand Eyes
8043        Three O'Clock In The Morning
2676                   Worried Man Blues
4062     Love (Your Spell Is Everywhere)
1154                            Get Back
5323                       Not Fade Away
9887                    I Started A Joke
4782                  Home In San Antone
12846  Nobody Knows TheTrouble I've Seen


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Metadata-based music similarity: one-hot encode the categorical columns,
# add an integer code for the remaining column, and rank every piece by
# cosine similarity to one selected piece.

# Number of neighbours to report for the selected piece.
top_n = 10

# Read data containing selected features
data = pd.read_csv('datasets/merged_meta.csv', low_memory=False)
selected_features = ['duration','release', 'genre', 'composers', 'performers']
# Select the feature columns (one numeric column plus categorical metadata)
data_selected = data[selected_features]

# Perform one-hot encoding on non-numeric features
categorical_features = ['release','genre', 'composers', 'performers']  # Columns needing one-hot encoding
data_selected = pd.get_dummies(data_selected, columns=categorical_features)

# Create a LabelEncoder
label_encoder = LabelEncoder()

# Add an integer-coded copy ('<column>_encoded') of every selected column
# that was not one-hot encoded; the original column is kept as well.
# NOTE(review): the raw 'duration' values and their integer codes sit next to
# 0/1 indicator columns without any scaling, so they probably dominate the
# cosine similarity - confirm this weighting is intended.
for col in selected_features:
    if col not in categorical_features:
        data_selected[col + '_encoded'] = label_encoder.fit_transform(data_selected[col])

# Fill missing values with the per-column mode (first mode when several tie)
data_selected = data_selected.fillna(data_selected.mode().iloc[0])

# Pairwise cosine similarity between every two rows
similarity_matrix = cosine_similarity(data_selected)

# Select a piece of music (for example, select the first piece of music)
selected_music_index = 0
selected_music_title = data.iloc[selected_music_index]['title']
selected_music_artists = data.iloc[selected_music_index]['artist']

# Rank all rows from most to least similar to the selected piece
similar_music_indices = similarity_matrix[selected_music_index].argsort()[::-1]

# Remove the selected piece by its index rather than by assuming it is ranked
# first: rows with tied similarity can sort ahead of it, in which case
# dropping position 0 would discard a real neighbour and list the piece as
# similar to itself.
neighbour_indices = similar_music_indices[similar_music_indices != selected_music_index]
top_similar_music_indices = neighbour_indices[:top_n]

# Print the most similar music and artists
print("Selected Music:", selected_music_title)
print("Artists:", selected_music_artists)
print("\nMost Similar Music and Artists:")
similar_music_data = data.iloc[top_similar_music_indices][['title', 'artist']]
print(similar_music_data)


Selected Music: Must Of Got Lost
Artists: nan

Most Similar Music and Artists:
                         title                            artist
2828            Elusive Dreams                               NaN
3250                  Serenata                               NaN
2988        Pink Panther Theme                 ['Henry Mancini']
3411       Let Your Glory Fall                               NaN
4264               Slow Change                               NaN
4358                 Work Song  ["Julian 'Cannonball' Adderley"]
871    Shake Your Groove Thing                               NaN
92            Against The Wind                               NaN
10863               Sierra Sue                               NaN
8902             The Holy City                               NaN
