In [26]:
import pandas as pd
import os
from pathlib import Path

In [27]:
def check_duplicates(data, id_column='spotify_song_id'):
    """Print a duplicate report for a DataFrame.

    Two checks are reported, in this order:

    1. Fully duplicated rows (every column equal).
    2. Duplicated values in ``id_column``.

    For each check the count of repeats is printed (first occurrences are
    not counted), followed by the total row count. When a count is non-zero,
    every row involved is printed as well, including the first occurrence.

    Args:
        data: pandas DataFrame to inspect.
        id_column: Label of the column whose values are checked for repeats.

    Returns:
        None. The report is written to stdout.

    Raises:
        KeyError: Raised by pandas if ``id_column`` is not a column of ``data``.
    """
    total_rows = len(data)

    # duplicated() scans the whole frame, so take the count once and reuse it
    # for both the message and the branch below.
    duplicate_row_count = data.duplicated().sum()
    print(f"Number of duplicate rows: {duplicate_row_count}")
    print(f"Total rows: {total_rows}")

    if duplicate_row_count > 0:
        print("\nDuplicate rows:")
        # keep=False flags every copy, so the first occurrence is shown too.
        print(data[data.duplicated(keep=False)])

    # Same pattern for the ID column: one pass for the count, and a second
    # (keep=False) pass only when there is something to display.
    id_values = data[id_column]
    duplicate_id_count = id_values.duplicated().sum()
    print(f"\nNumber of duplicate {id_column}: {duplicate_id_count}")
    print(f"Total rows: {total_rows}")

    if duplicate_id_count > 0:
        duplicates = data[id_values.duplicated(keep=False)]
        print(f"\nRows with duplicate {id_column}: {len(duplicates)}")
        # Sorting by the ID places the colliding rows next to each other.
        print(duplicates.sort_values(id_column))

In [28]:
# Folder that holds every CSV this notebook reads and writes; the path is
# relative, so it resolves against the kernel's working directory.
data_folder_path = Path('../../data/')

# Show what is in the folder so the file names used below can be checked.
folder_listing = os.listdir(data_folder_path)
print(folder_listing)

['chordonomicon_raw.csv', 'final_test.csv', 'matrix_expanded_final_train.csv', 'density_expanded_final_train.csv', 'data_train_continuous_data.csv', 'spotify_final.csv', 'clean_test_with_4_grams.csv', 'final_train.csv', 'data_train_n_grams.csv', 'clean_test_with_5_grams.csv', 'clean_test_with_3_grams.csv']


In [29]:
# Load the 3-gram feature table, show its columns, and run the duplicate report.
# NOTE(review): the variable says "train" but the file is 'clean_test_with_3_grams.csv'
# — confirm which split this file actually holds.
three_gram_csv = data_folder_path.joinpath('clean_test_with_3_grams.csv')
data_train_3_grams = pd.read_csv(three_gram_csv)
print(data_train_3_grams.columns)
check_duplicates(data_train_3_grams)

Index(['chords', 'simplified_chords', 'decade', 'main_genre',
       'spotify_song_id', 'contains_G,C,G', 'contains_C,G,C', 'contains_C,G,D',
       'contains_C,G,Amin', 'contains_C,D,G', 'contains_D,G,C',
       'contains_Emin,C,G', 'contains_D,C,G', 'contains_G,Amin,F',
       'contains_G,D,C', 'contains_Amin,G,F', 'contains_G,C,D',
       'contains_Amin,F,G', 'contains_F,G,Amin', 'contains_G,F,G',
       'contains_Amin,C,G', 'contains_G,Amin,G', 'contains_G,C,Amin',
       'contains_Amin,G,C', 'contains_C,G,Emin', 'contains_G,Amin,C',
       'contains_G,Emin,C', 'contains_F,Amin,G', 'contains_C,D,C',
       'contains_C,Amin,G', 'contains_G,C,Emin', 'contains_Amin,G,Amin',
       'contains_C,Amin,C', 'contains_Emin,G,C', 'contains_G,F,Amin',
       'contains_Amin,D,G', 'contains_G,D,Amin'],
      dtype='object')
Number of duplicate rows: 0
Total rows: 255606

Number of duplicate spotify_song_id: 0
Total rows: 255606


In [30]:
# Load the 4-gram feature table, show its columns, and run the duplicate report.
# NOTE(review): the variable says "train" but the file is 'clean_test_with_4_grams.csv'
# — confirm which split this file actually holds.
four_gram_csv = data_folder_path.joinpath('clean_test_with_4_grams.csv')
data_train_4_grams = pd.read_csv(four_gram_csv)
print(data_train_4_grams.columns)
check_duplicates(data_train_4_grams)

Index(['chords', 'simplified_chords', 'decade', 'main_genre',
       'spotify_song_id', 'contains_C,G,C,G', 'contains_G,C,G,C',
       'contains_F,C,G,Amin', 'contains_C,G,Amin,F', 'contains_Amin,F,C,G',
       'contains_G,D,C,G', 'contains_C,G,D,C', 'contains_G,Amin,F,C',
       'contains_D,C,G,D', 'contains_D,G,C,G', 'contains_C,D,G,C',
       'contains_G,C,D,G', 'contains_G,C,G,D', 'contains_C,G,D,G',
       'contains_D,G,C,D', 'contains_G,D,G,C', 'contains_G,Amin,F,G',
       'contains_Amin,F,G,C', 'contains_F,G,Amin,F', 'contains_F,G,F,G',
       'contains_Amin,G,F,C', 'contains_Amin,G,Amin,G',
       'contains_Amin,F,G,Amin', 'contains_G,Amin,G,Amin',
       'contains_G,Emin,C,G', 'contains_C,G,Amin,C', 'contains_G,Amin,C,G',
       'contains_G,C,G,Amin', 'contains_C,D,G,D', 'contains_D,C,D,C',
       'contains_F,Amin,G,F', 'contains_C,G,Emin,C', 'contains_Amin,C,G,Amin',
       'contains_F,G,C,Amin', 'contains_C,Amin,F,G', 'contains_G,F,G,C',
       'contains_G,Amin,G,F', 'conta

In [31]:
# Load the 5-gram feature table, show its columns, and run the duplicate report.
# NOTE(review): the variable says "train" but the file is 'clean_test_with_5_grams.csv'
# — confirm which split this file actually holds.
five_gram_csv = data_folder_path.joinpath('clean_test_with_5_grams.csv')
data_train_5_grams = pd.read_csv(five_gram_csv)
print(data_train_5_grams.columns)
check_duplicates(data_train_5_grams)

Index(['chords', 'simplified_chords', 'decade', 'main_genre',
       'spotify_song_id', 'contains_G,C,G,C,G', 'contains_C,G,C,G,C',
       'contains_C,G,Amin,F,C', 'contains_F,C,G,Amin,F',
       'contains_G,Amin,F,C,G', 'contains_Amin,F,C,G,Amin',
       'contains_C,G,D,C,G', 'contains_G,D,C,G,D', 'contains_D,C,G,D,C',
       'contains_D,G,C,D,G', 'contains_G,C,D,G,C', 'contains_C,D,G,C,D',
       'contains_G,C,G,D,G', 'contains_G,D,G,C,G', 'contains_C,G,D,G,C',
       'contains_D,G,C,G,D', 'contains_F,G,Amin,F,G',
       'contains_G,Amin,F,G,Amin', 'contains_G,Amin,G,Amin,G',
       'contains_Amin,G,Amin,G,Amin', 'contains_C,G,Emin,C,G',
       'contains_Amin,F,G,Amin,F', 'contains_C,G,Amin,C,G',
       'contains_G,F,G,F,G', 'contains_Amin,G,F,Amin,G',
       'contains_C,Amin,F,G,C', 'contains_Emin,C,G,Emin,C',
       'contains_G,Amin,C,G,Amin', 'contains_G,F,Amin,G,F',
       'contains_F,G,F,G,F', 'contains_G,C,Amin,F,G', 'contains_F,G,C,Amin,F',
       'contains_Amin,C,G,Amin,C', '

In [32]:
# Columns used together as the join key for the 3-, 4- and 5-gram tables.
common_cols = ['spotify_song_id', 'chords', 'simplified_chords', 'decade', 'main_genre']

# Combine the three feature tables into one wide frame.
# validate='one_to_one' makes pandas raise MergeError when the join key is
# repeated on either side, instead of silently multiplying rows.
# how='inner' keeps only keys found in all three tables, so compare the printed
# shape with the input row counts to spot dropped rows.
merged_data = (
    data_train_3_grams
    .merge(data_train_4_grams, on=common_cols, how='inner', validate='one_to_one')
    .merge(data_train_5_grams, on=common_cols, how='inner', validate='one_to_one')
)

print(f"Merged data shape: {merged_data.shape}")

Merged data shape: (255606, 114)


In [33]:
# Every column that is not part of the join key is treated as an n-gram flag.
key_columns = set(common_cols)
bool_cols = [name for name in merged_data.columns if name not in key_columns]

# Cast the flag columns to integers (False -> 0, True -> 1) and write them back
# through column assignment so the new dtype replaces the old one.
flags_as_int = merged_data[bool_cols].astype(int)
merged_data[bool_cols] = flags_as_int

print(f"Converted {len(bool_cols)} boolean columns to 0/1")

Converted 109 boolean columns to 0/1


In [34]:
# Write the merged n-gram table next to the input files, without the index column.
# NOTE(review): the output is named 'data_train_n_grams.csv' while the inputs were
# read from 'clean_test_with_*_grams.csv' — confirm the intended split.
output_path = data_folder_path.joinpath('data_train_n_grams.csv')
merged_data.to_csv(output_path, index=False)
print(f"Saved merged data to {output_path}")

Saved merged data to ../../data/data_train_n_grams.csv
