## Libraries

In [2]:
import os
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
import warnings

## Importing Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [17]:
# Define the folder path using your Drive mount and folder ID
gtzan_folder_path = '/content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original'  # Adjust if the folder is nested within other folders

# List all files in the folder
for root, dirs, files in os.walk(gtzan_folder_path):
    print(f"Found directory: {root}")

Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original
Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original/rock
Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original/jazz
Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original/reggae
Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original/country
Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original/classical
Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original/hiphop
Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original/metal
Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_original/disco
Found directory: /content/drive/MyDrive/project_data_source/gtzan_dataset/Data/genres_origi

In [5]:
# Define the folder path using your Drive mount and folder ID
fma_folder_path = '/content/drive/MyDrive/project_data_source/free_music_archive/fma_metadata'

# List all files in the folder
for root, dirs, files in os.walk(fma_folder_path):
    print(f"Found directory: {root}")
    for file in files:
        print(f" - {file}")

Found directory: /content/drive/MyDrive/project_data_source/free_music_archive/fma_metadata
 - not_found.pickle
 - checksums
 - raw_tracks.csv
 - raw_artists.csv
 - genres.csv
 - raw_albums.csv
 - features.csv
 - tracks.csv
 - echonest.csv
 - README.txt
 - raw_genres.csv
 - raw_echonest.csv


In [6]:
# Suppress DtypeWarning to avoid unnecessary output
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)

# Define the folder path to the FMA metadata
fma_folder_path = '/content/drive/MyDrive/project_data_source/free_music_archive/fma_metadata'

# List all files in the folder for verification
for root, dirs, files in os.walk(fma_folder_path):
    print(f"Found directory: {root}")
    for file in files:
        print(f" - {file}")

# Initialize an empty dictionary to store DataFrames
fma_dfs = {}

# Loop through the files and load each CSV into a DataFrame
for filename in os.listdir(fma_folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(fma_folder_path, filename)
        df_name = filename[:-4]  # Remove the .csv extension for DataFrame name
        print(f'Loading {filename}...')

        try:
            # Load the CSV file and store it in the dictionary
            fma_dfs[df_name] = pd.read_csv(file_path, low_memory=False)
            print(f'Successfully loaded {df_name} with {fma_dfs[df_name].shape[0]} rows.')
        except Exception as e:
            print(f'Error loading {filename}: {e}')

Found directory: /content/drive/MyDrive/project_data_source/free_music_archive/fma_metadata
 - not_found.pickle
 - checksums
 - raw_tracks.csv
 - raw_artists.csv
 - genres.csv
 - raw_albums.csv
 - features.csv
 - tracks.csv
 - echonest.csv
 - README.txt
 - raw_genres.csv
 - raw_echonest.csv
Loading raw_tracks.csv...
Successfully loaded raw_tracks with 109727 rows.
Loading raw_artists.csv...
Successfully loaded raw_artists with 16916 rows.
Loading genres.csv...
Successfully loaded genres with 163 rows.
Loading raw_albums.csv...
Successfully loaded raw_albums with 15234 rows.
Loading features.csv...
Successfully loaded features with 106577 rows.
Loading tracks.csv...
Successfully loaded tracks with 106576 rows.
Loading echonest.csv...
Successfully loaded echonest with 13132 rows.
Loading raw_genres.csv...
Successfully loaded raw_genres with 164 rows.
Loading raw_echonest.csv...
Successfully loaded raw_echonest with 14514 rows.


In [7]:
fma_raw_tracks_df = fma_dfs['raw_tracks']
fma_raw_artists_df = fma_dfs['raw_artists']
fma_raw_albums_df = fma_dfs['raw_albums']
fma_raw_genres_df = fma_dfs['raw_genres']
fma_features_df = fma_dfs['features']
fma_tracks_df = fma_dfs['tracks']
fma_echonest_df = fma_dfs['echonest']
fma_raw_echonest_df = fma_dfs['raw_echonest']

## Feature Extraction

### Extract audio features from fma_small Dataset

In [None]:
# Define the path to the fma_small directory
audio_root_folder = '/content/drive/MyDrive/project_data_source/free_music_archive/fma_small'

# Function to extract features from a single audio file
def extract_features(file_path):
    try:
        y, sr = librosa.load(file_path, sr=None)

        # Spectral Centroid
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
        spectral_centroids_mean = np.mean(spectral_centroids)
        spectral_centroids_delta_mean = np.mean(librosa.feature.delta(spectral_centroids))
        spectral_centroids_accelerate_mean = np.mean(librosa.feature.delta(spectral_centroids, order=2))

        # Spectral Bandwidth
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        spectral_bandwidth_mean = np.mean(spectral_bandwidth)

        # Spectral Rolloff
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        spectral_rolloff_mean = np.mean(spectral_rolloff)

        # Zero Crossing Rate
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
        zero_crossing_rate_mean = np.mean(zero_crossing_rate)

        # RMS (Root Mean Square) Energy
        rms = librosa.feature.rms(y=y)
        rms_mean = np.mean(rms)

        # Chroma STFT
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_stft_mean = np.mean(chroma_stft)

        # MFCCs (Mel-Frequency Cepstral Coefficients)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfccs_mean = np.mean(mfccs, axis=1).mean()

        # Return all features as a dictionary
        return {
            'spectral_centroids_mean': spectral_centroids_mean,
            'spectral_centroids_delta_mean': spectral_centroids_delta_mean,
            'spectral_centroids_accelerate_mean': spectral_centroids_accelerate_mean,
            'spectral_bandwidth_mean': spectral_bandwidth_mean,
            'spectral_rolloff_mean': spectral_rolloff_mean,
            'zero_crossing_rate_mean': zero_crossing_rate_mean,
            'rms_mean': rms_mean,
            'chroma_stft_mean': chroma_stft_mean,
            'mfccs_mean': mfccs_mean
        }
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None  # Return None if there’s an error

# Initialize an empty list to store feature dictionaries
features_list = []

# Walk through all subdirectories in the fma_small folder
for root, dirs, files in os.walk(audio_root_folder):
    for file in files:
        if file.endswith('.wav') or file.endswith('.mp3'):
            file_path = os.path.join(root, file)
            folder_name = os.path.basename(root)
            print(f'Processing {file_path}...')

            # Extract features and store them with additional metadata
            features = extract_features(file_path)
            if features:
                features['track_name'] = file
                features['folder_name'] = folder_name
                features_list.append(features)

# Convert the list of dictionaries into a DataFrame
features_df = pd.DataFrame(features_list)

# Display the DataFrame
print(features_df)

In [72]:
# Rename features_df to fma_features_df for clarification
fma_features_df = features_df

### Extract audio features from GTZAN Dataset

In [None]:
# Initialize an empty list to store feature dictionaries
gtzan_features_list = []

# Walk through all subdirectories in the gtzan folder (each folder is a genre)
for root, dirs, files in os.walk(gtzan_folder_path):
    for file in files:
        if file.endswith('.wav'):
            file_path = os.path.join(root, file)
            genre_name = os.path.basename(root)
            print(f'Processing {file_path}...')

            # Extract features and store them with additional metadata
            features = extract_features(file_path)
            if features:
                features['track_name'] = file
                features['genre'] = genre_name
                gtzan_features_list.append(features)

# Convert the list of dictionaries into a DataFrame
gtzan_features_df = pd.DataFrame(gtzan_features_list)

# Display the DataFrame
print(gtzan_features_df)

### Combining and saving datarames to Google Drive

In [75]:
# Define the paths to save the CSV files in your Google Drive
fma_csv_path = '/content/drive/MyDrive/project_data_source/fma_features_df.csv'
gtzan_csv_path = '/content/drive/MyDrive/project_data_source/gtzan_features_df.csv'

# Save the FMA features DataFrame
features_df.to_csv(fma_csv_path, index=False)
print(f'FMA features saved to {fma_csv_path}')

# Save the GTZAN features DataFrame
gtzan_features_df.to_csv(gtzan_csv_path, index=False)
print(f'GTZAN features saved to {gtzan_csv_path}')

FMA features saved to /content/drive/MyDrive/project_data_source/fma_features_df.csv
GTZAN features saved to /content/drive/MyDrive/project_data_source/gtzan_features_df.csv


In [80]:
# Add a 'source' column to each DataFrame to indicate the data source
features_df['source'] = 'fma'
gtzan_features_df['source'] = 'gtzan'

# Concatenate the two DataFrames along the rows (axis=0)
fma_gtzan_combined_features_df = pd.concat([features_df, gtzan_features_df], axis=0, ignore_index=True)

# Display the combined DataFrame
print(fma_gtzan_combined_features_df)

# Define the path to save the combined DataFrame to Google Drive
combined_csv_path = '/content/drive/MyDrive/project_data_source/fma_gtzan_combined_features_df.csv'

# Save the combined DataFrame to a CSV file in Google Drive
fma_gtzan_combined_features_df.to_csv(combined_csv_path, index=False)
print(f'Combined features DataFrame saved to {combined_csv_path}')

      spectral_centroids_mean  spectral_centroids_delta_mean  \
0                 2327.924921                      -0.299136   
1                 1490.787545                      -0.869474   
2                 2116.290527                      -0.023952   
3                 2577.717683                      -2.472239   
4                 2061.502272                       0.090942   
...                       ...                            ...   
9044              1388.632894                       0.586753   
9045              1538.512366                      -0.278298   
9046              1132.171811                      -0.258750   
9047              1236.310512                       0.102478   
9048              2310.352908                       0.313398   

      spectral_centroids_accelerate_mean  spectral_bandwidth_mean  \
0                              -0.120772              3164.002232   
1                               0.273862              2288.657213   
2                       