In [83]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
import numpy as np
import os
import librosa
import csv

In [209]:
def extract_features(y, sr):
    """Build the flat feature vector for one audio signal.

    Parameters
    ----------
    y : audio samples handed to every ``get_*`` helper.
    sr : sampling rate forwarded to the helpers that accept it.

    Returns
    -------
    numpy.ndarray
        The eight (mean, variance) pairs, then the tempo, then the values
        returned by ``get_mfcc`` -- in that order.

    Side effect: the vector is also passed to ``write_features_to_csv``.
    """
    # (mean, variance) pairs, evaluated in the order the columns are laid out.
    stat_pairs = (
        get_chroma_stft(y, sr),
        get_rms(y),
        get_spectral_centroid(y, sr),
        get_spectral_bandwidth(y, sr),
        get_rolloff(y, sr),
        get_zero_crossing_rate(y),
        get_harmony(y),
        get_perceptr(y),
    )
    # NOTE(review): get_tempo is called without ``sr``, so it does not see
    # the sampling rate given to this function.
    tempo = get_tempo(y)
    mfcc_stats = get_mfcc(y, sr)

    flattened = [value for pair in stat_pairs for value in pair]
    features = np.array([*flattened, tempo, *mfcc_stats])
    write_features_to_csv(features)
    return features

In [210]:
def get_chroma_stft(y, sr):
    """Return ``(mean, variance)`` of ``librosa.feature.chroma_stft(y, sr)``.

    Both statistics are taken over every element of the returned array.
    """
    chromagram = librosa.feature.chroma_stft(y=y, sr=sr)
    return np.mean(chromagram), np.var(chromagram)

def get_rms(y):
    """Return ``(mean, variance)`` of ``librosa.feature.rms`` for ``y``."""
    rms_frames = librosa.feature.rms(y=y)
    return np.mean(rms_frames), np.var(rms_frames)

def get_spectral_centroid(y, sr):
    """Return ``(mean, variance)`` of the spectral centroid of ``y``.

    Only the first row of ``librosa.feature.spectral_centroid`` is used.
    """
    centroid_frames = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    return np.mean(centroid_frames), np.var(centroid_frames)

def get_spectral_bandwidth(y, sr):
    """Return ``(mean, variance)`` of ``librosa.feature.spectral_bandwidth``.

    Statistics are taken over the whole returned array.
    """
    bandwidth_frames = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    return np.mean(bandwidth_frames), np.var(bandwidth_frames)

def get_rolloff(y, sr):
    """Return ``(mean, variance)`` of the spectral roll-off of ``y``.

    Only the first row of ``librosa.feature.spectral_rolloff`` is used.
    """
    # NOTE(review): a constant 0.01 is added to every sample before the
    # roll-off is computed; the reason is not stated -- confirm it is intended.
    shifted = y + 0.01
    rolloff_frames = librosa.feature.spectral_rolloff(y=shifted, sr=sr)[0]
    return np.mean(rolloff_frames), np.var(rolloff_frames)

def get_zero_crossing_rate(y):
    """Return ``(mean, variance)`` of ``librosa.feature.zero_crossing_rate(y)``."""
    zcr_frames = librosa.feature.zero_crossing_rate(y)
    return np.mean(zcr_frames), np.var(zcr_frames)

def get_harmony(y):
    """Return ``(mean, variance)`` of ``librosa.effects.harmonic(y)``."""
    harmonic_part = librosa.effects.harmonic(y)
    return np.mean(harmonic_part), np.var(harmonic_part)

def get_perceptr(y):
    """Return ``(mean, variance)`` of ``librosa.effects.percussive(y)``."""
    percussive_part = librosa.effects.percussive(y)
    return np.mean(percussive_part), np.var(percussive_part)

def get_tempo(y, sr=22050):
    """Estimate the tempo of ``y`` from its percussive component.

    Parameters
    ----------
    y : audio samples.
    sr : sampling rate of ``y``. Previously this was read from a notebook
        global named ``sr``, which raised ``NameError`` (or silently used a
        stale value) when no such global matched the signal. It defaults to
        22050 so existing ``get_tempo(y)`` calls keep working; pass the real
        rate whenever the audio was not loaded at 22050 Hz.

    Returns
    -------
    float
        The tempo reported by ``librosa.beat.beat_track``.
    """
    _, y_percussive = librosa.effects.hpss(y)
    tempo, _ = librosa.beat.beat_track(y=y_percussive, sr=sr)
    # beat_track may hand back the tempo as a scalar or as a one-element
    # array depending on the librosa version; always return a plain float so
    # the value can be placed in a flat feature vector.
    return float(np.ravel(tempo)[0])

def get_mfcc(y, sr):
    """Return per-coefficient MFCC statistics for ``y``.

    Parameters
    ----------
    y : audio samples.
    sr : sampling rate of ``y``.

    Returns
    -------
    numpy.ndarray
        The vector produced by ``_format_mfcc``: for each of the ``n_mfcc``
        coefficients, its mean followed by its variance.
    """
    n_mfcc = 20  # number of MFCC coefficients to calculate

    # Pass n_mfcc explicitly: _format_mfcc indexes rows 0..n_mfcc-1, so the
    # number of rows requested here must match instead of relying on the
    # library default happening to be the same value.
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    return _format_mfcc(n_mfcc, mfccs)

def _format_mfcc(n_mfcc, mfccs):
    #print(f"mfccs={len(mfccs)}")
    #print(f"type(mfccs)= {type(mfccs)}")
    mfcc_features = []
    for i in range(n_mfcc):
        mfcc_features.append(np.mean(mfccs[i]))
        mfcc_features.append(np.var(mfccs[i]))
    mfcc_features = np.array(mfcc_features)
    return mfcc_features

In [211]:
# NOTE(review): `path` is not defined in any visible cell.
audio_path = f"{path}/dataset/songs/One_more_time.wav"
def extract_features_30s(audio_path):
    """Extract one feature row per 30-second segment of an audio file.

    Parameters
    ----------
    audio_path : location of the file handed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        One row per segment and 57 columns; shape ``(0, 57)`` when the file
        contains no samples.

    Side effects: ``extract_features`` passes every segment's row to
    ``write_features_to_csv``, and the stacked array is passed to
    ``write_features_to_csv`` once more at the end.
    """
    y, sr = librosa.load(audio_path)
    segment_length = 30  # seconds
    # Slice on integer sample offsets. Converting seconds -> samples through
    # floats (duration * sr) can truncate and drop the final sample.
    samples_per_segment = int(segment_length * sr)
    # Collect the rows first and stack once, rather than re-copying the
    # growing array with np.vstack on every iteration.
    segment_rows = [
        extract_features(y[start:start + samples_per_segment], sr)
        for start in range(0, len(y), samples_per_segment)
    ]
    features = np.vstack(segment_rows) if segment_rows else np.empty((0, 57))
    write_features_to_csv(features)
    return features

In [243]:
def write_features_to_csv(features, csv_path=None):
    """Write a header line plus one CSV row per feature vector.

    Parameters
    ----------
    features : a single feature vector (1-D) or a stack of vectors (2-D, one
        per row). The previous implementation used ``writerow`` for both, so
        a 2-D input was written as a single malformed row.
    csv_path : destination file. When omitted, the original hard-coded
        location ``{path}/dataset/songs/One_more_time.csv`` is used, which
        relies on the notebook-level ``path`` variable.

    The file is opened in ``'w'`` mode, so any previous content is replaced.
    """
    if csv_path is None:
        csv_path = f'{path}/dataset/songs/One_more_time.csv'

    # Column names: eight <name>_mean/<name>_var pairs, then tempo, then
    # mfcc1..mfcc20 mean/var pairs (57 columns in total).
    base_names = ['chroma_stft', 'rms', 'spectral_centroid',
                  'spectral_bandwidth', 'rolloff', 'zero_crossing_rate',
                  'harmony', 'perceptr']
    mfcc_names = [f'mfcc{index}' for index in range(1, 21)]
    header = [f'{name}_{stat}' for name in base_names for stat in ('mean', 'var')]
    header.append('tempo')
    header.extend(f'{name}_{stat}' for name in mfcc_names for stat in ('mean', 'var'))

    # Treat a lone vector as a one-row table so both shapes share one path.
    rows = np.atleast_2d(np.asarray(features))

    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        writer.writerows(rows)

In [249]:
# Same file location that write_features_to_csv() writes to by default;
# prediction() is later called with this path to read the features back.
csv_path = f'{path}/dataset/songs/One_more_time.csv'

def prediction(csv_path, min_max_scaler, model=None):
    """Predict labels for the feature rows stored in a CSV file.

    Parameters
    ----------
    csv_path : CSV file read with ``pandas.read_csv``.
    min_max_scaler : object whose ``transform`` is applied to the loaded
        frame before prediction.
    model : object with a ``predict`` method. When omitted, the
        notebook-level ``svm`` classifier (created in a later cell) is used,
        which matches the previous behaviour.

    Returns
    -------
    The value returned by ``model.predict``. It is also printed, as before;
    previously nothing was returned, so callers could not use the result.
    """
    if model is None:
        model = svm
    X = pd.read_csv(csv_path)
    features = min_max_scaler.transform(X)
    preds = model.predict(features)
    print(preds)
    return preds

In [250]:
# NOTE(review): `x` is not assigned in any visible cell; this cell only
# displays it, so it fails on a fresh kernel unless `x` is defined elsewhere.
x

{'tempo': 92.28515625,
 'chroma_stft': 0.37853348,
 'rmse': 0.17617546,
 'spectral_centroid': 2702.3643469585845,
 'spectral_bandwidth': 2606.7368250656746,
 'rolloff': 5517.7401719316495,
 'zero_crossing_rates': 0.1363990228795869,
 'mfcc1': -64.13336,
 'mfcc2': 78.183075,
 'mfcc3': -3.088391,
 'mfcc4': 12.933981,
 'mfcc5': -2.950808,
 'mfcc6': 6.233242,
 'mfcc7': -2.5770497,
 'mfcc8': -2.1206977,
 'mfcc9': -2.009457,
 'mfcc10': 1.5667692,
 'mfcc11': -7.5950356,
 'mfcc12': 1.4802297,
 'mfcc13': -2.7571547,
 'mfcc14': -1.3600029,
 'mfcc15': -3.16765,
 'mfcc16': -2.221978,
 'mfcc17': -2.5267854,
 'mfcc18': 2.1491597,
 'mfcc19': -0.3861255,
 'mfcc20': 1.7829368}

In [251]:
# Load the pre-computed feature table (features_3_sec.csv) that the
# classifier below is trained on, and preview its first rows.
# NOTE(review): `path` is not defined in any visible cell.
data = pd.read_csv(f"{path}/dataset/features_3_sec.csv")
data.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [252]:
# Drop the two leading non-feature columns shown in the preview above.
# Dropping by name (and ignoring already-missing columns) keeps this cell
# idempotent: the positional slice `data.iloc[0:, 2:]` removed two more
# columns every time the cell was re-run.
data = data.drop(columns=['filename', 'length'], errors='ignore')
data.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,3714.560359,1080790.0,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,3869.682242,672244.8,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,3997.63916,790712.7,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,3568.300218,921652.4,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,3469.992864,610211.1,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [253]:
# Split the table into the target (`label` column) and the feature matrix
# (every other column).
# NOTE(review): `y` is also used for audio samples in the librosa.load cells;
# running cells out of order will overwrite one with the other.
y = data['label']
X = data.loc[:, data.columns != 'label']
cols = X.columns  # not referenced again in the visible cells


In [254]:
# Hold out part of the data for evaluation; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [255]:
# Fit the scaler on the training split only, then apply that same fitted
# scaling to the test split.
min_max_scaler = preprocessing.MinMaxScaler()
X_train_scaled = min_max_scaler.fit_transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)

In [256]:
# Train a linear-kernel SVC (C=10) on the scaled training data and report
# accuracy on the held-out split. `svm` is reused by the prediction cells.
svm = SVC(kernel = 'linear', C=10)
svm.fit(X_train_scaled,y_train)
preds = svm.predict(X_test_scaled)
accuracy_score(y_test, preds)

0.7510008006405124

In [176]:
# Load one song and compute its feature vector.
# NOTE(review): this rebinds `y` (the label Series in the training cells) to
# audio samples, and extract_features() also writes the vector to CSV.
audio_path = f"{path}/dataset/songs/One_more_time.wav"
#audio_path = f"{path}/dataset/songs/Shape_of_You.mp3"
#audio_path = f"{path}/dataset/songs/blues.wav"
y, sr = librosa.load(audio_path)
features = extract_features(y, sr)

In [177]:
# Show the raw (unscaled) feature vector.
print(features)

[ 3.78533483e-01  8.88253972e-02  1.76175460e-01  7.74980010e-03
  2.70236435e+03  9.30865965e+05  2.60673683e+03  2.11175803e+05
  5.51774017e+03  3.12873869e+06  1.36399023e-01  7.85133303e-03
  6.82138932e-07  1.57421958e-02  3.13425262e-05  1.21626044e-02
  9.22851562e+01 -6.41333618e+01  1.00741553e+04  7.81830750e+01
  1.10429858e+03 -3.08839107e+00  5.30809631e+02  1.29339809e+01
  3.28720337e+02 -2.95080805e+00  2.01351410e+02  6.23324203e+00
  2.19855698e+02 -2.57704973e+00  1.34776443e+02 -2.12069774e+00
  1.34806717e+02 -2.00945711e+00  1.24362038e+02  1.56676924e+00
  9.58758240e+01 -7.59503555e+00  8.42693024e+01  1.48022974e+00
  6.43318634e+01 -2.75715470e+00  6.20210533e+01 -1.36000288e+00
  5.26567154e+01 -3.16764998e+00  5.15809059e+01 -2.22197795e+00
  4.42247276e+01 -2.52678537e+00  5.46556129e+01  2.14915967e+00
  5.78212128e+01 -3.86125505e-01  6.14318962e+01  1.78293681e+00
  5.85684929e+01]


In [180]:
# Reshape the single vector into a one-row matrix, scale it with the fitted
# scaler and classify it. The result goes into a new name so `features` stays
# raw: overwriting it in place meant re-running this cell scaled it twice.
features_scaled = min_max_scaler.transform(features.reshape(1, -1))
svm.predict(features_scaled)[0]



'pop'

In [188]:
# Same song as above but loaded from the .mp3 file, to compare features.
# extract_features() also overwrites the CSV as a side effect.
audio_path = f"{path}/dataset/songs/One_more_time.mp3"
#audio_path = f"{path}/dataset/songs/Shape_of_You.mp3"
#audio_path = f"{path}/dataset/songs/blues.wav"
y, sr = librosa.load(audio_path)
features2 = extract_features(y, sr)

In [189]:
# Show the raw (unscaled) feature vector of the .mp3 version.
print(features2)

[ 4.01391745e-01  9.04501975e-02  1.28638044e-01  5.17222285e-03
  2.87338514e+03  9.95404319e+05  2.61698725e+03  2.53957967e+05
  5.75077118e+03  2.95144606e+06  1.52879817e-01  9.91195340e-03
 -7.34094065e-05  9.18927323e-03 -2.61925830e-04  6.59426441e-03
  9.22851562e+01 -1.02868195e+02  1.44051895e+04  6.58814545e+01
  1.41450195e+03 -3.96973491e+00  5.38295044e+02  1.57871037e+01
  3.15022888e+02 -9.38173950e-01  1.98583679e+02  9.64153767e+00
  2.05775055e+02 -5.33809602e-01  1.36352570e+02  1.69455600e+00
  1.32618713e+02  4.71090004e-02  1.17268860e+02  3.26166868e+00
  9.29001389e+01 -4.86849546e+00  8.25630341e+01  3.22355700e+00
  6.14592857e+01 -9.94409919e-01  6.13182449e+01  1.50860101e-01
  5.15999260e+01 -1.72866750e+00  4.96832314e+01 -7.00210333e-01
  4.36195641e+01 -1.23700511e+00  5.36341896e+01  2.84968185e+00
  5.47892838e+01 -9.16692093e-02  6.66464691e+01  2.65445781e+00
  5.79014549e+01]


In [190]:
# Scale the one-row matrix with the fitted scaler and classify it. A separate
# name keeps `features2` raw, so re-running this cell no longer applies the
# scaler to already-scaled values.
features2_scaled = min_max_scaler.transform(features2.reshape(1, -1))
svm.predict(features2_scaled)



array(['pop'], dtype=object)

In [261]:
# Load a different song and compute its features. extract_features() writes
# the vector to the default CSV location (One_more_time.csv), so that file
# now holds this song's features despite its name.
#audio_path = f"{path}/dataset/songs/blues.wav"
audio_path = f"{path}/dataset/songs/Shape_of_You.mp3"
#audio_path = f"{path}/dataset/songs/blues.wav"
y, sr = librosa.load(audio_path)
features = extract_features(y, sr)

In [262]:
# Classify the feature row most recently written to csv_path.
prediction(csv_path, min_max_scaler)

['hiphop']
