In [1]:
import os
import numpy as np
import librosa
import tensorflow as tf 


In [2]:
# Set CUDA_VISIBLE_DEVICES to "-1" so that no CUDA GPU is exposed to this process
# (the intent appears to be CPU-only execution).
# NOTE(review): this cell runs after `import tensorflow`; presumably it still takes
# effect because no TensorFlow op has executed yet — confirm, or set it before the import.
os.environ["CUDA_VISIBLE_DEVICES"]="-1"    


In [3]:
# Locate the dataset folder relative to the directory the notebook was started from.
current, audio = os.getcwd(), 'Audio_Files'
working_dir = os.path.join(current, audio)

In [4]:
# Collect the full path of every file under `working_dir`, recursing into sub-folders.
# os.walk yields entries in whatever order the filesystem returns them, so the list is
# sorted: the file order — and everything derived from it downstream (labels, features,
# the seeded train/test split) — is then identical across machines and runs.
file_paths = sorted(
    os.path.join(folder, name)
    for folder, _, names in os.walk(working_dir)
    for name in names
)
        

In [5]:
# One marker per matching file: a 1 for every path containing "Major", and a 0 for
# every remaining path containing "Minor". Paths matching neither get no marker.
major = [1 for path in file_paths if "Major" in path]
minor = [0 for path in file_paths if "Major" not in path and "Minor" in path]

In [6]:
# Build the labels in the SAME order as `file_paths` (1 = "Major", 0 = "Minor"), because
# the features are computed by iterating `file_paths` and are paired with the labels by
# position. Concatenating the per-class lists would give all the 1s followed by all the
# 0s, which lines up with the features only if every Major file happens to come before
# every Minor file in `file_paths`.
# The matching rules mirror the labelling loop: "Major" wins, unmatched paths get no label.
target = np.array(
    [
        1 if "Major" in path else 0
        for path in file_paths
        if "Major" in path or "Minor" in path
    ]
)

In [7]:
def wav(file):
    """Load an audio file with librosa at a 16 kHz sample rate.

    Args:
        file: Path of the audio file to read.

    Returns:
        The sample data returned by ``librosa.load``; the sample rate that
        librosa returns alongside it is discarded.
    """
    return librosa.load(file, sr=16000)[0]

In [8]:
# Sanity check: load the second collected file and display its raw sample array.
wav(file_paths[1])

array([ 2.4853573e-08, -2.2934969e-08,  2.1035119e-08, ...,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00], dtype=float32)

In [9]:
def preprocess(audio_file, label, target_len=48000, frame_length=320, frame_step=32):
    """Turn one audio file into a fixed-size magnitude spectrogram.

    The clip is loaded with ``wav`` (16 kHz), cut to at most ``target_len`` samples,
    zero-padded on the right up to exactly ``target_len`` samples, and then passed
    through a short-time Fourier transform.

    Args:
        audio_file: Path of the audio file to load.
        label: Returned unchanged alongside the spectrogram.
        target_len: Number of samples every clip is truncated/padded to. The default
            of 48000 corresponds to 3 seconds at the 16 kHz rate used by ``wav``.
        frame_length: STFT window length in samples.
        frame_step: STFT hop size in samples.

    Returns:
        A ``(spectrogram, label)`` tuple. ``spectrogram`` is a float32 tensor holding
        the STFT magnitudes with a trailing channel axis of size 1; with the default
        arguments its shape is (1491, 257, 1). Changing any of the keyword arguments
        changes that shape, so a model built for the default shape needs a matching
        ``input_shape``.
    """
    audio = wav(audio_file)
    fixed_length = audio[:target_len]
    # Zero-pad clips shorter than target_len; for longer clips the pad length is 0.
    zeros_needed = tf.zeros([target_len - tf.shape(fixed_length)[0]], dtype=tf.float32)

    fixed_length = tf.concat([fixed_length, zeros_needed], 0)
    spectrogram = tf.signal.stft(fixed_length, frame_length=frame_length, frame_step=frame_step)
    # Keep only the magnitude of the complex STFT output.
    spectrogram = tf.abs(spectrogram)
    # Append a channel axis so the result can be fed to Conv2D layers.
    spectrogram = tf.expand_dims(spectrogram, axis=2)

    return spectrogram, label

In [10]:
# Smoke-test the preprocessing on a single file; the label argument (1) is simply
# returned unchanged by preprocess(). The bare expression displays the tensor.
spectrogram, label = preprocess(file_paths[1], 1)
spectrogram

<tf.Tensor: shape=(1491, 257, 1), dtype=float32, numpy=
array([[[7.5837564e-01],
        [6.8262088e-01],
        [4.8948887e-01],
        ...,
        [1.6071854e-06],
        [1.4305891e-06],
        [4.7683716e-07]],

       [[9.6100295e-01],
        [8.3555573e-01],
        [5.3843743e-01],
        ...,
        [1.7393556e-07],
        [1.7721735e-06],
        [3.5762787e-07]],

       [[1.0914071e+00],
        [9.0493745e-01],
        [5.0364572e-01],
        ...,
        [1.2846149e-06],
        [1.3102858e-06],
        [1.0132790e-06]],

       ...,

       [[0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        ...,
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00]],

       [[0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        ...,
        [0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00]],

       [[0.0000000e+00],
        [0.0000000e+00],
        [0.0000000e+00],
        ...,
        [0.0000000

In [11]:
# Compute one spectrogram per file, in `file_paths` order. preprocess() only echoes its
# label argument back, so a placeholder is passed and the echo is discarded instead of
# feeding it the loop index as a fake label; the labels themselves are held in `target`.
features = [preprocess(path, None)[0] for path in file_paths]

In [12]:
# Stack the per-file spectrogram tensors into a single NumPy array with a leading
# per-file axis.
# NOTE(review): with the 859 files and the (1491, 257, 1) float32 spectrograms shown in
# the cell outputs, this array is roughly 1.3 GB — confirm that fits in memory.
features = np.array(features)

In [13]:
# Sanity check: the label count and the feature count (printed one per line) must match.
print(len(target), len(features), sep="\n")

859
859


In [14]:
from sklearn.model_selection import train_test_split


# Split the data into training and testing sets (80% training, 20% testing).
# `features` and `target` are already NumPy arrays, and train_test_split returns NumPy
# arrays for NumPy inputs, so the four parts are used as-is: wrapping them in np.array()
# again would only duplicate the large spectrogram arrays in memory.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Print the type and the training-set size to verify the split
print(type(X_train))
print(len(y_train))

<class 'numpy.ndarray'>
687


In [15]:
# Display the training features (NumPy abbreviates the printed array with "...").
X_train

array([[[[3.1549495e-01],
         [6.5963823e-01],
         [6.1785799e-01],
         ...,
         [9.1588217e-06],
         [7.5227440e-06],
         [6.2584877e-06]],

        [[6.8261616e-02],
         [2.9016855e-01],
         [4.3606080e-02],
         ...,
         [8.9657539e-07],
         [1.0808222e-06],
         [4.2468309e-07]],

        [[2.4150720e-01],
         [1.6682361e-01],
         [1.5326993e-01],
         ...,
         [2.7177390e-07],
         [5.6864087e-07],
         [5.3644180e-07]],

        ...,

        [[0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         ...,
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00]],

        [[0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         ...,
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00]],

        [[0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         ...,
         [0.0000000e+00],
        

In [16]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the CNN model: three Conv2D -> MaxPooling2D stages (32, 64 and 128 filters)
# followed by a small dense classifier head.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(1491, 257, 1)))
model.add(layers.MaxPooling2D((2, 2)))
for filters in (64, 128):
    model.add(layers.Conv2D(filters, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
# Binary classification, so a single sigmoid output unit
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model; binary cross-entropy matches the single sigmoid output
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print model summary to see the architecture
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 1489, 255, 32)     320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 744, 127, 32)      0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 742, 125, 64)      18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 371, 62, 64)       0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 369, 60, 128)      73856     
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 184, 30, 128)      0

In [None]:
# Train for 10 epochs in batches of 32. The held-out test split doubles as the
# validation data that Keras evaluates after every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
)

Epoch 1/10
