# The Mel Scale:
The human ear doesn't perceive sound frequency linearly: at high frequencies, we need larger differences to notice a change.
That's why we use the Mel scale: it compresses high frequencies and stretches low ones, approximating human sound perception thanks to its logarithmic factor.
$$Mel(f)=2595\cdot\log_{10}(1+\frac{f}{700}) \qquad\textrm{(approximately)}   $$

That's why, instead of using raw waveforms or spectrograms, we convert audio into **Mel spectrograms** or **MFCCs (Mel-Frequency Cepstral Coefficients).**

## Neurons

In [5]:
import math

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + e^-x) of the scalar ``x``.

    The computation is split by sign so that ``math.exp`` is only ever
    called with a non-positive argument; the naive form raises
    ``OverflowError`` for large negative ``x`` (e.g. ``x = -1000``).
    """
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    # For x < 0 use the algebraically equal form e^x / (1 + e^x).
    z = math.exp(x)
    return z / (1.0 + z)


def activate(inputs, weights):
    """Return the output of a single sigmoid neuron.

    The net input is the sum of pairwise products of ``inputs`` and
    ``weights`` (pairs beyond the shorter sequence are ignored by ``zip``);
    the result is that net input passed through ``sigmoid``.
    """
    net_input = 0
    for pair in zip(inputs, weights):
        # accumulate left to right so the floating-point result is unchanged
        net_input = net_input + pair[0] * pair[1]
    return sigmoid(net_input)



# Demo: one neuron with three inputs and one weight per input.
inputs=[.5,.3,.2]
weights=[.4,.7,.2]
# net input = .5*.4 + .3*.7 + .2*.2, then squashed by the sigmoid
output=activate(inputs,weights)
print(output)

0.610639233949222


### Multilayer Perceptron

In [61]:
import numpy as np
from random import random
#save activations
#implement backpropagation
#implement gradient descent
#implement train
#train our networks with some dummy dataset
#make some predictions


class MLP:
    """Fully connected feed-forward network with sigmoid activations.

    Weights are stored as one matrix per pair of consecutive layers (no bias
    terms). The network is trained one sample at a time: forward pass,
    backpropagation of the error ``target - output``, then a weight update.
    """

    def __init__(self, num_inputs=3, num_hidden=None, num_outputs=2):
        """Build the layer structure and allocate weights and buffers.

        Args:
            num_inputs: size of the input layer.
            num_hidden: sizes of the hidden layers, in order (any sequence
                of ints). Defaults to ``[3, 5]``.
            num_outputs: size of the output layer.
        """
        # A list literal as default would be one object shared by every
        # instance, so the default is materialised here instead.
        if num_hidden is None:
            num_hidden = [3, 5]

        self.num_inputs = num_inputs
        # copy so later mutation of the caller's sequence cannot affect us
        self.num_hidden = list(num_hidden)
        self.num_outputs = num_outputs

        layers = [self.num_inputs] + self.num_hidden + [self.num_outputs]
        layer_pairs = list(zip(layers[:-1], layers[1:]))

        # random initial weights, uniform in [0, 1), one matrix per layer pair
        self.weights = [np.random.rand(n_in, n_out) for n_in, n_out in layer_pairs]

        # activations[i] holds the output of layer i from the last forward pass
        self.activations = [np.zeros(size) for size in layers]

        # derivatives[i] has the same shape as weights[i]
        self.derivatives = [np.zeros((n_in, n_out)) for n_in, n_out in layer_pairs]

    def forward_propagate(self, inputs):
        """Run one sample through the network and return the output activations.

        The activations of every layer are saved in ``self.activations`` so
        that ``back_propagate`` can use them afterwards.
        """
        # back_propagate calls .reshape on the stored activations, so make
        # sure a plain list/tuple input becomes an ndarray.
        activations = np.asarray(inputs)
        self.activations[0] = activations

        for i, w in enumerate(self.weights):
            # net input of layer i+1: h_{i+1} = a_i . W_i
            net_inputs = np.dot(activations, w)
            # activation of layer i+1: a_{i+1} = s(h_{i+1})
            activations = self._sigmoid(net_inputs)
            self.activations[i + 1] = activations

        return activations

    def back_propagate(self, loss, verbose=False):
        """Propagate ``loss`` backwards and fill ``self.derivatives``.

        Must be called after ``forward_propagate`` for the same sample.

        Args:
            loss: error at the output layer (``train`` passes
                ``target - output``).
            verbose: when true, print each derivative matrix.

        Returns:
            The error propagated back to the input layer.
        """
        # dL/dW_i     = (y - a_[i+1]) s'(h_[i+1]) a_i
        # s'(h_[i+1]) = s(h_[i+1]) (1 - s(h_[i+1])),  with s(h_[i+1]) = a_[i+1]
        # dL/dW_[i-1] = (y - a_[i+1]) s'(h_[i+1]) W_i s'(h_i) a_[i-1]
        for i in reversed(range(len(self.derivatives))):
            activations = self.activations[i + 1]
            delta = loss * self._sigmoid_derivative(activations)
            # ndarray([.1, .2]) --> ndarray([[.1, .2]])  (row vector)
            delta_reshaped = delta.reshape(delta.shape[0], -1).T
            current_activations = self.activations[i]
            # ndarray([.1, .2]) --> ndarray([[.1], [.2]])  (column vector)
            current_activations_reshaped = current_activations.reshape(
                current_activations.shape[0], -1
            )

            # outer product: one entry per weight in W_i
            self.derivatives[i] = np.dot(current_activations_reshaped, delta_reshaped)

            # error seen by the previous layer
            loss = np.dot(delta, self.weights[i].T)

            if verbose:
                print("Derivatives for W{}: {}".format(i, self.derivatives[i]))

        return loss

    def gradient_descent(self, learning_rate):
        """Update every weight matrix in place from ``self.derivatives``.

        The derivatives are computed from ``target - output``, so they are
        added (not subtracted) to move the weights towards a smaller error.
        """
        for weights, derivatives in zip(self.weights, self.derivatives):
            # += mutates the ndarray stored in self.weights
            weights += derivatives * learning_rate

    def train(self, inputs, targets, epochs, learning_rate):
        """Train sample by sample and print the mean squared error per epoch.

        Args:
            inputs: sequence of input vectors.
            targets: sequence of target vectors, aligned with ``inputs``.
            epochs: number of passes over the data.
            learning_rate: step size passed to ``gradient_descent``.
        """
        for i in range(epochs):
            sum_loss = 0
            # 'sample' rather than 'input' to avoid shadowing the builtin
            for sample, target in zip(inputs, targets):
                # perform forward prop
                output = self.forward_propagate(sample)

                # calculate error
                loss = target - output

                # backpropagation
                self.back_propagate(loss)

                # apply gradient descent
                self.gradient_descent(learning_rate)

                sum_loss += self._mse(target, output)

            # report error
            print(f"Error: {sum_loss/len(inputs)} at epoch {i}")

    def _mse(self, target, output):
        """Return the mean squared error between ``target`` and ``output``."""
        return np.average((target - output) ** 2)

    def _sigmoid_derivative(self, x):
        """Return the sigmoid derivative given ``x`` = the sigmoid's output."""
        return x * (1.0 - x)

    def _sigmoid(self, x):
        """Return the element-wise logistic sigmoid of ``x``."""
        return 1 / (1 + np.exp(-x))


# Dummy dataset: 1000 pairs of numbers in [0, 0.5) and, as target, their sum.
inputs = np.array([[random() / 2, random() / 2] for _ in range(1000)])
# e.g. array([[0.1, 0.2], [0.3, 0.4]])
targets = np.array([[a + b] for a, b in inputs])
# e.g. array([[0.3], [0.7]])

# Network with 2 inputs, one hidden layer of 5 units and 1 output.
mlp = MLP(num_inputs=2, num_hidden=[5], num_outputs=1)

# 50 epochs at learning rate 0.1.
mlp.train(inputs, targets, epochs=50, learning_rate=0.1)

# A single unseen sample to try the trained network on.
input = np.array([0.3, 0.1])
target = np.array([0.4])

output = mlp.forward_propagate(input)

# two blank lines before the result
print("\n")
print(f"Our network believes that {input[0]}+{input[1]} is equal to {output}")

Error: 0.06221365630565667 at epoch 0
Error: 0.04560735361606417 at epoch 1
Error: 0.04495107587582315 at epoch 2
Error: 0.044112565769375806 at epoch 3
Error: 0.04301139031063045 at epoch 4
Error: 0.041562162514908775 at epoch 5
Error: 0.039688375829367496 at epoch 6
Error: 0.03734692422363062 at epoch 7
Error: 0.034557384281067687 at epoch 8
Error: 0.031419357973184035 at epoch 9
Error: 0.028099850378756706 at epoch 10
Error: 0.02479045820111521 at epoch 11
Error: 0.02165750095082367 at epoch 12
Error: 0.018811293502528095 at epoch 13
Error: 0.016302385430870914 at epoch 14
Error: 0.014135239613514837 at epoch 15
Error: 0.012286383601207315 at epoch 16
Error: 0.010719359039228842 at epoch 17
Error: 0.009394416240527616 at epoch 18
Error: 0.008273762473086501 at epoch 19
Error: 0.00732386990671433 at epoch 20
Error: 0.006516113243704141 at epoch 21
Error: 0.005826575917102505 at epoch 22
Error: 0.005235509386843163 at epoch 23
Error: 0.0047266987660795606 at epoch 24
Error: 0.00428685

### Preprocessor

In [None]:
import librosa
import math
import os
import json

# Root folder that is walked by save_mfcc.
# presumably one sub-folder per genre — verify against the dataset layout
DATASET_PATH=r"C:\Users\Ali Sobh\Desktop\Courses\Vipp 201\Data\genres_original"
# Output file that save_mfcc writes the extracted features to.
JSON_PATH=r"C:\Users\Ali Sobh\Desktop\Courses\Vipp 201\JSON_path\mfcc_data.json"
# Sample rate passed to librosa.load for every file.
SAMPLE_RATE=22050
DURATION=30 # track length in seconds used to size the segments
# Number of samples a track of DURATION seconds has at SAMPLE_RATE.
SAMPLES_PER_TRACK=SAMPLE_RATE*DURATION

def save_mfcc(dataset_path,json_path,n_mfcc=13,n_fft=2048,hop_length=512,num_segments=5):
    """Extract MFCCs for every audio file under ``dataset_path`` and save them as JSON.

    Each sub-directory found by ``os.walk`` is treated as one class: its
    folder name is appended to ``"mapping"`` and its position in that list is
    the label of every segment extracted from its files. Each file is cut
    into ``num_segments`` slices of ``SAMPLES_PER_TRACK / num_segments``
    samples; a slice is kept only if it yields the expected number of MFCC
    frames, so slices from tracks that are too short are dropped.

    Args:
        dataset_path: root directory to walk.
        json_path: file the resulting dictionary is written to.
        n_mfcc: number of MFCC coefficients per frame.
        n_fft: FFT window size passed to librosa.
        hop_length: hop between frames, in samples.
        num_segments: number of slices each track is cut into.

    Files that cannot be loaded are skipped with a warning and listed in a
    summary line at the end.
    """
    # dictionary to store data
    data = {
        "mapping": [],
        "mfcc": [],
        "labels": [],
    }

    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    # With librosa's default centred framing a signal of n samples gives
    # 1 + n // hop_length frames. This equals ceil(n / hop_length) except
    # when n is an exact multiple of hop_length, where ceil is one short
    # and every segment would be rejected.
    expected_num_mfcc_vector_per_segment = 1 + num_samples_per_segment // hop_length
    bad_files = []

    # loop through all the genres
    for dirpath, dirnames, filenames in os.walk(dataset_path):
        # sort in place so os.walk visits sub-folders in a reproducible
        # order and labels do not depend on filesystem listing order
        dirnames.sort()

        # skip the root (dataset) level; compare by value, not identity
        if dirpath == dataset_path:
            continue

        # save the semantic label: ".../genre/blues" -> "blues"
        semantic_label = os.path.basename(os.path.normpath(dirpath))
        data["mapping"].append(semantic_label)
        # label is the index of this folder in the mapping list
        label = len(data["mapping"]) - 1
        print(f"\nProcessing {semantic_label}")

        # process files for a specific genre
        for f in sorted(filenames):
            file_path = os.path.join(dirpath, f)

            try:
                y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
            except Exception as e:
                # best effort: one unreadable file must not abort the run
                print(f"Warning: couldn't load {file_path!r}: {type(e).__name__}: {e}")
                bad_files.append(file_path)
                continue

            # process segments extracting mfcc and storing data
            for s in range(num_segments):
                start_sample = num_samples_per_segment * s
                finish_sample = start_sample + num_samples_per_segment

                mfcc = librosa.feature.mfcc(y=y[start_sample:finish_sample],
                                            sr=sr,
                                            n_fft=n_fft,
                                            n_mfcc=n_mfcc,
                                            hop_length=hop_length)

                # transpose so the first axis is the frame index
                mfcc = mfcc.T

                # store mfcc for segment only if it has the expected length
                if len(mfcc) == expected_num_mfcc_vector_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)

    if bad_files:
        print(f"\nSkipped {len(bad_files)} unreadable file(s): {bad_files}")

    with open(json_path, "w", encoding="utf-8") as fp:
        json.dump(data, fp, indent=4)
        
        
        
# Run the extraction over the whole dataset, cutting each track into 10 segments.
save_mfcc(DATASET_PATH,JSON_PATH,num_segments=10)

    


Processing blues
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
seg

  y,sr=librosa.load(file_path,sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3
segment 4
segment 5
segment 6
segment 7
segment 8
segment 9
segment 10
segment 1
segment 2
segment 3


### Genre Classifier

In [1]:
import json
from sklearn.model_selection import train_test_split
import numpy as np

# JSON file read by load_data; relative to the current working directory.
# NOTE(review): presumably the file written by the preprocessor cell — confirm the location matches.
DATASET_PATH="mfcc_data.json"

def load_data(dataset_path):
    """Read the JSON file at ``dataset_path`` and return ``(inputs, targets)``.

    ``inputs`` is the ``"mfcc"`` entry and ``targets`` the ``"labels"``
    entry of the file, each converted to a numpy array.
    """
    with open(dataset_path, "r") as handle:
        payload = json.load(handle)

    # lists -> numpy arrays
    return np.array(payload["mfcc"]), np.array(payload["labels"])

#load data
inputs,targets=load_data(DATASET_PATH)
    
#split the data into train and test sets (30% held out for testing)
# NOTE(review): no random_state is passed, so the split differs on every run
inputs_train,inputs_test,targets_train,targets_test=train_test_split(inputs,
                                                                     targets,
                                                                     test_size=0.3)

#build the network architecture

#compile network

#train network




import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np


# Aliases for the train/test split produced above.
X_train= inputs_train
y_train= targets_train
X_test= inputs_test
y_test= targets_test

# Use the GPU when one is available, otherwise the CPU.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyper-parameters used by the loaders, the optimizer and the training loop.
batch_size=32
num_epochs=50
learning_rate=1e-4
# Size of the model's output layer.
# presumably one class per genre folder in the dataset — verify against the "mapping" list
num_classes=10  

# Flatten input size: product of the two trailing dimensions of each sample
# assumes X_train is 3-D (samples, frames, coefficients) — TODO confirm
input_dim=X_train.shape[1] * X_train.shape[2]

# create datasets & loaders (features as float32, labels as int64)
train_dataset=TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                              torch.tensor(y_train, dtype=torch.long))
# the held-out test split is used as the validation set
val_dataset=TensorDataset(torch.tensor(X_test,  dtype=torch.float32),
                              torch.tensor(y_test,  dtype=torch.long))

# only the training data is shuffled
train_loader=DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader=DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)



class MLP(nn.Module):
    """Flatten -> 512 -> 256 -> 64 -> ``num_classes`` perceptron with ReLU between layers.

    The final linear layer returns raw scores (no activation is applied).
    """

    def __init__(self, input_dim, num_classes):
        """Create the layer stack for flattened inputs of size ``input_dim``."""
        super().__init__()

        # flatten everything but the batch dimension, then stack the hidden layers
        stack = [nn.Flatten(start_dim=1)]
        fan_in = input_dim
        for width in (512, 256, 64):
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.ReLU(inplace=True))
            fan_in = width

        # output layer
        stack.append(nn.Linear(fan_in, num_classes))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        return self.net(x)

# Instantiate the network and move its parameters to the selected device.
model=MLP(input_dim=input_dim, num_classes=num_classes).to(device)
print(model)


# Count only the parameters that receive gradients.
n_params=sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {n_params:,}")

#Loss: cross-entropy computed from the model's raw output scores and integer class labels
criterion=nn.CrossEntropyLoss() 
#Optimizer: Adam over all model parameters
optimizer=optim.Adam(model.parameters(), 
                     lr=learning_rate)


# Training loop: one training pass and one validation pass per epoch.
# history collects the four per-epoch metrics, one list entry per epoch.
history = {name: [] for name in ("train_loss", "train_acc", "val_loss", "val_acc")}

for epoch in range(1, num_epochs + 1):
    # ---- training pass ----
    model.train()
    running_loss, running_correct, running_total = 0.0, 0, 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        # loss is a batch mean, so weight it by the batch size
        running_loss += loss.item() * xb.size(0)
        preds = torch.argmax(logits, dim=1)
        running_correct += preds.eq(yb).sum().item()
        running_total += xb.size(0)

    epoch_train_loss = running_loss / running_total
    epoch_train_acc = running_correct / running_total

    # ---- validation pass (no gradients, no weight updates) ----
    model.eval()
    val_loss, val_correct, val_total = 0.0, 0, 0

    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            loss = criterion(logits, yb)

            val_loss += loss.item() * xb.size(0)
            preds = torch.argmax(logits, dim=1)
            val_correct += preds.eq(yb).sum().item()
            val_total += xb.size(0)

    epoch_val_loss = val_loss / val_total
    epoch_val_acc = val_correct / val_total

    # ---- bookkeeping ----
    history["train_loss"].append(epoch_train_loss)
    history["train_acc"].append(epoch_train_acc)
    history["val_loss"].append(epoch_val_loss)
    history["val_acc"].append(epoch_val_acc)

    print(f"Epoch {epoch:03d}/{num_epochs} - "
          f"train_loss: {epoch_train_loss:.4f}, train_acc: {epoch_train_acc:.4f} - "
          f"val_loss: {epoch_val_loss:.4f}, val_acc: {epoch_val_acc:.4f}")


MLP(
  (net): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=1690, out_features=512, bias=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): ReLU(inplace=True)
    (5): Linear(in_features=256, out_features=64, bias=True)
    (6): ReLU(inplace=True)
    (7): Linear(in_features=64, out_features=10, bias=True)
  )
)
Trainable parameters: 1,014,218
Epoch 001/50 - train_loss: 1.6372, train_acc: 0.4222 - val_loss: 1.4148, val_acc: 0.5023
Epoch 002/50 - train_loss: 1.2668, train_acc: 0.5474 - val_loss: 1.3563, val_acc: 0.5080
Epoch 003/50 - train_loss: 1.0964, train_acc: 0.6090 - val_loss: 1.2058, val_acc: 0.5634
Epoch 004/50 - train_loss: 0.9681, train_acc: 0.6631 - val_loss: 1.2087, val_acc: 0.5674
Epoch 005/50 - train_loss: 0.8460, train_acc: 0.7046 - val_loss: 1.2198, val_acc: 0.5794
Epoch 006/50 - train_loss: 0.7441, train_acc: 0.7439 - val_loss: 1.2816, val_acc: 0.5484
Epoch 007/50 - train_loss: 0.6