## Sample Rate Impact Assessment
---

### Objective:
- To assess the impact of different sample rates on the performance of the model.
- To determine the optimal sample rate for the model.

### Methodology:
- A Random Forest model is trained on each combination of feature set (Chroma, MFCC_30, MFCC_120, CQT_30, CQT_70) and sample rate (mixed, 4000 Hz). The features are used 'as extracted', without any additional processing.
- The model is trained with 80% of the data and tested with the remaining 20%.
  
### Findings:
- 

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Configure seaborn globally: "paper" plotting context with the font scale left at 1.
sns.set_theme(context="paper", font_scale=1)

### Set the Paths

In [6]:
# Dataset root and the labels file that lives directly inside it.
BASE_DIR = '../dataset/'
LABELS = f'{BASE_DIR}labels.csv'

# Feature files: root directory, then the 'raw/' subfolder used by this notebook.
FEATURES_BASE = '../features/'
FEATURES = f'{FEATURES_BASE}raw/'

### Load the Data and Train the Model

In [7]:
# --- Experiment configuration -------------------------------------------------
SEED = 42
full_data_dict_keys = ['artifacts', 'extrahls', 'murmurs', 'normals', 'extrastoles']
interval = 1
sample_rates = ['mix', 4000]
# num_feat[k] and type_[k] together describe the k-th feature set (e.g. 30 + 'mfcc').
num_feat = [30, 120, 12, 30, 70]
type_ = ['mfcc', 'mfcc', 'chroma', 'cqt', 'cqt']

# Results table: one row per sample rate, one column per feature set.
# Labels are derived from the configuration lists so they cannot drift apart;
# they evaluate to ['mfcc_30', 'mfcc_120', 'chroma_12', 'cqt_30', 'cqt_70']
# and ['mix', '4000'].
results = pd.DataFrame(
    columns=[f'{feat_type}_{n_feat}' for n_feat, feat_type in zip(num_feat, type_)],
    index=[str(sample_rate) for sample_rate in sample_rates],
)

for row, sample_rate in enumerate(sample_rates):
    # NOTE: the loop variables must not reuse the name `num_feat`. Rebinding the
    # list to an int made the second sample-rate pass fail with
    # "TypeError: 'int' object is not iterable".
    for col, (n_feat, feat_type) in enumerate(zip(num_feat, type_)):

        # define the features name and load the data
        FEATURES_NAME = f'full_data_{interval}s_{sample_rate}hz_{n_feat}{feat_type}.npy'
        # NOTE(review): allow_pickle=True unpickles the file contents, which can
        # execute arbitrary code -- only load feature files from a trusted source.
        # .item() unwraps the 0-d object array back into the stored Python object.
        full_data = np.load(FEATURES + FEATURES_NAME, allow_pickle=True).item()

        # extract the data from the dict: for every class, append the labels as
        # a last column to the feature matrix, then stack all classes row-wise
        data_list = []
        for key in full_data_dict_keys:
            class_X = full_data[key]['X']
            class_y = full_data[key]['y']
            data_list.append(
                np.concatenate((class_X, class_y.reshape(-1, 1)), axis=1)
            )
        full_data_array = np.concatenate(data_list, axis=0)

        # split the data into train (80%) and test (20%)
        X = full_data_array[:, :-1]
        y = full_data_array[:, -1]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=SEED
        )

        # train the model
        clf = RandomForestClassifier(random_state=SEED)
        clf.fit(X_train, y_train)

        # evaluate the model (mean accuracy on the held-out split)
        score = clf.score(X_test, y_test)

        # store the results
        results.iloc[row, col] = score

# save the results
results.to_csv('results.csv')

dict_keys(['artifacts', 'extrahls', 'murmurs', 'normals', 'extrastoles'])
