In [4]:
import os
import librosa
import pandas as pd

def extract_features(file_path, n_mfcc=13):
    """Summarise an audio file as a vector of time-averaged MFCCs.

    Parameters
    ----------
    file_path : str or path-like
        Audio file to read; passed unchanged to ``librosa.load``.
    n_mfcc : int, default 13
        Number of MFCC coefficients to compute. The default reproduces the
        previously hard-coded setting, so existing callers are unaffected.

    Returns
    -------
    numpy.ndarray
        The MFCC matrix averaged over axis 1, i.e. one mean value per
        coefficient.
    """
    # librosa.load is called with its defaults so every file is decoded
    # with the same settings.
    audio, sr = librosa.load(file_path)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Averaging over axis 1 gives a fixed-length vector regardless of how
    # many columns the MFCC matrix has.
    return mfccs.mean(axis=1)

# Directory path where the dataset is located: one sub-folder per class,
# each containing that class's audio files.
dataset_path = '../data/audio'

# Parallel lists: data[i] is the feature vector for the clip labelled labels[i]
data = []
labels = []

# os.listdir returns entries in arbitrary order; sorting gives a reproducible
# row order, which in turn keeps the seeded train/test split stable.
for class_folder in sorted(os.listdir(dataset_path)):
    class_path = os.path.join(dataset_path, class_folder)

    # Only visible directories are classes. Stray files (e.g. .DS_Store) or
    # hidden folders (e.g. .ipynb_checkpoints) would otherwise crash the
    # inner listing or be treated as a bogus class.
    if class_folder.startswith('.') or not os.path.isdir(class_path):
        continue

    for audio_file in sorted(os.listdir(class_path)):
        audio_path = os.path.join(class_path, audio_file)

        # Skip hidden files and nested directories; neither is an audio clip.
        if audio_file.startswith('.') or not os.path.isfile(audio_path):
            continue

        # The folder name is the class label for every clip inside it.
        data.append(extract_features(audio_path))
        labels.append(class_folder)

# Fail with a clear message instead of an IndexError on data[0] below.
if not data:
    raise ValueError(f"No audio files found under {dataset_path!r}")

# One row per clip, one column per MFCC coefficient, plus the class label.
df = pd.DataFrame(data, columns=[f'mfcc_{i}' for i in range(len(data[0]))])
df['label'] = labels


In [5]:
# Preview the first rows of the feature table to sanity-check the MFCC
# columns and the label column before modelling.
df.head()

Unnamed: 0,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,label
0,-360.230408,87.481895,46.414539,24.77965,18.021656,21.886494,3.408688,5.293712,5.115957,-6.162708,-2.310038,-1.076668,-4.642399,news
1,-356.726532,68.121689,-3.132943,20.798172,4.997914,-17.470881,0.045309,5.603902,-25.335793,-8.190118,-11.27253,-9.935472,0.809734,news
2,-1131.37085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,news
3,-314.654541,76.141998,16.070663,9.617281,0.760552,-2.055876,9.702311,-11.428221,-13.339287,-3.256329,-15.668971,0.882981,-11.925661,news
4,-350.341522,60.137329,-1.744141,17.218365,-19.416693,-5.951996,2.844842,-13.331096,-7.217333,-2.352494,-14.687024,-2.680267,-14.142367,news


In [6]:
from sklearn.model_selection import train_test_split

# 'label' is the prediction target; every other column of df is a model input.
y = df['label']
X = df.drop('label', axis=1)

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# repeatable for a given row order.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


# Create a pipeline with a scaler (StandardScaler) followed by a classifier
# (Random Forest). The fixed random_state seeds the forest's bootstrap and
# feature sampling so the model, and the accuracy printed below, are
# reproducible across kernel restarts.
model = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=100, random_state=42),
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the held-out test set
predictions = model.predict(X_test)

# Evaluate the model: fraction of test clips whose predicted label is correct
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8571428571428571


In [11]:
import joblib

# Persist the pipeline that was already fitted and evaluated above.
# Re-fitting here is redundant, and unless the forest is seeded it would
# produce a different model from the one whose accuracy was reported.
tabular_model = model

# Make sure the target directory exists so the dump does not fail on a
# fresh checkout.
model_path = '../app/models/tabular_model.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Save the trained model to a file (joblib.dump returns the written paths,
# which the notebook displays as the cell output).
joblib.dump(tabular_model, model_path)

['../app/models/tabular_model.joblib']

In [13]:
# Classify a single clip with the persisted model.
# NOTE(review): this clip lives under the training data directory, so the
# model may have seen it during fitting; use an unseen recording to judge
# how well the model generalises.
sample_path = '../data/audio/today/mp3-output-ttsfree(dot)com (1).mp3'

# Reuse the training-time extractor rather than repeating the MFCC code, so
# inference features cannot drift from what the model was fitted on.
mfccs_mean = [extract_features(sample_path)]

# Single-row frame with the same mfcc_<i> column names used for training.
features = pd.DataFrame(mfccs_mean, columns=[f'mfcc_{i}' for i in range(len(mfccs_mean[0]))])

# Load the artifact written by the save cell. joblib.load unpickles the
# file, so only load model files from a trusted source.
loaded_tabular_model = joblib.load('../app/models/tabular_model.joblib')

predictions = loaded_tabular_model.predict(features)

# predict returns one label per input row; show the label for our one clip.
predictions[0]

'today'

: 