In [1]:
# Imports

import gc
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load the training metadata CSV and narrow it down to the rows used below
file_path = '/kaggle/input/common-voice/cv-valid-train.csv'
valid_accents = ['african', 'england', 'indian', 'newzealand', 'scotland', 'us']

dftrain = (
    pd.read_csv(file_path)
    # keep only rows that carry a gender label
    .dropna(subset=['gender'])
    # 'duration' is not used by anything further down in this notebook
    .drop(columns='duration')
    # restrict to the accents of interest
    .loc[lambda frame: frame['accent'].isin(valid_accents)]
    # discard any row that still lacks an accent label
    .dropna(subset=['accent'])
)

# Helper used to cap the number of rows per accent by random under-sampling
from numpy.random import choice

def drop_random_samples(df, accent, n, random_state=None):
    """Return a copy of ``df`` with ``n`` randomly chosen rows of ``accent`` removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'accent' column. It is not modified.
    accent : str
        Value of the 'accent' column whose rows are candidates for removal.
    n : int
        Number of rows to remove.
    random_state : int or None, optional
        Seed for a private ``numpy.random.RandomState`` so the selection can be
        reproduced. With the default ``None`` the global NumPy generator is
        used, which is what this helper did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        New frame without the selected rows.

    Raises
    ------
    ValueError
        If ``n`` is negative or larger than the number of rows with ``accent``.
    """
    indices = df.loc[df['accent'] == accent].index
    if n < 0 or n > len(indices):
        raise ValueError(
            f"cannot drop {n} rows for accent {accent!r}: {len(indices)} available"
        )
    if n == 0:
        # Nothing to remove; still hand back a new object like DataFrame.drop does.
        return df.copy()

    if random_state is None:
        sampler = choice
    else:
        sampler = np.random.RandomState(random_state).choice
    # replace=False guarantees n distinct index labels
    drop_indices = sampler(indices, size=n, replace=False)
    return df.drop(drop_indices)

# Cap every accent at SAMPLES_PER_ACCENT rows by random under-sampling.
# Accents that already have that many rows or fewer are left untouched.
SAMPLES_PER_ACCENT = 1000

# Seed NumPy's global generator so the under-sampling removes the same rows on
# every Restart & Run All; 42 matches the seed used for the train/test split.
np.random.seed(42)

accent_counts = dftrain['accent'].value_counts()
for accent, current_count in accent_counts.items():
    if current_count > SAMPLES_PER_ACCENT:
        dftrain = drop_random_samples(dftrain, accent, current_count - SAMPLES_PER_ACCENT)

# Per-accent counts after capping ('us' is still included at this point)
print(dftrain['accent'].value_counts())

# Drop 'us' accent
dftrain = dftrain[dftrain['accent'] != 'us']

# Balance gender: drop excess male entries to balance with female count.
# Only 'male' and 'female' rows are carried over; any other gender value is
# left out by the concat below.
gender_counts = dftrain['gender'].value_counts()
df_male = dftrain[dftrain['gender'] == 'male']
df_female = dftrain[dftrain['gender'] == 'female']

# Clamp at zero: a negative value would make the slice count from the end and
# remove male rows even though there is no male surplus.
excess_male_count = max(len(df_male) - len(df_female), 0)
# The surplus is removed from the top of the frame, in current row order
# (positional slicing, so exactly excess_male_count rows go).
df_male = df_male.iloc[excess_male_count:]

dftrain = pd.concat([df_male, df_female])

# Turn accent names into integer class ids; the fitted encoder is kept so the
# ids can be mapped back to names when reporting results.
label_encoder = LabelEncoder()
label_encoder.fit(dftrain['accent'])
dftrain['accent_encoded'] = label_encoder.transform(dftrain['accent'])

# MFCC feature extraction
AUDIO_FOLDER = '/kaggle/input/common-voice/cv-valid-train'

def extract_mfcc(file_path, n_mfcc=20, hop_length=256, n_fft=4096):
    """Load one audio file and compute its MFCC matrix.

    The file is read with ``librosa.load`` using that function's defaults, and
    the sample rate it reports is forwarded to ``librosa.feature.mfcc``.

    Parameters
    ----------
    file_path : str
        Path of the audio file to read.
    n_mfcc, hop_length, n_fft : int
        Passed through unchanged to ``librosa.feature.mfcc``.

    Returns
    -------
    The value returned by ``librosa.feature.mfcc``; later code in this
    notebook reads ``shape[1]`` of it as the number of time frames.
    """
    waveform, sample_rate = librosa.load(file_path)
    return librosa.feature.mfcc(
        y=waveform,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        hop_length=hop_length,
        n_fft=n_fft,
    )

def generate_mfcc_batches(df, batch_size=32):
    """Yield MFCC results for ``df``, one chunk of rows at a time.

    Rows are consumed in order in chunks of ``batch_size``. For each row the
    'filename' value is joined onto ``AUDIO_FOLDER`` and handed to
    ``extract_mfcc``.

    Yields
    ------
    list
        One ``extract_mfcc`` result per row of the chunk, in row order. The
        final chunk may hold fewer than ``batch_size`` entries.
    """
    n_rows = len(df)
    for offset in range(0, n_rows, batch_size):
        chunk = df.iloc[offset:offset + batch_size]
        yield [
            extract_mfcc(os.path.join(AUDIO_FOLDER, name))
            for name in chunk['filename']
        ]

# Collect the MFCC matrix of every row, flattening the per-batch lists
all_mfccs = [mfcc for batch in generate_mfcc_batches(dftrain) for mfcc in batch]

# Largest size along axis 1 across all clips; stays 0 if nothing was extracted
max_time_frames = max((mfcc.shape[1] for mfcc in all_mfccs), default=0)

gc.collect()

# Zero-pad each matrix at the end of axis 1 so all share max_time_frames
# columns (a clip that is already the widest gets a pad width of 0).
padded_mfccs = [
    np.pad(mfcc, ((0, 0), (0, max_time_frames - mfcc.shape[1])), mode='constant')
    for mfcc in all_mfccs
]

# Features: one padded matrix per row of dftrain, stacked along a new axis 0.
# Targets: one-hot rows built from the encoded accent ids, in the same order.
X = np.stack(padded_mfccs)
y = to_categorical(dftrain['accent_encoded'].values)

# Train-test split: 20% held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/common-voice/cv-valid-train.csv'

In [None]:
# Build CNN model
# Requires `l2` from keras.regularizers (imported with the other Keras names
# at the top of the notebook); without it this cell stops with a NameError.
input_shape = (X.shape[1], X.shape[2], 1)

# L2 coefficient applied to the kernels of every regularized layer.
# NOTE(review): 0.1 is a strong penalty for this many layers — confirm it is intended.
L2_STRENGTH = 0.1

model = Sequential([
    # Convolutional feature extractor: five 3x3 conv stages, each followed by
    # batch normalization; the first four are also followed by 2x2 max pooling.
    # The first stage uses the default padding, the rest use padding='same'.
    Conv2D(32, (3, 3), activation='relu', input_shape=input_shape, kernel_regularizer=l2(L2_STRENGTH)),
    BatchNormalization(),
    MaxPooling2D((2, 2)),

    Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(L2_STRENGTH)),
    BatchNormalization(),
    MaxPooling2D((2, 2)),

    Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(L2_STRENGTH)),
    BatchNormalization(),
    MaxPooling2D((2, 2)),

    Conv2D(256, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(L2_STRENGTH)),
    BatchNormalization(),
    MaxPooling2D((2, 2)),

    Conv2D(256, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(L2_STRENGTH)),
    BatchNormalization(),

    # Classifier head: four regularized dense layers with dropout, then a
    # softmax with one unit per column of the one-hot target matrix.
    Flatten(),
    Dense(512, activation='relu', kernel_regularizer=l2(L2_STRENGTH)),
    Dropout(0.5),
    Dense(256, activation='relu', kernel_regularizer=l2(L2_STRENGTH)),
    Dropout(0.4),
    Dense(128, activation='relu', kernel_regularizer=l2(L2_STRENGTH)),
    Dropout(0.4),
    Dense(64, activation='relu', kernel_regularizer=l2(L2_STRENGTH)),
    Dropout(0.4),
    Dense(y.shape[1], activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=1e-4), metrics=['accuracy'])
model.summary()



In [None]:
# Train model
# Append the trailing channel axis once instead of reshaping at every call
train_inputs = X_train.reshape(X_train.shape[:3] + (1,))
test_inputs = X_test.reshape(X_test.shape[:3] + (1,))

# The held-out split doubles as the validation set during training
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(test_inputs, y_test),
    epochs=60,
    batch_size=32,
)

# Evaluate model: index 1 of the evaluate() result is reported as the accuracy
evaluation = model.evaluate(test_inputs, y_test)
accuracy = evaluation[1]
print("Final Accuracy on Test Set:", accuracy)

# Confusion Matrix
y_pred = model.predict(X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1))
# Collapse class scores / one-hot rows to integer class ids
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test, axis=1)

# Pin the label set to the encoder's classes instead of letting scikit-learn
# infer it from the data. If a class never shows up in the test split or the
# predictions, the inferred set is shorter than the list of names: the report
# then raises and the heatmap tick labels no longer line up with the matrix.
class_names = label_encoder.classes_
class_ids = np.arange(len(class_names))

print(classification_report(y_true_labels, y_pred_labels, labels=class_ids, target_names=class_names))

cm = confusion_matrix(y_true_labels, y_pred_labels, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Accuracy & Loss plots: one figure per metric, training curve vs. validation curve
for metric_key, metric_name in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric_key], label='Train')
    plt.plot(history.history['val_' + metric_key], label='Validation')
    plt.title('Model ' + metric_name)
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    # show() ends the current figure so the next metric starts on a fresh one
    plt.show()

# Persist the trained model to the working directory
saved_model_path = 'model_mfcc.h5'
model.save(saved_model_path)

# Provide download link (Kaggle environment only); must stay the last
# expression of the cell so the notebook renders it.
from IPython.display import FileLink
FileLink(saved_model_path)