In [None]:
# go to content folder
!cd /content

# extract tar.gz file
!tar -xvzf cv-corpus-22.0-delta-2025-06-20-en.tar.gz


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228946.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228947.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228948.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228949.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228955.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228957.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228959.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228960.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228962.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228970.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228971.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228972.mp3
cv-corpus-22.0-delta-2025-06-20/en/clips/common_voice_en_43228973.mp3
cv-corpus-22.0-delta-2025

In [None]:
# Step 1 — Load the validated-clips metadata and look at the label distribution
import os
import pandas as pd

BASE = "/content/cv-corpus-22.0-delta-2025-06-20/en"
TSV = os.path.join(BASE, "validated.tsv")
CLIPS_DIR = os.path.join(BASE, "clips")

# Report whether the two expected locations are present before touching them.
tsv_found = os.path.exists(TSV)
clips_found = os.path.exists(CLIPS_DIR)
print("TSV path:", TSV, "-> exists:", tsv_found)
print("Clips dir:", CLIPS_DIR, "-> exists:", clips_found)
print()

# Fail early with a hint rather than letting read_csv raise a less helpful error.
if not tsv_found:
    raise FileNotFoundError("validated.tsv not found in expected folder. List files with os.listdir(BASE)")

# The metadata file is tab-separated.
df = pd.read_csv(TSV, sep="\t")
print("Columns:", list(df.columns))
print("Total rows:", df.shape[0])

# Label distributions, keeping NaN so missing annotations are visible.
gender_counts = df["gender"].value_counts(dropna=False)
age_counts = df["age"].value_counts(dropna=False)
print("\nGender value counts (including NaN):")
print(gender_counts)
print("\nAge value counts (including NaN):")
print(age_counts)

preview_cols = ["path", "age", "gender"]
print("\nSample rows (path, age, gender):")
print(df[preview_cols].head(12).to_string(index=False))

# Rows where both labels are annotated.
df_valid = df.dropna(subset=["gender", "age"])
print(f"\nRows with both gender & age present: {len(df_valid)}")
print(df_valid[preview_cols].head(12).to_string(index=False))


TSV path: /content/cv-corpus-22.0-delta-2025-06-20/en/validated.tsv -> exists: True
Clips dir: /content/cv-corpus-22.0-delta-2025-06-20/en/clips -> exists: True

Columns: ['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']
Total rows: 170

Gender value counts (including NaN):
gender
NaN                   82
female_feminine       63
male_masculine        23
do_not_wish_to_say     2
Name: count, dtype: int64

Age value counts (including NaN):
age
twenties     45
thirties     40
NaN          32
sixties      21
fourties     17
eighties      6
fifties       6
teens         2
seventies     1
Name: count, dtype: int64

Sample rows (path, age, gender):
                        path      age          gender
common_voice_en_43199993.mp3      NaN             NaN
common_voice_en_42736613.mp3 twenties female_feminine
common_voice_en_42798328.mp3 thirties female_feminine
common_voice_en_43204215.mp3 

In [None]:
# Step 2 — Filter for male voices only and map ages
import numpy as np

# Map age buckets to approximate numeric ages (bucket midpoints).
# "fourties" is spelled the way it appears in the dataset's `age` column.
age_map = {
    "teens": 15,
    "twenties": 25,
    "thirties": 35,
    "fourties": 45,
    "fifties": 55,
    "sixties": 65,
    "seventies": 75,
    "eighties": 85
}

# Keep only male voices whose age bucket is one we can map.
# Filtering on the known buckets (rather than just notna) matters: an unmapped
# bucket would become NaN in `age_num`, and `NaN > 60` evaluates to False, which
# would silently label that speaker as non-senior. `isin` also excludes NaN ages.
is_male = df["gender"] == "male_masculine"
has_known_age = df["age"].isin(list(age_map))
df_male = df[is_male & has_known_age].copy()
print("Total male samples with age:", len(df_male))

df_male["age_num"] = df_male["age"].map(age_map)

print("\nPreview after mapping:")
print(df_male[["path","age","age_num","gender"]].head(12).to_string(index=False))

# Mark senior citizen if age_num > 60
df_male["is_senior"] = df_male["age_num"] > 60
print("\nSenior citizen counts:")
print(df_male["is_senior"].value_counts())


Total male samples with age: 23

Preview after mapping:
                        path      age  age_num         gender
common_voice_en_43193287.mp3  sixties       65 male_masculine
common_voice_en_43200118.mp3 eighties       85 male_masculine
common_voice_en_42816075.mp3 twenties       25 male_masculine
common_voice_en_43406505.mp3  sixties       65 male_masculine
common_voice_en_43160566.mp3  sixties       65 male_masculine
common_voice_en_43173951.mp3  sixties       65 male_masculine
common_voice_en_43004194.mp3 fourties       45 male_masculine
common_voice_en_43004225.mp3 fourties       45 male_masculine
common_voice_en_43004302.mp3 fourties       45 male_masculine
common_voice_en_42706159.mp3  sixties       65 male_masculine
common_voice_en_42706160.mp3  sixties       65 male_masculine
common_voice_en_42706169.mp3  sixties       65 male_masculine

Senior citizen counts:
is_senior
False    13
True     10
Name: count, dtype: int64


In [None]:
# Step 3 — Build one time-averaged MFCC vector per male clip
import librosa
import os
from tqdm import tqdm

clips_dir = "/content/cv-corpus-22.0-delta-2025-06-20/en/clips"

# Parallel lists: features, numeric age, and senior flag (0/1) for each usable clip.
X, y_age, y_senior = [], [], []

for i, row in tqdm(df_male.iterrows(), total=len(df_male)):
    file_path = os.path.join(clips_dir, row["path"])

    if os.path.exists(file_path):
        try:
            # Decode the clip, resampled to 16 kHz.
            waveform, sr = librosa.load(file_path, sr=16000)

            # 40 MFCCs per frame, collapsed over the frame axis so every clip
            # yields a vector of the same length regardless of duration.
            coeffs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=40)
            clip_vector = np.mean(coeffs, axis=1)

            # Features and labels are appended together so the lists stay aligned.
            X.append(clip_vector)
            y_age.append(row["age_num"])
            y_senior.append(int(row["is_senior"]))

        except Exception as err:
            # Best effort: report the clip that failed and move on.
            print("⚠️ Error with file:", file_path, "->", str(err))
    else:
        print("⚠️ Missing file:", file_path)

# Stack the Python lists into arrays.
X = np.asarray(X)
y_age = np.asarray(y_age)
y_senior = np.asarray(y_senior)

print("✅ Feature extraction complete!")
print("X shape:", X.shape)
print("y_age shape:", y_age.shape)
print("y_senior shape:", y_senior.shape)


100%|██████████| 23/23 [00:00<00:00, 39.93it/s]

✅ Feature extraction complete!
X shape: (23, 40)
y_age shape: (23,)
y_senior shape: (23,)





In [None]:
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


In [None]:
# Where the extracted Common Voice delta lives
BASE = "/content/cv-corpus-22.0-delta-2025-06-20/en"
TSV = os.path.join(BASE, "validated.tsv")
CLIPS = os.path.join(BASE, "clips")

# Read the tab-separated metadata and show what we are working with
df = pd.read_csv(TSV, sep="\t")
print("Columns:", list(df.columns))
print("Sample rows:", df[["path", "age", "gender"]].head())

# Age bucket -> approximate numeric age (bucket midpoint)
age_map = {
    "teens": 15,
    "twenties": 25,
    "thirties": 35,
    "fourties": 45,
    "fifties": 55,
    "sixties": 65,
    "seventies": 75,
    "eighties": 85,
}

# Male speakers with an age annotation that is one of the known buckets
male_rows = df["gender"] == "male_masculine"
age_present = df["age"].notna()
age_known = df["age"].isin(list(age_map.keys()))
df_male = df[male_rows & age_present & age_known].copy()

# Numeric age and the senior flag (strictly older than 60) derived from it
df_male["age_num"] = df_male["age"].map(age_map)
df_male["is_senior"] = df_male["age_num"] > 60

print("Total male samples w/ age:", len(df_male))
print(df_male[["path", "age", "age_num", "is_senior"]].head())


Columns: ['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']
Sample rows:                            path       age           gender
0  common_voice_en_43199993.mp3       NaN              NaN
1  common_voice_en_42736613.mp3  twenties  female_feminine
2  common_voice_en_42798328.mp3  thirties  female_feminine
3  common_voice_en_43204215.mp3       NaN              NaN
4  common_voice_en_42706055.mp3  fourties              NaN
Total male samples w/ age: 23
                            path       age  age_num  is_senior
9   common_voice_en_43193287.mp3   sixties       65       True
12  common_voice_en_43200118.mp3  eighties       85       True
30  common_voice_en_42816075.mp3  twenties       25      False
52  common_voice_en_43406505.mp3   sixties       65       True
74  common_voice_en_43160566.mp3   sixties       65       True


In [None]:
def extract_mfcc(file_path, n_mfcc=40, max_len=40):
    """Return a fixed-length, time-averaged MFCC column vector for one audio file.

    The clip is loaded at 16 kHz, `n_mfcc` coefficients are computed per frame and
    averaged over frames. The resulting vector is then zero-padded or cut so it has
    exactly `max_len` entries.

    Note that `max_len` bounds the number of averaged coefficients, not the number
    of time frames (the frame axis has already been reduced by the mean).

    Returns:
        float32 array of shape (max_len, 1).
    """
    waveform, rate = librosa.load(file_path, sr=16000)
    coeffs = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=n_mfcc)
    averaged = np.mean(coeffs, axis=1)

    shortfall = max_len - averaged.shape[0]
    if shortfall > 0:
        # Fewer coefficients than requested: pad the tail with zeros.
        averaged = np.pad(averaged, (0, shortfall))
    else:
        averaged = averaged[:max_len]

    return averaged.astype(np.float32).reshape((max_len, 1))


In [None]:
# Build the mean-MFCC dataset: one (40, 1) feature per clip plus its two labels.
X, y_age, y_senior = [], [], []
MAX_SAMPLES = None  # Set to e.g. 20 for quick test, or None for all

# Limit by row *position*. df_male keeps the index labels of the original TSV
# (e.g. 9, 12, 30, ...), so comparing the iterrows() label against MAX_SAMPLES
# would stop at the wrong place; head() counts rows instead.
rows_to_process = df_male.head(MAX_SAMPLES) if MAX_SAMPLES else df_male

for _, row in tqdm(rows_to_process.iterrows(), total=len(rows_to_process)):
    fname = os.path.join(CLIPS, row["path"])
    if not os.path.exists(fname):
        # Report instead of skipping silently so a short dataset is explainable.
        print("Missing:", fname)
        continue
    try:
        feat = extract_mfcc(fname)
        # Append features and labels together so the three lists stay aligned.
        X.append(feat)
        y_age.append(row["age_num"])
        y_senior.append(int(row["is_senior"]))
    except Exception as e:
        # Best effort: a clip that fails to decode is reported and skipped.
        print("Skip:", fname, e)

X = np.array(X)
y_age = np.array(y_age)
y_senior = np.array(y_senior)

print("X shape:", X.shape, "age:", y_age.shape, "senior:", y_senior.shape)


100%|██████████| 23/23 [00:00<00:00, 23.85it/s]

X shape: (23, 40, 1) age: (23,) senior: (23,)





In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, Input
from sklearn.model_selection import train_test_split

# Normalize MFCC features per sample: X is (samples, n_mfcc, 1), so reducing over
# axis 1 divides each clip's coefficient vector by its own peak magnitude.
# The small epsilon avoids a division by zero for an all-zero vector.
X = X.astype("float32")
X = X / (np.max(np.abs(X), axis=1, keepdims=True) + 1e-8)

# Add a trailing channel axis for Conv2D: (samples, n_mfcc, 1, channels).
# There is no time axis here — extract_mfcc already averaged over frames.
X_cnn = np.expand_dims(X, -1)
print("CNN input shape:", X_cnn.shape)

# Train-test split (same permutation applied to features and both label arrays)
X_train, X_test, y_age_train, y_age_test, y_senior_train, y_senior_test = train_test_split(
    X_cnn, y_age, y_senior, test_size=0.2, random_state=42
)

# Build multi-task CNN model
input_shape = X_cnn.shape[1:]  # (n_mfcc, 1, 1)

inputs = Input(shape=input_shape)

# Kernels and pools are (k, 1) because the second spatial axis has size 1.
x = layers.Conv2D(32, (3,1), activation="relu", padding="same")(inputs)
x = layers.MaxPooling2D((2,1))(x)
x = layers.Conv2D(64, (3,1), activation="relu", padding="same")(x)
x = layers.MaxPooling2D((2,1))(x)
x = layers.Flatten()(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.3)(x)

# Age regression output (linear, in years)
age_output = layers.Dense(1, name="age_output")(x)

# Senior classification output (probability that age_num > 60)
senior_output = layers.Dense(1, activation="sigmoid", name="senior_output")(x)

# Model
model = models.Model(inputs=inputs, outputs=[age_output, senior_output])

# Compile: losses and metrics are keyed by the output layer names above.
model.compile(
    optimizer="adam",
    loss={
        "age_output": "mse",
        "senior_output": "binary_crossentropy"
    },
    metrics={
        "age_output": "mae",
        "senior_output": "accuracy"
    }
)

model.summary()

# Train model
history = model.fit(
    X_train,
    {"age_output": y_age_train, "senior_output": y_senior_train},
    validation_data=(X_test, {"age_output": y_age_test, "senior_output": y_senior_test}),
    epochs=30,
    batch_size=4,
    verbose=1
)

# Evaluate model performance.
# return_dict=True lets us look metrics up by name instead of relying on the
# position of each value in the returned list, which is easy to get wrong for a
# multi-output model. The keys match the names shown in the training log.
results = model.evaluate(
    X_test,
    {"age_output": y_age_test, "senior_output": y_senior_test},
    verbose=0,
    return_dict=True
)

print(f"Age Test MAE: {results['age_output_mae']:.2f} years")
print(f"Senior Detection Test Accuracy: {results['senior_output_accuracy']*100:.2f}%")


CNN input shape: (23, 40, 1, 1)


Epoch 1/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 759ms/step - age_output_loss: 3014.1548 - age_output_mae: 52.3737 - loss: 3016.9556 - senior_output_accuracy: 0.5359 - senior_output_loss: 0.6929 - val_age_output_loss: 2056.0461 - val_age_output_mae: 48.7804 - val_loss: 2564.0845 - val_senior_output_accuracy: 0.6000 - val_senior_output_loss: 0.6862
Epoch 2/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - age_output_loss: 3073.5137 - age_output_mae: 52.3394 - loss: 3063.4280 - senior_output_accuracy: 0.5509 - senior_output_loss: 0.6924 - val_age_output_loss: 2030.2045 - val_age_output_mae: 48.4800 - val_loss: 2534.6875 - val_senior_output_accuracy: 0.6000 - val_senior_output_loss: 0.6729
Epoch 3/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - age_output_loss: 3031.1580 - age_output_mae: 51.7927 - loss: 3021.6704 - senior_output_accuracy: 0.5301 - senior_output_loss: 0.6725 - val_age_output_loss: 1982.3560 - 

In [None]:
import random
import librosa
import numpy as np

def augment_audio(y, sr):
    """Randomly perturb a waveform for data augmentation.

    Each of the three perturbations is applied independently with probability 0.5,
    in this order: pitch shift, time stretch, additive Gaussian noise.

    Args:
        y: 1-D audio signal.
        sr: Sampling rate of `y` in Hz (used by the pitch shift).

    Returns:
        The (possibly) augmented signal. Its length changes if the time stretch
        is applied. If no perturbation fires, `y` is returned unchanged.
    """
    if random.random() < 0.5:
        # Pitch shift between -2 and +2 semitones.
        # `sr` is passed by keyword: recent librosa releases make every argument
        # after `y` keyword-only, so the positional form raises a TypeError.
        y = librosa.effects.pitch_shift(y, sr=sr, n_steps=random.uniform(-2, 2))
    if random.random() < 0.5:
        # Time stretch between 0.8 and 1.2 speed (`rate` is keyword-only as well).
        rate = random.uniform(0.8, 1.2)
        y = librosa.effects.time_stretch(y, rate=rate)
    if random.random() < 0.5:
        # Add low-level noise
        noise = 0.005 * np.random.randn(len(y))
        y = y + noise
    return y


In [None]:
def extract_mfcc_seq(file_path, n_mfcc=40, max_len=40):
    """Return the first `max_len` MFCC frames of an audio file as a (time, coeff) array.

    The clip is loaded at 16 kHz. Clips with fewer than `max_len` frames are
    zero-padded at the end; longer clips keep only their first `max_len` frames.

    Returns:
        float32 array of shape (max_len, n_mfcc).
    """
    waveform, rate = librosa.load(file_path, sr=16000)
    frames = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=n_mfcc)

    n_frames = frames.shape[1]
    if n_frames >= max_len:
        frames = frames[:, :max_len]
    else:
        # Pad only along the frame axis; coefficients are left as they are.
        frames = np.pad(frames, ((0, 0), (0, max_len - n_frames)), mode='constant')

    # librosa returns (coeff, time); the model input is (time, coeff).
    return np.transpose(frames).astype(np.float32)


In [None]:
# Two-block CNN over a 40x40 single-channel MFCC "image" with two heads.
# NOTE(review): this input shape matches the (max_len, n_mfcc) = (40, 40) output of
# extract_mfcc_seq plus a channel axis; it does not match the (40, 1, 1) mean-MFCC
# arrays produced by the earlier split — confirm which features are fed to fit().
inputs = layers.Input(shape=(40, 40, 1))

x = layers.Conv2D(32, (3,3), activation='relu', padding='same')(inputs)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2,2))(x)

x = layers.Conv2D(64, (3,3), activation='relu', padding='same')(x)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D((2,2))(x)

x = layers.Flatten()(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.5)(x)

# Heads: linear age regression and sigmoid senior/non-senior probability.
age_output = layers.Dense(1, name='age_output')(x)
senior_output = layers.Dense(1, activation='sigmoid', name='senior_output')(x)

model = tf.keras.Model(inputs=inputs, outputs=[age_output, senior_output])

# A freshly built model must be compiled before fit() can be called on it.
# Losses and metrics mirror the other model definitions in this notebook and are
# keyed by the output layer names; the metric names are also what the callbacks
# monitor (e.g. 'val_senior_output_accuracy').
model.compile(
    optimizer='adam',
    loss={
        'age_output': 'mse',
        'senior_output': 'binary_crossentropy'
    },
    metrics={
        'age_output': ['mae'],
        'senior_output': ['accuracy']
    }
)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Both callbacks watch the senior head's validation accuracy, which should go up,
# hence mode='max'.
stop_when_stalled = EarlyStopping(
    monitor='val_senior_output_accuracy',
    patience=10,
    restore_best_weights=True,
    mode='max',
)
halve_lr_when_stalled = ReduceLROnPlateau(
    monitor='val_senior_output_accuracy',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    mode='max',
)
callbacks = [stop_when_stalled, halve_lr_when_stalled]

# Targets are passed as dicts keyed by the output layer names.
train_targets = {'age_output': y_age_train, 'senior_output': y_senior_train}
holdout_targets = {'age_output': y_age_test, 'senior_output': y_senior_test}

history = model.fit(
    X_train,
    train_targets,
    validation_data=(X_test, holdout_targets),
    epochs=100,
    batch_size=4,
    callbacks=callbacks,
)


Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - age_output_loss: 2642.6719 - age_output_mae: 49.5131 - loss: 2693.3069 - senior_output_accuracy: 0.4676 - senior_output_loss: 0.7584 - val_age_output_loss: 2060.2598 - val_age_output_mae: 48.8312 - val_loss: 2569.1792 - val_senior_output_accuracy: 0.4000 - val_senior_output_loss: 0.7027 - learning_rate: 0.0010
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - age_output_loss: 2709.4417 - age_output_mae: 50.0870 - loss: 2758.8591 - senior_output_accuracy: 0.4491 - senior_output_loss: 0.6885 - val_age_output_loss: 2050.7656 - val_age_output_mae: 48.7220 - val_loss: 2558.5142 - val_senior_output_accuracy: 0.4000 - val_senior_output_loss: 0.7086 - learning_rate: 0.0010
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - age_output_loss: 2411.5698 - age_output_mae: 46.8917 - loss: 2451.1018 - senior_output_accuracy: 0.3912 - senior_outpu

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced weights for the senior / non-senior labels seen in the training split.
# Keyed by the actual class label (not by position in the weights array), so the
# lookup stays correct even if a class is absent from the split.
senior_classes = np.unique(y_senior_train)
class_weights = compute_class_weight('balanced', classes=senior_classes, y=y_senior_train)
class_weights_dict = dict(zip(senior_classes.tolist(), class_weights))

# Keras' `class_weight` argument only supports single-output models, and it does
# not accept a dict keyed by output name. For this two-headed model the same
# effect is expressed as per-sample weights: the class weight of each sample's
# label for the senior head, and a constant 1.0 for the age head.
# NOTE(review): dict-valued sample_weight support depends on the installed Keras
# version — confirm on the target runtime.
sample_weights = {
    'age_output': np.ones(len(y_age_train), dtype=np.float32),
    'senior_output': np.array(
        [class_weights_dict[int(label)] for label in y_senior_train],
        dtype=np.float32,
    ),
}

history = model.fit(
    X_train, {'age_output': y_age_train, 'senior_output': y_senior_train},
    validation_data=(X_test, {'age_output': y_age_test, 'senior_output': y_senior_test}),
    epochs=100,
    batch_size=4,
    sample_weight=sample_weights,
    callbacks=callbacks
)


In [None]:
import numpy as np
import librosa

def extract_mfcc_sequence(file_path, n_mfcc=40, max_len=40):
    """
    Compute the per-frame MFCCs of an audio file (no averaging over time) and
    force the frame axis to exactly `max_len` entries.

    The clip is loaded at 16 kHz. Short clips are zero-padded at the end; long
    clips are cut after the first `max_len` frames.

    Returns a float32 array of shape (max_len, n_mfcc).
    """
    waveform, rate = librosa.load(file_path, sr=16000)
    coeffs = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=n_mfcc)

    missing_frames = max_len - coeffs.shape[1]
    if missing_frames > 0:
        # Pad the frame axis only; the coefficient axis is untouched.
        coeffs = np.pad(coeffs, ((0, 0), (0, missing_frames)), mode='constant')
    else:
        coeffs = coeffs[:, :max_len]

    # librosa yields (coeff, time); flip to (time, coeff).
    return np.transpose(coeffs).astype(np.float32)


In [None]:
from tqdm import tqdm

# Per-frame MFCC dataset: one (time, coeff) matrix per clip plus its two labels.
X_seq, y_age_seq, y_senior_seq = [], [], []

MAX_SAMPLES = None  # set small for testing or None for full data

for i, row in tqdm(enumerate(df_male.itertuples()), total=len(df_male)):
    # `i` is the running row position, so this caps the number of rows visited.
    if MAX_SAMPLES and i >= MAX_SAMPLES:
        break

    file_path = os.path.join(CLIPS, row.path)
    if os.path.exists(file_path):
        try:
            clip_frames = extract_mfcc_sequence(file_path, max_len=40)
            # Features and labels are appended together to keep the lists aligned.
            X_seq.append(clip_frames)
            y_age_seq.append(row.age_num)
            y_senior_seq.append(int(row.is_senior))
        except Exception as err:
            # Best effort: report the clip and carry on with the rest.
            print(f"Skipping {file_path}: {err}")

X_seq = np.asarray(X_seq)  # shape (samples, time=40, n_mfcc=40)
y_age_seq = np.asarray(y_age_seq)
y_senior_seq = np.asarray(y_senior_seq)

print(f"Features shape: {X_seq.shape}, Age labels: {y_age_seq.shape}, Senior labels: {y_senior_seq.shape}")


100%|██████████| 23/23 [00:01<00:00, 17.78it/s]

Features shape: (23, 40, 40), Age labels: (23,), Senior labels: (23,)





In [None]:
# Add channel dimension for Conv2D input: (samples, time, coeff, 1).
# Guarded on ndim so that re-running this cell does not keep appending axes,
# which would break the axis=(1,2,3) reduction below and the model input shape.
if X_seq.ndim == 3:
    X_seq = X_seq[..., np.newaxis]

# Normalize per sample: divide each clip by its own peak absolute value.
# astype() returns a copy, so the in-place division does not touch the array this
# name pointed to before; the epsilon avoids dividing by zero for an all-zero clip.
X_seq = X_seq.astype(np.float32)
X_seq /= (np.max(np.abs(X_seq), axis=(1,2,3), keepdims=True) + 1e-8)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Input is one clip's feature map without the batch axis.
input_shape = X_seq.shape[1:]  # (40, 40, 1)

inputs = layers.Input(shape=input_shape)

# Three identical stages (conv -> batch norm -> 2x2 pool) with a doubling filter
# count. Each pool halves both spatial axes: 40x40 -> 20x20 -> 10x10 -> 5x5.
x = inputs
for n_filters in (64, 128, 256):
    x = layers.Conv2D(n_filters, (3,3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D((2,2))(x)

# Average each feature map down to a single value.
x = layers.GlobalAveragePooling2D()(x)

# Shared dense trunk feeding both heads.
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.5)(x)

# Head 1: age in years (linear regression)
age_output = layers.Dense(1, name='age_output')(x)

# Head 2: probability of being a senior (sigmoid)
senior_output = layers.Dense(1, activation='sigmoid', name='senior_output')(x)

model = models.Model(inputs=inputs, outputs=[age_output, senior_output])

# Losses and metrics are looked up by output layer name.
head_losses = {
    'age_output': 'mse',
    'senior_output': 'binary_crossentropy'
}
head_metrics = {
    'age_output': ['mae'],
    'senior_output': ['accuracy']
}
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=head_losses,
    metrics=head_metrics
)

model.summary()


In [None]:
from sklearn.utils import class_weight
import numpy as np

# Compute balanced class weights for the senior label on the training data.
senior_classes = np.unique(y_senior_train)
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=senior_classes,
    y=y_senior_train
)
# Key each weight by its class label rather than by its position in the array:
# the two only coincide when the labels present are exactly 0..k-1, which is not
# guaranteed if a class is missing from the training split.
class_weight_dict = dict(zip(senior_classes.tolist(), class_weights))


In [None]:
# Per-sample weights for each head: the senior head uses the class weight of each
# sample's label, the age head weights every sample equally.
sample_weights_senior = np.fromiter(
    (class_weight_dict[label] for label in y_senior_train),
    dtype=np.float32,
)
sample_weights_age = np.ones(y_age_train.shape, dtype=np.float32)

# Keyed by output layer name.
sample_weights = dict(
    age_output=sample_weights_age,
    senior_output=sample_weights_senior,
)


In [None]:
# Verify model output names
print("Model outputs:", model.output_names)

# Split the per-frame MFCC features that this model was built for.
# The model's input shape is X_seq.shape[1:], whereas the X_train left over from
# the earlier cells holds the (40, 1, 1) mean-MFCC features, and no X_val /
# y_age_val / y_senior_val is defined anywhere above — so the split is done here.
(X_train, X_val,
 y_age_train, y_age_val,
 y_senior_train, y_senior_val) = train_test_split(
    X_seq, y_age_seq, y_senior_seq, test_size=0.2, random_state=42
)

# Balanced class weights for the senior label, recomputed on this training split
# and keyed by class label.
senior_classes = np.unique(y_senior_train)
senior_class_weights = class_weight.compute_class_weight(
    'balanced', classes=senior_classes, y=y_senior_train
)
class_weight_dict = dict(zip(senior_classes.tolist(), senior_class_weights))

# Per-sample weights, keyed by output layer name: uniform for the age head,
# class-balanced for the senior head.
sample_weights = {
    'age_output': np.ones(len(y_age_train), dtype=np.float32),
    'senior_output': np.array([class_weight_dict[int(label)] for label in y_senior_train], dtype=np.float32)
}

# NOTE(review): the saved output of this cell ends in a TensorFlow ValueError
# ("Attr 'Toutput_types' of 'OptionalFromValue' Op passed list of length 0 ...").
# Its cause cannot be determined from the notebook alone. Re-run after this change;
# if it persists, check how the installed Keras version handles dict-valued
# sample_weight for multi-output models — TODO confirm.

# Use callbacks as defined earlier (with EarlyStopping, ReduceLROnPlateau)
history = model.fit(
    X_train,
    {'age_output': y_age_train, 'senior_output': y_senior_train},
    validation_data=(X_val, {'age_output': y_age_val, 'senior_output': y_senior_val}),
    sample_weight=sample_weights,
    epochs=100,
    batch_size=4,
    callbacks=callbacks,
    verbose=1
)


Model outputs: ListWrapper(['age_output', 'senior_output'])
Epoch 1/100


ValueError: Attr 'Toutput_types' of 'OptionalFromValue' Op passed list of length 0 less than minimum 1.