In [1]:
import numpy as np 


# trimming and scaling 

In [4]:
import librosa
def trimming_scalling(pos_magnitude):
    magnitude_db = librosa.amplitude_to_db(pos_magnitude)
    mask = magnitude_db > -40
    cleaned_freq = frequencies[mask]
    cleaned_mag = magnitude_db[mask]
    return cleaned_freq,cleaned_mag

# resampling

In [3]:
import soundfile as sf

def resample_audio_file(input_path, output_path, target_rate=16000):
    data, sr = librosa.load(input_path, sr=None)
    if data.ndim > 1:  
                data = data.mean(axis=1)

    if sr != target_rate:
        data_resampled = librosa.resample(data, orig_sr=sr, target_sr=target_rate)
    else:
        data_resampled = data

    sf.write(output_path, data_resampled, target_rate)
    return data_resampled, target_rate

    

# extract frequencies 

In [None]:
def extract_frequencies(record):
      frequencies=np.fft.fft(record)
      frequency_at_each_samples=np.fft.fftfreq(len(frequencies),1/16000)
      magnitude=np.abs(frequencies)
      positive_mask = frequency_at_each_samples >= 0
      pos_freqs = frequency_at_each_samples[positive_mask]
      pos_magnitude = magnitude[positive_mask]
      return pos_freqs,pos_magnitude

# spectral features 

In [None]:
centroid = librosa.feature.spectral_centroid(y=signal, sr=sr) # short fourier >> frames 
bandwidth = librosa.feature.spectral_bandwidth(S=magnitude, sr=sr)
rolloff = librosa.feature.spectral_rolloff(S=magnitude, sr=sr, roll_percent=0.85)
centroid_mean = np.mean(centroid)
bandwidth_mean = np.mean(bandwidth)
rolloff_mean = np.mean(rolloff)


NameError: name 'signal' is not defined

# Pitch features 

https://github.com/michelebersani/YIN_Pitch_Detector/blob/master/2002_JASA_YIN.pdf

# YIN

In [14]:
def auto_correlation_fixed_w_step_1(signal, n_lags_range, window_size):
    signal = np.asarray(signal)
    values_at_different_lags = np.array([
        np.dot(signal[:window_size], signal[lag:lag+window_size])
        for lag in n_lags_range
    ])

    max_index = np.argmax(values_at_different_lags)
    best_lag = n_lags_range[max_index]
    f0 = 16000 / best_lag
    return f0



def difference_function_(signal, window_size, n_lags_range):
    signal = np.asarray(signal)
    differences = [
        np.sum((signal[:window_size - lag] - signal[lag:window_size]) ** 2)
        for lag in n_lags_range
    ]
    return differences


def cumulative_mean_step_3(differences):
    differences = np.asarray(differences, dtype=float)
    cumulative_sums = np.cumsum(differences)
    taus = np.arange(1, len(differences) + 1)
    means = cumulative_sums / taus

    # normalized difference function
    final_array = np.zeros_like(differences, dtype=float)
    final_array[0] = 1.0
    final_array[1:] = differences[1:] / means[1:]

    return final_array



def threshold_step4(cumulative_array, sampling_rate=16000, threshold=0.1):
    cumulative_array = np.asarray(cumulative_array)

    candidates = np.where(cumulative_array < threshold)[0]

    if len(candidates) > 0 and candidates[0] != 0:
        lag = candidates[0]
        f0 = sampling_rate / lag
    else:
        f0 = 0.0

    return f0



def parabolic_interpolation(cumulative_array, lag):
    if lag <= 0 or lag >= len(cumulative_array) - 1:
        return float(lag)  
    y1 = cumulative_array[lag - 1]
    y2 = cumulative_array[lag]
    y3 = cumulative_array[lag + 1]

    denominator = y1 - 2 * y2 + y3
    if denominator == 0:
        return float(lag)  
    delta = 0.5 * (y1 - y3) / denominator
    refined_lag = lag + delta
    return refined_lag


# pitch features 

In [15]:

def extract_pitch_features(signal, window_size=640, hop_size=320, sr=16000, fmin=75, fmax=300, threshold=0.1):
    f0_values = []

    n_lags_range = range(int(sr/fmax), int(sr/fmin))  # e.g. [53, 213] عند sr=16kHz

    for start in range(0, len(signal) - window_size, hop_size):
        frame = signal[start:start + window_size]

        # YIN steps
        diffs = difference_function_(frame, window_size, n_lags_range)
        cmndf = cumulative_mean_step_3(diffs)

        candidates = np.where(cmndf < threshold)[0]
        if len(candidates) == 0 or candidates[0] == 0:
            f0 = 0.0
        else:
            lag = candidates[0]
            refined_lag = parabolic_interpolation(cmndf, lag)
            f0 = sr / refined_lag if refined_lag > 0 else 0.0

        f0_values.append(f0)

    f0_values = np.array(f0_values)
    voiced = f0_values[f0_values > 0]  # ignore unvoiced frames

    if len(voiced) == 0:
        return {
            "f0_mean": 0,
            "f0_median": 0,
            "f0_std": 0,
            "f0_min": 0,
            "f0_max": 0,
            "f0_range": 0,
            "jitter": 0,
            "f0_values": f0_values
        }

    features = {
        "f0_mean": np.mean(voiced),
        "f0_median": np.median(voiced),
        "f0_std": np.std(voiced),
        "f0_min": np.min(voiced),
        "f0_max": np.max(voiced),
        "f0_range": np.max(voiced) - np.min(voiced),
        "jitter": np.std(np.diff(voiced)) / np.mean(voiced) if len(voiced) > 1 else 0,
        "f0_values": f0_values
    }

    return features



# normalization

In [16]:
def normalize_signal(signal):
    min_val = np.min(signal)
    max_val = np.max(signal)
    return (signal - min_val) / (max_val - min_val)

# RMS

In [None]:
rms = librosa.feature.rms(S=magnitude)


# pip line 

In [None]:
import numpy as np
import librosa
import soundfile as sf

# -----------------------------
# Audio preprocessing
# -----------------------------
def load_and_resample(input_path, target_rate=16000):
    data, sr = librosa.load(input_path, sr=None)
    if data.ndim > 1:  # stereo -> mono
        data = data.mean(axis=1)
    if sr != target_rate:
        data_resampled = librosa.resample(data, orig_sr=sr, target_sr=target_rate)
    else:
        data_resampled = data
    sf.write(input_path.replace(".wav","_resampled.wav"), data_resampled, target_rate)
    return data_resampled, target_rate

def normalize_signal(signal):
    min_val = np.min(signal)
    max_val = np.max(signal)
    return (signal - min_val) / (max_val - min_val)

# -----------------------------
# FFT-based extraction
# -----------------------------
def extract_frequencies(record, sr=16000):
    frequencies = np.fft.fft(record)
    freq_at_samples = np.fft.fftfreq(len(frequencies), 1/sr)
    magnitude = np.abs(frequencies)
    positive_mask = freq_at_samples >= 0
    return freq_at_samples[positive_mask], magnitude[positive_mask]

def trimming_scaling(pos_freqs, pos_magnitude, db_threshold=-40):
    magnitude_db = librosa.amplitude_to_db(pos_magnitude)
    mask = magnitude_db > db_threshold
    cleaned_freq = pos_freqs[mask]
    cleaned_mag = magnitude_db[mask]
    return cleaned_freq, cleaned_mag

# -----------------------------
# Pitch (YIN-based)
# -----------------------------
def extract_pitch_features(signal, window_size=640, hop_size=320, sr=16000, fmin=75, fmax=300, threshold=0.1):
    f0_values = []
    n_lags_range = range(int(sr/fmax), int(sr/fmin))

    for start in range(0, len(signal) - window_size, hop_size):
        frame = signal[start:start + window_size]

        # YIN steps
        diffs = difference_function_(frame, window_size, n_lags_range)
        cmndf = cumulative_mean_step_3(diffs)
        candidates = np.where(cmndf < threshold)[0]

        if len(candidates) == 0 or candidates[0] == 0:
            f0 = 0.0
        else:
            lag = candidates[0]
            refined_lag = parabolic_interpolation(cmndf, lag)
            f0 = sr / refined_lag if refined_lag > 0 else 0.0

        f0_values.append(f0)

    f0_values = np.array(f0_values)
    voiced = f0_values[f0_values > 0]

    if len(voiced) == 0:
        return {
            "f0_mean": 0, "f0_median": 0, "f0_std": 0,
            "f0_min": 0, "f0_max": 0, "f0_range": 0,
            "jitter": 0, "f0_values": f0_values
        }

    return {
        "f0_mean": np.mean(voiced),
        "f0_median": np.median(voiced),
        "f0_std": np.std(voiced),
        "f0_min": np.min(voiced),
        "f0_max": np.max(voiced),
        "f0_range": np.max(voiced) - np.min(voiced),
        "jitter": np.std(np.diff(voiced)) / np.mean(voiced) if len(voiced) > 1 else 0,
        "f0_values": f0_values
    }

# -----------------------------
# Spectral features
# -----------------------------
def extract_spectral_features(signal, sr=16000):
    S = np.abs(librosa.stft(signal))

    centroid = librosa.feature.spectral_centroid(S=S, sr=sr)
    bandwidth = librosa.feature.spectral_bandwidth(S=S, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(S=S, sr=sr, roll_percent=0.85)
    rms = librosa.feature.rms(S=S)

    return {
        "centroid_mean": np.mean(centroid),
         "centroid_std": np.std(centroid),
        "bandwidth_mean": np.mean(bandwidth),
        "bandwidth_std": np.std(bandwidth),
        "rolloff_mean": np.mean(rolloff),
        "rolloff_std": np.std(rolloff),
        "rms_mean": np.mean(rms),
         "rms_std": np.std(rms)
    }

# -----------------------------
# Full pipeline
# -----------------------------
def extract_features_pipeline(signal, sr=16000):
    signal_norm = normalize_signal(signal)
    pitch_feats = extract_pitch_features(signal_norm, sr=sr)
    spectral_feats = extract_spectral_features(signal_norm, sr=sr)
    pos_freqs, pos_mag = extract_frequencies(signal_norm, sr=sr)
    trimmed_freqs, trimmed_mag = trimming_scaling(pos_freqs, pos_mag)

    # Combine everything
    features = {**pitch_feats, **spectral_feats}
    features["trimmed_freqs_mean"] = np.mean(trimmed_freqs)
    features["trimmed_mag_mean"] = np.mean(trimmed_mag)

    return features


# data_frames

In [18]:
import os
import pandas as pd
import librosa

base_path = r"C:\Users\arwah\OneDrive\سطح المكتب\FT_IMPLEMENT\men_samples"
all_data = []

for language_folder in os.listdir(base_path):
    language_path = os.path.join(base_path, language_folder)
    
    if os.path.isdir(language_path):
        for file_name in os.listdir(language_path):
            if file_name.endswith(".mp3"):
                file_path = os.path.join(language_path, file_name)
                
                try:
                    # اقرأ MP3 كإشارة رقمية
                    signal, sr = librosa.load(file_path, sr=16000, mono=True)
                    
                    # تمرير الإشارة مباشرة للـ pipeline
                    features = extract_features_pipeline(signal, sr=sr)
                    
                    features["gender"] = 0  # 0 = male
                    features["language"] = language_folder
                    all_data.append(features)
                
                except Exception as e:
                    print(f"Error in file {file_name}: {e}")

df = pd.DataFrame(all_data)
print(df.head())


  final_array[1:] = differences[1:] / means[1:]


      f0_mean   f0_median     f0_std      f0_min      f0_max    f0_range  \
0  208.697515  191.136035  59.565947   74.831310  392.268870  317.437560   
1  203.077481  179.742960  68.804630  120.375620  512.419322  392.043703   
2  207.138455  210.908726  34.571329  136.517771  299.835374  163.317604   
3  218.530566  205.104834  61.436705  124.945710  518.745716  393.800006   
4  210.365572  191.975157  84.308647  125.671600  659.554375  533.882776   

     jitter                                          f0_values  centroid_mean  \
0  0.223773  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...     856.173631   
1  0.226400  [263.6493836001589, 293.7817500045448, 256.723...     749.194761   
2  0.135926  [213.17923558344793, 217.4994461233949, 0.0, 0...     966.370599   
3  0.267326  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 169.02759053269...     876.791859   
4  0.320778  [0.0, 0.0, 0.0, 0.0, 0.0, 231.51746043954697, ...    1048.384110   

   centroid_std  bandwidth_mean  bandwidth_std  rolloff_

In [19]:
df.head()

Unnamed: 0,f0_mean,f0_median,f0_std,f0_min,f0_max,f0_range,jitter,f0_values,centroid_mean,centroid_std,bandwidth_mean,bandwidth_std,rolloff_mean,rolloff_std,rms_mean,rms_std,trimmed_freqs_mean,trimmed_mag_mean,gender,language
0,208.697515,191.136035,59.565947,74.83131,392.26887,317.43756,0.223773,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",856.173631,591.426697,1309.213337,544.061251,2021.490615,1518.010062,0.299261,0.008553,3999.95,23.066875,0,arabic
1,203.077481,179.74296,68.80463,120.37562,512.419322,392.043703,0.2264,"[263.6493836001589, 293.7817500045448, 256.723...",749.194761,523.823874,1245.884559,516.864048,1773.53734,1468.54778,0.305755,0.007609,3999.95,22.282835,0,arabic
2,207.138455,210.908726,34.571329,136.517771,299.835374,163.317604,0.135926,"[213.17923558344793, 217.4994461233949, 0.0, 0...",966.370599,676.401111,1395.386129,507.896292,2275.234625,1545.869176,0.30001,0.009191,3999.95,23.958569,0,arabic
3,218.530566,205.104834,61.436705,124.94571,518.745716,393.800006,0.267326,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 169.02759053269...",876.791859,586.513601,1316.879506,475.988401,2032.772564,1387.352581,0.305391,0.008783,3999.95,23.542638,0,arabic
4,210.365572,191.975157,84.308647,125.6716,659.554375,533.882776,0.320778,"[0.0, 0.0, 0.0, 0.0, 0.0, 231.51746043954697, ...",1048.38411,741.315767,1434.506745,574.516883,2432.333267,1663.075236,0.299776,0.008792,3999.95,24.554248,0,arabic


# data frames women

In [None]:
import os
import pandas as pd
import librosa

base_path = r"C:\Users\arwah\OneDrive\سطح المكتب\FT_IMPLEMENT\women_samples"
all_data_2 = []

for language_folder in os.listdir(base_path):
    language_path = os.path.join(base_path, language_folder)
    
    if os.path.isdir(language_path):
        for file_name in os.listdir(language_path):
            if file_name.endswith(".mp3"):
                file_path = os.path.join(language_path, file_name)
                
                try:
                    signal, sr = librosa.load(file_path, sr=16000, mono=True)
                    
                    features = extract_features_pipeline(signal, sr=sr)
                    
                    features["gender"] = 1  # 0 = male
                    features["language"] = language_folder
                    all_data_2.append(features)
                
                except Exception as e:
                    print(f"Error in file {file_name}: {e}")

df_2 = pd.DataFrame(all_data_2)



  final_array[1:] = differences[1:] / means[1:]
  signal, sr = librosa.load(file_path, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error in file segment_120.mp3: 


In [23]:
df_2.tail()

Unnamed: 0,f0_mean,f0_median,f0_std,f0_min,f0_max,f0_range,jitter,f0_values,centroid_mean,centroid_std,bandwidth_mean,bandwidth_std,rolloff_mean,rolloff_std,rms_mean,rms_std,trimmed_freqs_mean,trimmed_mag_mean,gender,language
546,586.123248,471.745746,332.552622,88.106457,1617.598832,1529.492375,0.384436,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",580.198336,429.476782,1211.933562,537.332641,1246.530551,1358.433024,0.286959,0.005717,3999.95,19.92444,1,german
547,496.301688,430.062103,248.297707,121.211471,1422.729517,1301.518046,0.385865,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 268.61373733021...",643.659602,439.791103,1251.496195,520.882672,1421.151158,1498.897733,0.268581,0.007502,3999.95,20.240731,1,german
548,428.863123,377.422479,197.875767,114.317711,1576.913543,1462.595832,0.434583,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 470.667185...",707.288858,471.365405,1334.626157,539.288105,1659.095447,1677.035314,0.245605,0.009015,3999.95,19.938802,1,german
549,327.963124,321.853447,95.546219,224.786237,541.19905,316.412813,0.156602,"[0.0, 541.1990497848028, 481.112973770676, 428...",1434.673253,400.327263,1628.500431,253.301707,2808.70607,901.083622,0.303248,0.006034,3999.95,23.805932,1,german
550,657.883767,487.192869,373.052879,219.595846,1656.336187,1436.740341,0.508118,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1386.620681,333.365089,1596.231063,181.706548,2718.89976,583.181565,0.287675,0.005964,3999.95,23.717163,1,german
