<a href="https://colab.research.google.com/github/DylanCTY/TextAnalyticsProject_Group5/blob/main/Gender_Classification_LR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip Gender.zip

In [None]:
# Install pydub, which provides a simple interface for working with audio files
!pip install pydub

In [None]:
import os
from pydub import AudioSegment

def convert_m4a_to_wav(input_folder, output_folder):
    """Convert every ``.m4a`` file in ``input_folder`` to a ``.wav`` file.

    Each output keeps the base name of its source file and is written to
    ``output_folder``, which is created first if it does not exist. Entries
    whose names do not end in ``.m4a`` are skipped; sub-folders are not
    searched.

    Args:
        input_folder: Directory to scan for ``.m4a`` files.
        output_folder: Directory that receives the ``.wav`` files.
    """
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        if not entry.endswith(".m4a"):
            continue

        stem, _ = os.path.splitext(entry)
        source_path = os.path.join(input_folder, entry)
        target_path = os.path.join(output_folder, stem + ".wav")

        # Decode the source with pydub and immediately re-encode it as WAV.
        AudioSegment.from_file(source_path, format="m4a").export(target_path, format="wav")

# Source directory with the .m4a recordings, and the directory that will
# receive the converted .wav files.
input_folder = "Gender"
output_folder = "Gender_WAV"

# Run the conversion; the message is printed only if the call returns normally.
convert_m4a_to_wav(input_folder=input_folder, output_folder=output_folder)
print("Conversion completed successfully.")

Conversion completed successfully.


Preprocessing:
Resampling the Audio:
Resampling is done to ensure that all audio files have the same sampling rate. This is important for consistency in feature extraction.


In [None]:
import librosa

def resample_audio(audio_file, target_sr=16000):
    """Load ``audio_file`` with librosa at a fixed sampling rate.

    Args:
        audio_file: Path of the audio file to load.
        target_sr: Sampling rate in Hz handed to ``librosa.load``.

    Returns:
        The ``(samples, sampling_rate)`` pair produced by ``librosa.load``.
        Nothing is written back to disk.
    """
    return librosa.load(audio_file, sr=target_sr)


Normalizing Volume Levels:
Normalizing volume levels ensures that the amplitude of all audio files is within a consistent range.

In [None]:
from pydub import AudioSegment

def normalize_volume(audio_file, target_dBFS=-20):
    """Rescale a WAV file in place so its loudness equals ``target_dBFS``.

    The file is read with pydub, a uniform gain equal to the difference
    between ``target_dBFS`` and the clip's current ``dBFS`` is applied, and
    the result overwrites ``audio_file``.

    A completely silent clip reports a ``dBFS`` of negative infinity, which
    would make the required gain infinite. Such a clip is written back
    without any gain instead.

    Args:
        audio_file: Path to a ``.wav`` file; the file is overwritten.
        target_dBFS: Desired loudness in dBFS.

    Returns:
        Whatever ``AudioSegment.export`` returns (per the pydub documentation,
        a handle to the written file).
    """
    sound = AudioSegment.from_wav(audio_file)
    current_dBFS = sound.dBFS

    if current_dBFS == float("-inf"):
        # Digital silence cannot be lifted to the target level by any gain.
        return sound.export(audio_file, format="wav")

    change_in_dBFS = target_dBFS - current_dBFS
    normalized_sound = sound + change_in_dBFS
    return normalized_sound.export(audio_file, format="wav")


Removing Noise:
Noise removal helps improve the signal-to-noise ratio in the audio, making it easier for the model to extract relevant features.

In [None]:
import numpy as np
import scipy.signal as sg

def remove_noise(audio_file, window_length=21, threshold=0.05):
    """Replace outlier samples of an audio file with their local median.

    The file is loaded with ``librosa.load`` (no sampling rate is passed, so
    librosa's default applies). A median filter is run over the signal, and
    every sample that differs from its median-filtered value by more than
    ``threshold`` is replaced by that filtered value. All other samples are
    left untouched. Nothing is written back to disk.

    Args:
        audio_file: Path of the audio file to load.
        window_length: Kernel size passed to ``scipy.signal.medfilt``.
        threshold: Absolute deviation above which a sample counts as noise.

    Returns:
        ``(cleaned_samples, sampling_rate)``.
    """
    signal, rate = librosa.load(audio_file)

    # Median-smoothed version of the signal. Note this is a median filter,
    # not a frequency-selective low-pass filter.
    smoothed = sg.medfilt(signal, kernel_size=window_length)

    # Flag samples that stray too far from their local median.
    is_noisy = np.abs(signal - smoothed) > threshold

    # Work on a copy so the loaded signal itself is never modified.
    cleaned = np.copy(signal)
    cleaned[is_noisy] = smoothed[is_noisy]
    return cleaned, rate


In [None]:
import os

import numpy as np
from scipy.io import wavfile

# Define the path to the folder containing audio files
folder_path = "Gender_WAV"


def _write_wav(path, samples, sample_rate):
    """Overwrite ``path`` with ``samples`` stored as 16-bit PCM WAV.

    ``samples`` is assumed to be a float array in [-1, 1], which is what
    ``librosa.load`` is documented to return. Values outside that range are
    clipped before conversion.
    """
    pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    wavfile.write(path, sample_rate, pcm)


# Apply each preprocessing step to every .wav file and persist the result.
# The helper functions take a file path and return arrays, so each step's
# output has to be written back to disk for the next step (and for later
# feature extraction) to see it. Previously the resampled and denoised
# arrays were computed and then thrown away.
for filename in os.listdir(folder_path):
    if not filename.endswith(".wav"):
        continue
    audio_file_path = os.path.join(folder_path, filename)

    # Resample the audio and store it at the new rate
    y_resampled, sr_resampled = resample_audio(audio_file_path)
    _write_wav(audio_file_path, y_resampled, sr_resampled)

    # Normalize volume levels (rewrites the file itself)
    normalize_volume(audio_file_path)

    # Remove noise and store the cleaned signal.
    # NOTE(review): remove_noise reloads the file without passing a sampling
    # rate, so sr_cleaned may differ from sr_resampled; the file is saved at
    # whatever rate remove_noise reports.
    y_cleaned, sr_cleaned = remove_noise(audio_file_path)
    _write_wav(audio_file_path, y_cleaned, sr_cleaned)


Statistical Features
1. Means
2. Standard Deviation
3. Maximum Value
4. Minimum Value

In [None]:
import os
import numpy as np
import librosa

def extract_mfcc(audio_file):
    """Compute 13 MFCCs for an audio file.

    The file is loaded with ``librosa.load`` using its default settings.

    Returns:
        The matrix from ``librosa.feature.mfcc``, transposed so that the
        coefficients run along the columns.
    """
    signal, rate = librosa.load(audio_file)
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)
    return coefficients.transpose()

def compute_statistics(features):
    """Summarise ``features`` along its first axis.

    Args:
        features: Array-like input; every statistic is reduced over axis 0.

    Returns:
        A 4-tuple ``(mean, std_dev, min_val, max_val)``.
    """
    reducers = (np.mean, np.std, np.min, np.max)
    return tuple(reduce(features, axis=0) for reduce in reducers)

# Directory containing audio files
data_dir = "Gender_WAV"

# One list per output column; index i in every list describes the same file.
file_names = []
gender_labels = []
mean_features = []
std_dev_features = []
min_features = []
max_features = []

# os.listdir returns entries in arbitrary order. Sorting fixes the row order
# of audio_data.txt, so a later seeded train/test split is reproducible.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith(".wav"):
        continue
    audio_file_path = os.path.join(data_dir, filename)

    # Label from the file name (assumed format: <gender>_<ID>.wav):
    # 1 = female, 0 = anything else. The check looks for "female" because
    # "male" is a substring of it, and it ignores case so that names such as
    # "Female_01.wav" are not silently labelled 0.
    gender_label = 1 if "female" in filename.lower() else 0

    # Summary statistics are taken directly over the raw waveform samples,
    # giving one scalar per statistic for each file.
    y, sr = librosa.load(audio_file_path)
    mean, std_dev, min_val, max_val = compute_statistics(y)

    # Append data to lists (tolist() turns NumPy scalars into plain floats)
    file_names.append(filename)
    gender_labels.append(gender_label)
    mean_features.append(mean.tolist())
    std_dev_features.append(std_dev.tolist())
    min_features.append(min_val.tolist())
    max_features.append(max_val.tolist())

# Save data to a comma-separated text file with a single header row.
# NOTE(review): a file name containing a comma would corrupt its row.
with open("audio_data.txt", "w") as file:
    file.write("Name,Gender,Mean,Std Dev,Min,Max\n")
    rows = zip(
        file_names,
        gender_labels,
        mean_features,
        std_dev_features,
        min_features,
        max_features,
    )
    for row_name, row_label, row_mean, row_std, row_min, row_max in rows:
        file.write(f"{row_name},{row_label},{row_mean},{row_std},{row_min},{row_max}\n")


In [None]:
from sklearn.model_selection import train_test_split

# Read the feature table; every cell is loaded as a string because the
# first column holds file names rather than numbers.
data = np.loadtxt("audio_data.txt", delimiter=",", skiprows=1, dtype=str)

# Column 0 is the file name, column 1 the gender label, and the remaining
# columns are the features (mean, std dev, min, max).
y = data[:, 1].astype(int)
X = data[:, 2:].astype(float)

# Hold out 40% of the rows for testing; the fixed seed makes the split
# repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Report how many rows and columns ended up in each split.
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Training set shape: (60, 4) (60,)
Testing set shape: (40, 4) (40,)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Learn the scaling from the training split only, then apply that same
# scaling to both splits so no test-set statistics reach the model.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression with scikit-learn's default settings, fitted on the
# scaled training data.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Score the fitted model on the held-out split.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.475


In [None]:
# GridSearchCV is not imported by any other cell, so import it here to keep
# this cell runnable on a fresh kernel.
from sklearn.model_selection import GridSearchCV

# Define an expanded hyperparameters grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Regularization strength
    'solver': ['liblinear', 'lbfgs', 'sag', 'newton-cg'],  # Optimization algorithm
    'penalty': ['l2'],  # Regularization penalty compatible with lbfgs, sag, and newton-cg
    'max_iter': [100, 200, 300]  # Maximum number of iterations
}

# Perform grid search cross-validation: every combination in the grid is
# scored by 5-fold cross-validated accuracy on the scaled training split.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the best model on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




Best Hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 0.5


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predictions on the held-out split.
# NOTE(review): this scores `model`, not the grid-search `best_model` —
# confirm that evaluating the untuned model is intended.
y_pred = model.predict(X_test_scaled)

# Each metric compares the true test labels with the predictions; label 1
# is the positive class for precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# One line per metric, rounded to two decimals.
print(
    "Evaluation Metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-Score: {f1:.2f}",
    sep="\n",
)


Evaluation Metrics:
Accuracy: 0.47
Precision: 0.70
Recall: 0.28
F1-Score: 0.40


Accuracy: The proportion of correctly classified samples out of the total number of samples. In this case, the accuracy is 0.47, indicating that 47% of the samples were classified correctly.

Precision: The proportion of true positive predictions out of all positive predictions made by the model. A precision of 0.70 means that out of all samples predicted as positive by the model, 70% were actually positive.

Recall: The proportion of true positive predictions out of all actual positive samples in the dataset. A recall of 0.28 indicates that the model correctly identified 28% of all actual positive samples in the dataset.

F1-Score: The harmonic mean of precision and recall, providing a balance between the two metrics. A high F1-score requires both precision and recall to be high, so it indicates a better balance between them. In this case, the F1-score of 0.40 is pulled well below the precision of 0.70 by the low recall of 0.28.

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Reload the feature table; every cell is read as a string because the
# first column holds file names.
data = np.loadtxt("audio_data.txt", delimiter=",", skiprows=1, dtype=str)

# Column 1 is the gender label; columns 2 onward are the features
# (mean, std dev, min, max — the order of the file's header row).
labels = data[:, 1].astype(int)
features = data[:, 2:].astype(float)

# Fit the scaler on every row and scale the features in one step.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Train on the full table (no hold-out) with fixed hyperparameters.
model = LogisticRegression(C=0.1, max_iter=100, penalty='l2', solver='lbfgs')
model.fit(scaled_features, labels)

def predict_gender_from_data(data):
    """Predict gender labels for unscaled feature rows.

    The rows are scaled with the module-level ``scaler`` and then classified
    by the module-level ``model``; both must already be fitted.

    Args:
        data: Feature rows in the same column layout the scaler was fitted on.

    Returns:
        The array of predicted labels from ``model.predict``.
    """
    return model.predict(scaler.transform(data))

# Classify every row of the feature table. These are the same rows the
# model was fitted on, so this is not a held-out evaluation.
predicted_genders = predict_gender_from_data(features)

# Pair each file name (column 0 of the table) with its predicted label and
# report it; label 1 means female, anything else is reported as male.
for audio_name, predicted_label in zip(data[:, 0], predicted_genders):
    if predicted_label == 1:
        print(f"Audio {audio_name} is a female.")
    else:
        print(f"Audio {audio_name} is a male.")

