# In [None]:
import pandas as pd
# Location of the exported CSV; the whole file is read into one DataFrame.
path = "C:/Users/shrra/Downloads/2024-06-25T13-17_export.csv"
datafram = pd.read_csv(path)



def determine_new_race_5(row):
    """Return the ``race___5`` flag, cleared for Hispanic or Latino rows.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            ``'race___5'`` key and, when that flag is truthy, the
            ``'ethnicity'`` key.

    Returns:
        ``False`` when ``race___5`` is truthy and ``ethnicity`` equals
        ``"Hispanic or Latino"``; otherwise the original ``race___5`` value,
        unchanged.
    """
    race_flag = row['race___5']
    if not race_flag:
        # Falsy flags pass straight through; ethnicity is never consulted.
        return race_flag
    is_hispanic = row['ethnicity'] == "Hispanic or Latino"
    return False if is_hispanic else race_flag

# Derive the adjusted race___5 flag row by row and store it as a new column
datafram['new_race___5'] = datafram.apply(determine_new_race_5, axis=1)

# Drop every row where race___5 is False while race___8 is True
race_5_unset = datafram['race___5'].eq(False)
race_8_set = datafram['race___8'].eq(True)
datafram = datafram[~(race_5_unset & race_8_set)]

# race_columns = ['race___1', 'race___2', 'race___3', 'race___4', 'race___5','race___6', 'race___7','race___8','new_race___5']  
# race_counts = datafram[race_columns].sum()
# plt.figure(figsize=(10, 6))
# sns.barplot(x=race_counts.index, y=race_counts.values, palette='viridis')
# plt.xlabel('Race')
# plt.ylabel('Count')
# plt.title('Number of Counts Available for Each Race')
# plt.show()


# income_counts = datafram['household_income_usa'].value_counts()

# # Plotting the counts
# plt.figure(figsize=(10, 6))
# sns.barplot(x=income_counts.index, y=income_counts.values, palette='viridis')
# plt.xlabel('Household Income')
# plt.ylabel('Count')
# plt.title('Number of Counts Available for Each Household Income Bracket')
# plt.show()

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import librosa
import numpy as np
import matplotlib.pyplot as plt

# Custom Dataset with Spectrogram Conversion and Padding
class AudioDataset(Dataset):
    """Dataset yielding fixed-width log-mel spectrograms with integer labels.

    Each item is produced on demand: the audio file is loaded, converted to a
    mel spectrogram in decibels, and padded or truncated along the time axis
    to ``max_len`` frames.

    Args:
        file_paths: Sequence of audio file paths, indexable by position.
        labels: Label container supporting ``.iloc[idx]`` (e.g. a pandas
            Series); entry ``idx`` must be the label for ``file_paths[idx]``.
        max_len: Target number of spectrogram frames (time axis).
        n_fft: FFT window size passed to ``librosa.feature.melspectrogram``.
        hop_length: Hop length passed to ``librosa.feature.melspectrogram``.
        n_mels: Number of mel bands (height of the spectrogram).
    """

    def __init__(self, file_paths, labels, max_len=10000, n_fft=2048, hop_length=512, n_mels=128):
        self.file_paths = file_paths
        self.labels = labels
        self.max_len = max_len
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)`` tensors for position ``idx``.

        The spectrogram is float32 with a leading channel axis added by
        ``unsqueeze(0)``; the label is a ``torch.long`` scalar tensor.
        """
        file_path = self.file_paths[idx]
        spectrogram = self.wav_to_spectrogram(file_path)
        spectrogram = self.pad_spectrogram(spectrogram, self.max_len)
        label = self.labels.iloc[idx]
        return torch.tensor(spectrogram, dtype=torch.float32).unsqueeze(0), torch.tensor(label, dtype=torch.long)

    def wav_to_spectrogram(self, file_path):
        """Load ``file_path`` and return its mel spectrogram in decibels.

        The audio is loaded with ``sr=None`` and the power spectrogram is
        converted with ``ref=np.max``, so values are relative to the file's
        own peak (the peak maps to 0 dB, everything else is negative).
        """
        y, sr = librosa.load(file_path, sr=None)
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels)
        S_db = librosa.power_to_db(S, ref=np.max)
        return S_db

    def pad_spectrogram(self, spec, max_len, pad_value=None):
        """Pad or truncate ``spec`` along axis 1 to exactly ``max_len`` columns.

        Args:
            spec: 2-D array whose second axis is time.
            max_len: Desired number of columns.
            pad_value: Fill value for added columns. When ``None`` the
                minimum of ``spec`` is used (0.0 for an empty array).

        Returns:
            Array with ``max_len`` columns.
        """
        width = spec.shape[1]
        if width >= max_len:
            return spec[:, :max_len]
        if pad_value is None:
            # In a peak-referenced dB spectrogram 0 is the loudest value, so
            # zero-padding would add full-energy frames. Pad with the quietest
            # observed value so the padding reads as silence.
            pad_value = spec.min() if spec.size else 0.0
        return np.pad(
            spec,
            ((0, 0), (0, max_len - width)),
            mode='constant',
            constant_values=pad_value,
        )

# Load the DataFrame
df = datafram

selected_columns = ['record_id', 'demographics_session_id', 'new_race___5']
# Take an explicit copy of the column selection: assigning a new column onto
# a selection of another DataFrame triggers pandas' SettingWithCopyWarning
# and leaves it ambiguous which object is being modified.
df = df[selected_columns].copy()

# Relative directory of each recording session: "sub-<record>/ses-<session>"
df['subject_id'] = 'sub-' + df['record_id'] + '/ses-' + df['demographics_session_id']

# Split the DataFrame into train, validation, and test sets
# (80/20 train+val/test, then 80/20 train/val, fixed seed for repeatability).
# NOTE(review): rows are split independently, so two sessions of the same
# record_id can land in different splits -- confirm this is acceptable.
train_ids, test_ids = train_test_split(df['subject_id'], test_size=0.2, random_state=42)
train_ids, val_ids = train_test_split(train_ids, test_size=0.2, random_state=42)

def get_file_paths(subject_ids, base_dir='C:/Users/shrra/Downloads/Bridge2AI/bridge2ai-voice-corpus-2-including-sensitive-recordings1/bids_with_sensitive_recordings/'):
    """Locate the Rainbow-Passage recording for each subject/session.

    For every entry of ``subject_ids`` the directory
    ``<base_dir>/<subject_id>/audio`` is listed and the first file whose name
    ends with ``Rainbow-Passage_rec-Rainbow-Passage.wav`` is taken. Subjects
    with no such directory or file are reported on stdout and skipped.

    Args:
        subject_ids: Iterable of relative session paths.
        base_dir: Root directory containing the per-subject folders.

    Returns:
        ``(file_paths, valid_ids)``: two parallel lists holding the found
        file paths and the subject ids they belong to, in input order.
    """
    wanted_suffix = 'Rainbow-Passage_rec-Rainbow-Passage.wav'
    file_paths, valid_ids = [], []
    for subject_id in subject_ids:
        session_dir = os.path.join(base_dir, subject_id, 'audio')
        candidates = os.listdir(session_dir) if os.path.exists(session_dir) else []
        # First matching name in listing order, or None when nothing matches.
        match = next((name for name in candidates if name.endswith(wanted_suffix)), None)
        if match is None:
            print(f"Missing file for subject_id: {subject_id}")
            continue
        file_paths.append(os.path.join(session_dir, match))
        valid_ids.append(subject_id)
    return file_paths, valid_ids

train_file_paths, train_valid_ids = get_file_paths(train_ids)
val_file_paths, val_valid_ids = get_file_paths(val_ids)
test_file_paths, test_valid_ids = get_file_paths(test_ids)

# Build each split's frame in the SAME order as its file-path list.
# The datasets pair file_paths[i] with the i-th label row, and the valid-id
# lists follow the shuffled split order. A plain `isin` filter would keep the
# original row order instead and silently attach labels to the wrong
# recordings. Looking rows up by subject_id preserves the required order and
# gives exactly one row per file path.
rows_by_subject = df.drop_duplicates(subset='subject_id').set_index('subject_id', drop=False)
train_df = rows_by_subject.loc[train_valid_ids].reset_index(drop=True)
val_df = rows_by_subject.loc[val_valid_ids].reset_index(drop=True)
test_df = rows_by_subject.loc[test_valid_ids].reset_index(drop=True)

# Determine the maximum length of the spectrograms for padding.
# max_len is measured in spectrogram frames (columns), not audio samples.
# The previous floor of 16000*30 is presumably a sample count (30 s at
# 16 kHz); used as a frame count it forced every input to at least 480000
# columns. Convert it to frames with the hop length of 512 that both
# librosa and AudioDataset use by default.
max_len = 1 + (16000 * 30) // 512
for file_path in train_file_paths + val_file_paths + test_file_paths:
    y, sr = librosa.load(file_path, sr=None)
    S = librosa.feature.melspectrogram(y=y, sr=sr)
    max_len = max(max_len, S.shape[1])

print(f"Maximum spectrogram length: {max_len}")

# Create datasets: one per split, all padded to the same spectrogram width
label_column = 'new_race___5'
train_dataset = AudioDataset(train_file_paths, train_df[label_column], max_len=max_len)
val_dataset = AudioDataset(val_file_paths, val_df[label_column], max_len=max_len)
test_dataset = AudioDataset(test_file_paths, test_df[label_column], max_len=max_len)

# Create dataloaders: only the training data is shuffled
batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Define CNN model
class SimpleCNN(nn.Module):
    """Small two-stage convolutional classifier for single-channel 2-D input.

    Two 3x3 convolutions (16 then 32 channels, padding 1), each followed by
    ReLU and 2x2 max pooling, feed a 128-unit hidden layer and a linear
    output layer with ``num_classes`` units.

    Args:
        input_size: ``(height, width)`` of one input; both dimensions are
            halved twice by pooling, which fixes the size of ``fc1``.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        height, width = input_size[0], input_size[1]
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Two 2x2 poolings shrink each spatial dimension by a factor of 4.
        flat_features = 32 * (height // 4) * (width // 4)
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits for a batch ``x`` of single-channel inputs."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(self.relu(conv(x)))
        flat = x.view(x.size(0), -1)  # Flatten everything except the batch axis
        hidden = self.relu(self.fc1(flat))
        return self.fc2(hidden)

# Initialize model, loss function, and optimizer
num_classes = 2  # Adjust based on your labels
input_size = (128, max_len)  # Adjust based on the shape of your spectrogram
model = SimpleCNN(input_size=input_size, num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop: each epoch runs one optimisation pass over train_loader,
# then one gradient-free evaluation pass over val_loader.
num_epochs = 20
for epoch in range(num_epochs):
    # Switch to training mode before the optimisation pass.
    model.train()
    running_loss = 0.0  # sum of the per-batch losses for this epoch
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Reported loss is the mean of the per-batch losses (sum / number of batches).
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

    # Validation loop
    model.eval()
    val_running_loss = 0.0  # sum of the per-batch validation losses
    val_correct = 0  # number of validation samples predicted correctly
    val_total = 0  # number of validation samples seen
    with torch.no_grad():  # no gradients are needed while evaluating
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)  # index of the largest logit per sample
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
    
    # NOTE(review): both divisions below raise ZeroDivisionError when
    # val_loader yields no samples.
    val_accuracy = val_correct / val_total
    print(f'Validation Accuracy: {val_accuracy:.4f}, Loss: {val_running_loss/len(val_loader):.4f}')

# Test loop: collect true and predicted labels for the whole test set, then
# derive the counts and accuracy from the collected lists.
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

test_total = len(y_true)
test_correct = sum(int(truth == guess) for truth, guess in zip(y_true, y_pred))

test_accuracy = test_correct / test_total
print(f'Test Accuracy: {test_accuracy:.4f}')

# Compute confusion matrix
# Pin the label set so the matrix is always 2x2. Without `labels`, the
# classes are inferred from the data, so a test set (or a model) that only
# ever produces one class yields a 1x1 matrix that no longer matches the two
# display labels below.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap=plt.cm.Blues)
plt.show()
