# Training notebook 

* Design a pipeline to load individual videos and stream them to model in 3 second chunks 
    * load a file. split the audio file into 960 ms clips 
    * once we are out of clips we load another file
* Import VGGish for embedding 1 second chunks
* Stream data to the model and train LSTM + feed-forward 
* Visualize loss before hyperparameter tuning 

In [58]:
import torch
from torch import nn, optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [137]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print("Number of class 1 examples:", train[train['class'] == 1].shape[0])
print("Number of class 0 examples:", train[train['class'] == 0].shape[0])

total_length_class_1 = train[train['class'] == 1]['length (minutes)'].sum()
total_length_class_0 = train[train['class'] == 0]['length (minutes)'].sum()

print("Total length of class 1 examples:", total_length_class_1)
print("Total length of class 0 examples:", total_length_class_0)

Number of class 1 examples: 76
Number of class 0 examples: 23
Total length of class 1 examples: 388.9250442176871
Total length of class 0 examples: 586.8637993197277


In [60]:
import torchaudio
from torchaudio.prototype.pipelines import VGGISH

# Get the pre-trained VGGish model
vggish = VGGISH.get_model()

print(vggish)

  return torch.load(path)


VGGish(
  (features_network): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mo

In [61]:
class SentimentLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.5):
        super(SentimentLSTM, self).__init__()
        
        # Define the LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        
        # Fully connected layers
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, 8)
        self.fc3 = nn.Linear(8, output_size)
        
        # Activation function
        self.activation = nn.LeakyReLU()
        self.final_activation = nn.Sigmoid()

    def forward(self, x):
        # LSTM forward pass
        lstm_out, _ = self.lstm(x)
        
        # Take the output from the last time step
        last_output = lstm_out[:, -1, :]
        
        # Pass through fully connected layers with activation
        x = self.fc1(last_output)
        x = self.activation(x)
        x = self.fc2(x)
        x = self.activation(x)
        x = self.fc3(x)
        
        # Apply final activation
        return self.final_activation(x)

In [62]:
# create model
input_size = 128    # Output size of VGGish features
hidden_size = 128   # hidden state size of the LSTM
num_layers = 2      # Number of stacked LSTM layers
output_size = 1     # Binary classification (positive/negative sentiment)
dropout = 0.3       # Dropout for regularization

model = SentimentLSTM(input_size, hidden_size, num_layers, output_size, dropout=dropout)

In [119]:
import numpy as np

def extract_features(audio_path, vggish, device="cpu"):
    sampler = torchaudio.transforms.Resample(orig_freq=44100, new_freq=16000).to(device)
    spectrogram_converter = torchaudio.transforms.MelSpectrogram(
        sample_rate=16000,
        n_mels=64,        # Matches VGGish input
        n_fft=400,        # 25 ms window size
        hop_length=160,   # 10 ms hop size
        f_min=125,        # Minimum frequency for mel filter bank
        f_max=7500        # Maximum frequency for mel filter bank
    ).to(device)

    # Load audio file
    audio_data, sr = torchaudio.load(audio_path)
    audio_data = sampler(audio_data)  # Resample to 16 kHz
    audio_data= spectrogram_converter(audio_data)  # Log mel spectrogram
    audio_data = audio_data.mean(dim=0, keepdim=True) # convert to mono

    # Extract VGGish features
    features = []
    for sample in audio_data.split(96, dim=2):
        if sample.size(2) < 96:
            padding = 96 - sample.size(2)
            sample = torch.nn.functional.pad(sample, (0, padding), mode='constant', value=0)
        sample = sample.unsqueeze(1).to(device)
        features.append(vggish(sample.to(device)).squeeze(0))
        
    print("extracted features for", audio_path)
    return features

In [127]:
def training_loop(epochs, train, model, batch_size=32, device="cpu"):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_func = nn.BCELoss()

    train_losses = []

    for epoch in range(epochs):
        total_loss = 0.0
        sample_count = 0
        batch_features = []
        batch_labels = []

        for idx, row in train.iterrows():
            # Extract features and label
            features = extract_features(row["path"], vggish, device)
            label = torch.tensor([row["class"]], dtype=torch.float32).to(device)

            for feature in features:
                # Accumulate features and labels for batching
                batch_features.append(feature.unsqueeze(0).unsqueeze(0))
                batch_labels.append(label)

                # If we reach batch size, process the batch
                if len(batch_features) == batch_size:
                    batch_features = torch.cat(batch_features).to(device)
                    batch_labels = torch.cat(batch_labels).unsqueeze(1).to(device)

                    # Compute predictions and loss
                    predictions = model(batch_features)
                    loss = loss_func(predictions, batch_labels)

                    # Update model
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    total_loss += loss.item() * batch_size
                    sample_count += batch_size

                    # Reset batch
                    batch_features = []
                    batch_labels = []


                # Handle leftover features in the last batch
                if batch_features:
                    batch_features = torch.cat(batch_features).to(device)
                    batch_labels = torch.cat(batch_labels).unsqueeze(1).to(device)

                    predictions = model(batch_features)
                    loss = loss_func(predictions, batch_labels)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    total_loss += loss.item() * len(batch_features)
                    sample_count += len(batch_features)

                    # Reset batch
                    batch_features = []
                    batch_labels = []

                avg_loss = total_loss / sample_count if sample_count > 0 else 0
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.6f}")
                train_losses.append(avg_loss)

                if idx > 0:
                    break  # only train on one example for now

    return train_losses


In [128]:
train_losses = training_loop(1, train, model)
print(train_losses)

extracted features for data/dangerous/ANOTHER FIGHT ON THE 2 TRAIN NYC SMFH LMFAO! @SHOTIMETV ON IG.wav
Epoch 1/1, Loss: 12.959568
Epoch 1/1, Loss: 12.833080
Epoch 1/1, Loss: 12.623137
Epoch 1/1, Loss: 12.419320
Epoch 1/1, Loss: 12.286698
Epoch 1/1, Loss: 12.143320
Epoch 1/1, Loss: 11.984709
Epoch 1/1, Loss: 11.806667
Epoch 1/1, Loss: 11.655394
Epoch 1/1, Loss: 11.417984
Epoch 1/1, Loss: 11.245034
Epoch 1/1, Loss: 11.094389
Epoch 1/1, Loss: 10.943359
Epoch 1/1, Loss: 10.793807
Epoch 1/1, Loss: 10.654155
Epoch 1/1, Loss: 10.508861
Epoch 1/1, Loss: 10.347005
Epoch 1/1, Loss: 10.175807
Epoch 1/1, Loss: 10.018629
Epoch 1/1, Loss: 9.816680
Epoch 1/1, Loss: 9.641315
Epoch 1/1, Loss: 9.488691
Epoch 1/1, Loss: 9.330985
Epoch 1/1, Loss: 9.171708
Epoch 1/1, Loss: 8.991265
Epoch 1/1, Loss: 8.817975
Epoch 1/1, Loss: 8.649133
Epoch 1/1, Loss: 8.488380
Epoch 1/1, Loss: 8.318951
Epoch 1/1, Loss: 8.154168
Epoch 1/1, Loss: 7.993641
Epoch 1/1, Loss: 7.808304
Epoch 1/1, Loss: 7.633723
Epoch 1/1, Loss: 7.

KeyboardInterrupt: 