In [3]:
!pip install pandas
!pip install numpy
!pip install torch torchvision
!pip install scikit-learn


import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#WAYS TO FINETUNE
#Learning rate, batch size, number of  epochs, different optimizer  functions different dropout rates, different architecture.


#This cell is going to be just for data loading and Preprocessing
positive_df = pd.read_csv('positive_dataset_2.csv')
negative_df = pd.read_csv('negative_dataset_2.csv')
cat2_df = pd.read_csv('cat2_master_.csv')
cat3_df = pd.read_csv('cat3_master_.csv')


positive_df['label'] = 1 #buzz present
negative_df['label'] = 0 #buzz not present (import for before-hand shuffling)

#PREVIOUS  IMPLEMENTATION
# Combine and shuffle to learn from both DS and model without structure
# combined_df = pd.concat([positive_df, negative_df], axis=0)
# combined_df = combined_df.sample(frac=1).reset_index(drop=True)


# Ensure the datasets are aligned by a common identifier: ASK MARIE HOW WE ARE GOING TO IDENTITY INVASIVE VS NONINVASE IN HER SCRIPT
# # Assuming all datasets have an 'id' column for alignment
# positive_df = positive_df.merge(cat2_df, on='id', how='left')
# negative_df = negative_df.merge(cat3_df, on='id', how='left')

#IN THE MEANTIME USE TRUNCATION
# min_len_pos = min(len(positive_df), len(cat2_df))
# min_len_neg = min(len(negative_df), len(cat3_df))

# positive_df = positive_df.iloc[:min_len_pos].reset_index(drop=True)
# cat2_df = cat2_df.iloc[:min_len_pos].reset_index(drop=True)

# negative_df = negative_df.iloc[:min_len_neg].reset_index(drop=True)
# cat3_df = cat3_df.iloc[:min_len_neg].reset_index(drop=True)

# # Merge datasets based on their index positions
# positive_df = positive_df.join(cat2_df, rsuffix='_cat2')
# negative_df = negative_df.join(cat3_df, rsuffix='_cat3')

#NEW APPROCH
min_length = min(len(positive_df), len(negative_df), len(cat2_df), len(cat3_df))
# Truncate each dataset to the minimum length
positive_df = positive_df.iloc[:min_length]
negative_df = negative_df.iloc[:min_length]
cat2_df = cat2_df.iloc[:min_length]
cat3_df = cat3_df.iloc[:min_length]
positive_df = positive_df.reset_index(drop=True).join(cat2_df.reset_index(drop=True), rsuffix='_cat2')
negative_df = negative_df.reset_index(drop=True).join(cat3_df.reset_index(drop=True), rsuffix='_cat3')

# Handle any missing values that may have resulted from the merge
positive_df.fillna(0, inplace=True)
negative_df.fillna(0, inplace=True)

# Combine and shuffle datasets
combined_df = pd.concat([positive_df, negative_df], axis=0)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

combined_df = combined_df.apply(pd.to_numeric, errors='coerce')

# Extract features and labels, store in Numpyarray
X = combined_df.drop(columns=['label']).values
y = combined_df['label'].values

# Reshape X to add a channel dimension for the number of channels parameter(required by 1D CNN)
X = np.expand_dims(X, axis=2)

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (746, 77, 1)
Shape of y: (746,)


In [4]:
#This cell will be  responsible for  fitting the  data  into acceptable pytorch formats

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

dataset = TensorDataset(X_tensor, y_tensor)  #we need a dataset that stores the tensors for pytorch to work with it

# Spliting
train_size = int(0.8 * len(dataset)) #using an 80-20 training-testing split
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size]) #split the model randomly which helps with model generalization

# Create DataLoader objects
batch_size = 128 # Consider changing this to a  stochastic mode
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
print("DataLoader objects created successfully!")

DataLoader objects created successfully!


In [8]:
#This cell will be responsible for the architecture of the 1D CNN model
class CNN1D(nn.Module):
    def __init__(self):
        super(CNN1D, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3)
        self.bn1 = nn.BatchNorm1d(64)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout = nn.Dropout(p=0.2)
        self.flatten = nn.Flatten()

        # Calculate the size after conv2 and pool layers
        self._to_linear = None
        self.convs(torch.randn(1, 1, 77))

        self.fc1 = nn.Linear(self._to_linear, 128)  # Adjust input size based on the output of conv2/pool
        self.fc2 = nn.Linear(128, 1)

    def convs(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        print(f"After conv1, bn1 and pool: {x.shape}")
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        print(f"After conv2, bn2 and pool: {x.shape}")
        if self._to_linear is None:
            self._to_linear = x[0].numel()
        return x

    def forward(self, x):
        x = self.convs(x)
        x = self.dropout(x)
        x = self.flatten(x)
        print(f"After flatten: {x.shape}")
        x = F.relu(self.fc1(x))
        print(f"After fc1: {x.shape}")
        x = self.dropout(x)
        x = torch.sigmoid(self.fc2(x))  # Ensure sigmoid activation for output between 0 and 1
        print(f"Final output: {x}")  # Ensure sigmoid activation for output between 0 and 1
        return x

    def initialize_weights(m):
        if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
                model.apply(initialize_weights)

model = CNN1D()

After conv1, bn1 and pool: torch.Size([1, 64, 37])
After conv2, bn2 and pool: torch.Size([1, 128, 17])


In [9]:
#This cell will be responsible for training validation and testing
# Define loss function and optimizer
criterion = nn.BCELoss() #I Used the Binray cross-entropy loss function to give our binary classification task
optimizer = optim.Adam(model.parameters(), lr=0.0001) #think about change from the Adam optimizer and learning rate

# Training function
def train_model(model, train_loader, criterion, optimizer, num_epochs=50):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0 #could chnage this basd on actual loss per epoch
        for inputs, labels in train_loader:
            inputs = inputs.permute(0, 2, 1)  # Change shape to (batch_size, channels, sequence_length)
            optimizer.zero_grad()
            outputs = model(inputs)

            if torch.any(outputs < 0) or torch.any(outputs > 1):
                print(f"Epoch {epoch + 1}: Model output out of range")
                print(f"Outputs: {outputs}")
                raise ValueError(f"Model output out of range: {outputs}")

            loss = criterion(outputs.squeeze(), labels) #remove dim of size 1 with squeeze
            loss.backward() #gradient computation
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            running_loss += loss.item() #aacumlaite loss for each step
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():   #takes out gradadient compauition becavsue we dont need it and can save on memeroy soace
        for inputs, labels in test_loader:
            inputs = inputs.permute(0, 2, 1)  # Change shape to (batch_size, channels, sequence_length)
            outputs = model(inputs)

            if torch.any(outputs < 0) or torch.any(outputs > 1):
                raise ValueError(f"Model output out of range: {outputs}")

            predicted = (outputs.squeeze() > 0.5).float() #setting the threshold to 0.5 for binary classification or we could make this aan or statement
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    print(f'Accuracy: {accuracy:.4f}')

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs=50)

# Evaluate the model
evaluate_model(model, test_loader)

After conv1, bn1 and pool: torch.Size([128, 64, 37])
After conv2, bn2 and pool: torch.Size([128, 128, 17])
After flatten: torch.Size([128, 2176])
After fc1: torch.Size([128, 128])
Final output: tensor([[nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [na

RuntimeError: all elements of input should be between 0 and 1