In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Create label mapping
label_mapping = {
    'ChatGPT': 0,
    'Reddit': 1,
    'Wikipedia': 2,
    'LinkedIn': 3
}

# Convert label column using mapping
train_df.iloc[:, -1] = train_df.iloc[:, -1].map(label_mapping)
test_df.iloc[:, -1] = test_df.iloc[:, -1].map(label_mapping)

# Check for any NaN values in labels 
print(f"NaN values in train labels: {train_df.iloc[:, -1].isna().sum()}")
print(f"NaN values in test labels: {test_df.iloc[:, -1].isna().sum()}")

# Convert feature columns to numeric and handle NaN values
for col in train_df.columns[:-1]:  # exclude label column
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

# Fill NaN values with column means instead of dropping rows
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())

# Verify data types before conversion to tensors
print(f"Data types in train_df:\n{train_df.dtypes}")

# Force convert to int64
train_df.iloc[:, -1] = train_df.iloc[:, -1].astype('int64')
test_df.iloc[:, -1] = test_df.iloc[:, -1].astype('int64')


class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.features = dataframe.iloc[:, :-1].values.astype(np.float32)
        self.labels = dataframe.iloc[:, -1].values.astype(np.int64)
        
        self.features = torch.tensor(self.features, dtype=torch.float32)
        self.labels = torch.tensor(self.labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx].unsqueeze(0), self.labels[idx]
        

# Create datasets and data loaders
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

NaN values in train labels: 0
NaN values in test labels: 0
Data types in train_df:
Packet Count                    int64
Total Length                    int64
Avg Interval (s)              float64
Max Interval (s)              float64
Min Interval (s)              float64
Avg Length (bytes)            float64
Max Length (bytes)              int64
Min Length (bytes)              int64
Most Common Length (bytes)      int64
Label                           int64
dtype: object


In [3]:
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv1d(1, 32, 3, stride=1, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.25),

            nn.Conv1d(32, 64, 3, stride=1, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.25),

            nn.Conv1d(64, 128, 3, stride=1, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.25),
        )

        self.fc = nn.Sequential(
            nn.Linear(128 * 9, 256),  # 128 channels, 9 features
            nn.ReLU(),
            nn.Linear(256, 4)   # since you have 4 classes (ChatGPT, reddit, wiki, linkedin)
        )

    def forward(self, x):
        x = self.conv(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CNNModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)  # your learning rate


num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# Testing
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Epoch [1/30], Loss: 1.4081
Epoch [2/30], Loss: 1.3972
Epoch [3/30], Loss: 1.3461
Epoch [4/30], Loss: 1.3495
Epoch [5/30], Loss: 1.3420
Epoch [6/30], Loss: 1.3411
Epoch [7/30], Loss: 1.3300
Epoch [8/30], Loss: 1.3521
Epoch [9/30], Loss: 1.3338
Epoch [10/30], Loss: 1.3075
Epoch [11/30], Loss: 1.3239
Epoch [12/30], Loss: 1.3138
Epoch [13/30], Loss: 1.2763
Epoch [14/30], Loss: 1.3004
Epoch [15/30], Loss: 1.2647
Epoch [16/30], Loss: 1.2762
Epoch [17/30], Loss: 1.2797
Epoch [18/30], Loss: 1.2713
Epoch [19/30], Loss: 1.2239
Epoch [20/30], Loss: 1.2198
Epoch [21/30], Loss: 1.2447
Epoch [22/30], Loss: 1.2829
Epoch [23/30], Loss: 1.2393
Epoch [24/30], Loss: 1.2496
Epoch [25/30], Loss: 1.2241
Epoch [26/30], Loss: 1.2553
Epoch [27/30], Loss: 1.2178
Epoch [28/30], Loss: 1.2216
Epoch [29/30], Loss: 1.2251
Epoch [30/30], Loss: 1.1908
Test Accuracy: 54.55%
