In [77]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder

In [78]:
# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
valid_df = pd.read_csv('validation.csv')

# Create label mapping
label_mapping = {
    'ChatGPT': 0,
    'Reddit': 1,
    'Wikipedia': 2,
    'LinkedIn': 3
}

# Convert label column using mapping
train_df.iloc[:, -1] = train_df.iloc[:, -1].map(label_mapping)
test_df.iloc[:, -1] = test_df.iloc[:, -1].map(label_mapping)
valid_df.iloc[:, -1] = valid_df.iloc[:, -1].map(label_mapping)

# Check for any NaN values in labels 
print(f"NaN values in train labels: {train_df.iloc[:, -1].isna().sum()}")
print(f"NaN values in test labels: {test_df.iloc[:, -1].isna().sum()}")
print(f"NaN values in validation labels: {valid_df.iloc[:, -1].isna().sum()}")


# Convert feature columns to numeric and handle NaN values
for col in train_df.columns[:-1]:  # exclude label column
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')
    valid_df[col] = pd.to_numeric(valid_df[col], errors='coerce')

# Fill NaN values with column means instead of dropping rows
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())
valid_df = valid_df.fillna(valid_df.mean())


# Verify data types before conversion to tensors
print(f"Data types in train_df:\n{train_df.dtypes}")

# Force convert to int64
train_df.iloc[:, -1] = train_df.iloc[:, -1].astype('int64')
test_df.iloc[:, -1] = test_df.iloc[:, -1].astype('int64')
valid_df.iloc[:, -1] = valid_df.iloc[:, -1].astype('int64')



class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.features = dataframe.iloc[:, :-1].values.astype(np.float32)
        self.labels = dataframe.iloc[:, -1].values.astype(np.int64)
        
        self.features = torch.tensor(self.features, dtype=torch.float32)
        self.labels = torch.tensor(self.labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx].unsqueeze(0), self.labels[idx]
        

# Create datasets and data loaders
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
valid_dataset = CustomDataset(valid_df)


train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)


NaN values in train labels: 0
NaN values in test labels: 0
NaN values in validation labels: 0
Data types in train_df:
Packet Count                    int64
Total Length                    int64
Avg Interval (s)              float64
Max Interval (s)              float64
Min Interval (s)              float64
Avg Length (bytes)            float64
Max Length (bytes)              int64
Min Length (bytes)              int64
Most Common Length (bytes)      int64
Label                           int64
dtype: object


In [95]:
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv1d(1, 32, 3, stride=1, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Conv1d(32, 64, 3, stride=1, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Conv1d(64, 128, 3, stride=1, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        self.fc = nn.Sequential(
            nn.Linear(128 * 9, 256),  # 128 channels, 9 features
            nn.ReLU(),
            nn.Linear(256, 4)   # since you have 4 classes (ChatGPT, reddit, wiki, linkedin)
        )

    def forward(self, x):
        x = self.conv(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CNNModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)  # your learning rate


num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# Testing
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Epoch [1/50], Loss: 1.4108
Epoch [2/50], Loss: 1.4052
Epoch [3/50], Loss: 1.3946
Epoch [4/50], Loss: 1.3563
Epoch [5/50], Loss: 1.3461
Epoch [6/50], Loss: 1.3320
Epoch [7/50], Loss: 1.3192
Epoch [8/50], Loss: 1.3161
Epoch [9/50], Loss: 1.3017
Epoch [10/50], Loss: 1.2755
Epoch [11/50], Loss: 1.2582
Epoch [12/50], Loss: 1.2816
Epoch [13/50], Loss: 1.2540
Epoch [14/50], Loss: 1.2727
Epoch [15/50], Loss: 1.2634
Epoch [16/50], Loss: 1.2479
Epoch [17/50], Loss: 1.2339
Epoch [18/50], Loss: 1.2142
Epoch [19/50], Loss: 1.2114
Epoch [20/50], Loss: 1.2181
Epoch [21/50], Loss: 1.2531
Epoch [22/50], Loss: 1.1852
Epoch [23/50], Loss: 1.2132
Epoch [24/50], Loss: 1.2262
Epoch [25/50], Loss: 1.2172
Epoch [26/50], Loss: 1.2166
Epoch [27/50], Loss: 1.2254
Epoch [28/50], Loss: 1.2071
Epoch [29/50], Loss: 1.2323
Epoch [30/50], Loss: 1.1943
Epoch [31/50], Loss: 1.2000
Epoch [32/50], Loss: 1.1854
Epoch [33/50], Loss: 1.1917
Epoch [34/50], Loss: 1.1877
Epoch [35/50], Loss: 1.2136
Epoch [36/50], Loss: 1.2058
E

In [96]:
# --- Validation Phase ---
model.eval()
val_correct = 0
val_total = 0
val_loss = 0.0

with torch.no_grad():
    for val_inputs, val_labels in valid_loader:
        val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
        val_outputs = model(val_inputs)
        loss = criterion(val_outputs, val_labels)
        val_loss += loss.item()
        _, val_predicted = torch.max(val_outputs.data, 1)
        val_total += val_labels.size(0)
        val_correct += (val_predicted == val_labels).sum().item()

val_accuracy = 100 * val_correct / val_total
avg_val_loss = val_loss / len(valid_loader)

print(f"Validation Accuracy: {val_accuracy:.2f}%, Validation Loss: {avg_val_loss:.4f}")


Validation Accuracy: 47.06%, Validation Loss: 1.2142


In [97]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Collect predictions and true labels
y_true = []
y_pred = []

model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())


In [98]:
label_mapping = {0: 'ChatGPT', 1: 'Reddit', 2: 'Wikipedia', 3: 'LinkedIn'}
website_names = [label_mapping[i] for i in sorted(label_mapping.keys())]

cnn_accuracy = accuracy_score(y_true, y_pred)
cnn_f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
report_cnn_dict = classification_report(y_true, y_pred, target_names=website_names, output_dict=True, zero_division=0)


In [99]:
import plotly.graph_objects as go

models = ['CNN Model']
accuracies = [cnn_accuracy]
f1_macros = [cnn_f1_macro]

fig_metrics = go.Figure(layout=base_layout)

fig_metrics.add_trace(go.Bar(
    x=models,
    y=accuracies,
    name='Accuracy',
    marker_color=colors['bar_lin'],
    text=[f'{acc:.3f}' for acc in accuracies],
    textposition='auto',
    width=0.3
))

fig_metrics.add_trace(go.Bar(
    x=models,
    y=f1_macros,
    name='Macro F1-Score',
    marker_color=colors['bar_log'],
    text=[f'{f1:.3f}' for f1 in f1_macros],
    textposition='auto',
    width=0.3
))

fig_metrics.update_layout(
    title_text='CNN Model Performance',
    xaxis_title='Model',
    yaxis_title='Score',
    yaxis_range=[0, 1.05],
    yaxis_tickformat=".2f",
    barmode='group',
    legend_title_text='Metric'
)

fig_metrics.show()



In [100]:
import numpy as np
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff

# === Step 1: Define label mapping
label_mapping = {
    'ChatGPT': 0,
    'Reddit': 1,
    'Wikipedia': 2,
    'LinkedIn': 3
}

# === Step 2: Reverse mapping for display
index_to_label = {v: k for k, v in label_mapping.items()}

# === Step 3: Numeric label order
numeric_labels = list(index_to_label.keys())  # [0, 1, 2, 3]

# === Step 4: Compute confusion matrix using CNN predictions
cm = confusion_matrix(y_true, y_pred, labels=numeric_labels)

# === Step 5: Label order for axis display
website_names = [index_to_label[i] for i in numeric_labels]

# === Step 6: Define heatmap function
def create_confusion_matrix_heatmap(cm, labels, title, color_scale='Blues'):
    text_values = [[str(val) for val in row] for row in cm]

    fig = ff.create_annotated_heatmap(
        z=cm,
        x=labels,
        y=labels,
        annotation_text=text_values,
        colorscale=color_scale,
        showscale=True
    )

    fig.update_layout(
        title_text=title,
        xaxis_title="Predicted Label",
        yaxis_title="True Label",
        xaxis_side="bottom",
        yaxis_autorange='reversed',
        xaxis=dict(tickangle=-45),
        margin=dict(l=100, r=50, t=100, b=100),
        font=dict(family='Arial', size=12, color='black'),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    fig.update_xaxes(type='category')
    fig.update_yaxes(type='category')
    return fig

# === Step 7: Plot the CNN confusion matrix
fig_cm_cnn = create_confusion_matrix_heatmap(
    cm, website_names, 'Confusion Matrix: CNN Model', color_scale='Blues'
)

fig_cm_cnn.show()
