In [None]:
# Mount Google Drive
# The dataset paths configured further down all live under /content/drive,
# so this cell has to run before any of the CSVs are read.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [None]:
# Config: where the project dataset lives on the mounted Google Drive.
# Both values keep their trailing slash because file names are appended directly.
DATASET_FOLDER_PATH = "/content/drive/Shareddrives/CS152 Project/dataset/"
# The "all/" sub-folder of the dataset folder above.
DATASET_TREE_PATH = DATASET_FOLDER_PATH + "all/"

In [None]:
# Read the train / test tables from the dataset folder; the folder path already
# ends in "/", so the file name is simply appended.
train_csv_path = DATASET_FOLDER_PATH + "train_all_models_fixed.csv"
test_csv_path = DATASET_FOLDER_PATH + "test_all_models.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Preview the first rows of the training table.
train_df.head()

In [None]:
# Build the training / validation loaders for the MLP.
#
# Inputs are the three per-row model scores; the target is the `is_ai` label.
feature_columns = ["dima806_score", "wvolf_score", "Gemini Classification"]

# Fixed seed so that Restart & Run All reproduces the same split, the same
# weight initialisation (the model is created later in this cell and draws from
# the global RNG) and the same shuffling order.
SEED = 42
torch.manual_seed(SEED)

features = train_df[feature_columns].values
targets = train_df["is_ai"].values

# Convert to tensors; targets become a column vector so they line up with the
# (batch, 1) output of the network when the loss is computed.
X_tensor = torch.tensor(features, dtype=torch.float32)
y_tensor = torch.tensor(targets.reshape(-1, 1), dtype=torch.float32)

# Split into train and validation sets (90% / 10%), with a dedicated seeded
# generator so the partition does not depend on prior RNG consumption.
dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)

# Reshuffle the training samples every epoch; without this each epoch replays
# the mini-batches in exactly the same order. Validation order is irrelevant
# because the loss is averaged over the whole set.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# MLP - we tested multiple versions of this
# Three input scores -> one small hidden layer -> a single sigmoid output,
# which is what BCELoss expects (values in [0, 1]).
hidden_units = 4
layers = [
    nn.Linear(3, hidden_units),
    nn.ReLU(),
    nn.Linear(hidden_units, 1),
    nn.Sigmoid(),
]
model = nn.Sequential(*layers)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

# Training length and per-epoch loss history (filled in by the training loop).
epochs = 100
train_losses, val_losses = [], []

# Train loop
# Each epoch: one optimisation pass over the training loader, then one
# gradient-free pass over the validation loader. Batch losses are weighted by
# batch size so the epoch figure is a true per-sample mean even when the last
# batch is smaller.
for epoch in tqdm(range(epochs), desc="Epochs"):
    # --- optimisation pass ---
    model.train()
    train_total = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        train_total += batch_loss.item() * inputs.size(0)

    # --- validation pass (no gradients tracked) ---
    model.eval()
    val_total = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            val_total += criterion(model(inputs), labels).item() * inputs.size(0)

    # Per-sample averages for this epoch.
    train_epoch_loss = train_total / len(train_loader.dataset)
    val_epoch_loss = val_total / len(val_loader.dataset)
    train_losses.append(train_epoch_loss)
    val_losses.append(val_epoch_loss)

    print(f"Epoch {epoch+1}: Train Loss = {train_epoch_loss:.4f}, Val Loss = {val_epoch_loss:.4f}")

# Plot the per-epoch loss curves collected by the training loop.
plt.figure(figsize=(8, 5))
for loss_history, curve_label in (
    (train_losses, 'Train Loss'),
    (val_losses, 'Validation Loss'),
):
    plt.plot(loss_history, label=curve_label)
plt.title('Training vs Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# add predictions to test dataframe
# Same three input columns, in the same order, as the network was trained on.
feature_columns = ["dima806_score", "wvolf_score", "Gemini Classification"]
test_features = test_df[feature_columns].values
test_tensor = torch.tensor(test_features, dtype=torch.float32)
test_dataset = TensorDataset(test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64)

# Inference only: eval mode and no gradient tracking. A TensorDataset yields
# 1-tuples, hence the `(batch,)` unpacking.
model.eval()
with torch.no_grad():
    batch_scores = [model(batch).numpy().flatten() for (batch,) in test_loader]

# Flatten the per-batch arrays into one score per test row, in row order.
test_predictions = [score for chunk in batch_scores for score in chunk]

test_df["predictions"] = test_predictions

In [None]:
# Add majority vote column, between wvolf, dima806 and Gemini Classification
# Each of the three scores casts a vote of 1 when it is strictly above 0.5;
# the row is labelled 1 when at least two of the three vote 1.
# Done as one vectorized comparison instead of a Python-level iterrows() loop;
# missing scores compare False and therefore count as a 0 vote, exactly as a
# per-row `> 0.5` check would.
vote_columns = ["wvolf_score", "dima806_score", "Gemini Classification"]
votes_for_ai = (test_df[vote_columns] > 0.5).sum(axis=1)
majority_vote = (votes_for_ai >= 2).astype("int64")

test_df["majority_vote"] = majority_vote

In [None]:
# Count number of zeros and 1s in the majority vote column
# (anything that is not below 0.5 is tallied as a one).
vote_is_zero = test_df["majority_vote"] < 0.5
zeros = int(vote_is_zero.sum())
ones = len(vote_is_zero) - zeros
print(f"Zeros: {zeros}")
print(f"Ones: {ones}")

# Correlation coefficients
# Pearson correlation of each score column with the ground-truth `is_ai` label.
from scipy.stats import pearsonr
print("Correlation coefficients:")
for report_label, score_column in (
    ("wvolf model:", "wvolf_score"),
    ("dima806 model:", "dima806_score"),
    ("gemini model:", "Gemini Classification"),
    ("majority vote:", "majority_vote"),
):
    print(report_label, pearsonr(test_df[score_column], test_df["is_ai"]).statistic)
# print("MLP model:", pearsonr(test_df["predictions"], test_df["is_ai"]).statistic)

In [None]:
# THRESHOLD = 0.009  # BEST FOR DIMA806
# THRESHOLD = 0.08  # Best for wvolf


def eval_model(df, column_name, THRESHOLD):
  """Print and return binary-classification metrics for one score column.

  A row is predicted positive when ``df[column_name] > THRESHOLD`` and
  negative when ``df[column_name] <= THRESHOLD``; the ground truth is the
  ``is_ai`` column (1 = positive, 0 = negative).

  Rows whose label is neither 0 nor 1, or whose score satisfies neither
  comparison (e.g. NaN), fall into no confusion-matrix cell. They are left out
  of every metric and reported as skipped rather than dropped silently.

  Args:
    df: DataFrame containing an ``is_ai`` column and ``column_name``.
    column_name: Name of the score column to evaluate.
    THRESHOLD: Decision threshold; scores strictly above it count as positive.

  Returns:
    dict with keys ``true_positives``, ``false_positives``,
    ``true_negatives``, ``false_negatives``, ``skipped``, ``accuracy``,
    ``precision``, ``recall`` and ``f1_score``. Any ratio whose denominator is
    zero (including accuracy when no rows were counted) is reported as 0.
  """
  labels = df["is_ai"]
  scores = df[column_name]

  # Boolean masks; vectorized replacement for a per-row iterrows() loop.
  predicted_pos = scores > THRESHOLD
  predicted_neg = scores <= THRESHOLD
  actual_pos = labels == 1
  actual_neg = labels == 0

  # Confusion-matrix cells, converted to plain ints for clean printing.
  true_positives = int((actual_pos & predicted_pos).sum())
  false_negatives = int((actual_pos & predicted_neg).sum())
  false_positives = int((actual_neg & predicted_pos).sum())
  true_negatives = int((actual_neg & predicted_neg).sum())

  counted = true_positives + true_negatives + false_positives + false_negatives
  skipped = len(df) - counted

  print(f"Model: {column_name}")
  print(f"Dataset size {len(df)}")
  if skipped:
    # Surface rows that matched no cell instead of hiding them.
    print(f"Skipped rows (label not 0/1 or score missing): {skipped}")
  print()
  print(f"True Positives: {true_positives}")
  print(f"False Positives: {false_positives}")
  print(f"True Negatives: {true_negatives}")
  print(f"False Negatives: {false_negatives}")
  print()

  # Guard every ratio against an empty denominator (e.g. empty frame, or a
  # threshold so high that nothing is predicted positive).
  accuracy = (true_positives + true_negatives) / counted if counted else 0

  predicted_positive_total = true_positives + false_positives
  precision = true_positives / predicted_positive_total if predicted_positive_total else 0

  actual_positive_total = true_positives + false_negatives
  recall = true_positives / actual_positive_total if actual_positive_total else 0

  if precision + recall == 0:
    f1_score = 0
  else:
    f1_score = 2 * (precision * recall) / (precision + recall)

  print(f"Accuracy: {accuracy}")
  print(f"Precision: {precision}")
  print(f"Recall: {recall}")
  print(f"F1 Score: {f1_score}")
  print()
  print()

  return {
    "true_positives": true_positives,
    "false_positives": false_positives,
    "true_negatives": true_negatives,
    "false_negatives": false_negatives,
    "skipped": skipped,
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1_score": f1_score,
  }

In [None]:
# Evaluate the MLP output and the Gemini column, each at its own threshold.
# NOTE(review): the 0.3 / 0.2 thresholds look hand-tuned — confirm how they were chosen.
for score_column, threshold in (("predictions", 0.3), ("Gemini Classification", 0.2)):
    eval_model(test_df, score_column, threshold)
test_df