In [1]:
import pandas as pd 

df = pd.read_csv('AI_Human.csv')

df = df.rename(columns={'generated': 'label'})

df.head()

Unnamed: 0,text,label
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [2]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),   # shape: [max_len]
            'attention_mask': encoding['attention_mask'].squeeze(),  # shape: [max_len]
            'label': torch.tensor(label, dtype=torch.float)
        }

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42
)

train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)
subset = torch.utils.data.Subset(train_dataset, range(1000))
train_loader = DataLoader(subset, batch_size=32, shuffle=False)
val_subset = torch.utils.data.Subset(val_dataset, range(500))  # use first 500 samples
val_loader = DataLoader(val_subset, batch_size=128)


In [24]:
import torch.nn as nn

class BertClassifier(nn.Module):
    def __init__(self, bert_model, hidden_size=768):
        super(BertClassifier, self).__init__()
        self.bert = bert_model
        self.classifier = nn.Linear(hidden_size, 1)  # output 1 for binary classification
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # [batch_size, hidden_size]
        logits = self.classifier(pooled_output)
        return self.sigmoid(logits)


In [25]:
import torch_directml

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dml = torch_directml.device()

model = BertClassifier(bert).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()  # Binary Cross-Entropy for boolean labels


In [7]:
from tqdm import tqdm  # for progress bars

def train_model(model, dataloader, optimizer, criterion, device='cpu', epochs=3):
    model.train()  # set to training mode
    
    for epoch in range(epochs):
        total_loss = 0

        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            # 1️⃣ Move data to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device).float()  # shape [batch_size]

            # 2️⃣ Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            outputs = outputs.squeeze()  # remove [batch_size, 1] -> [batch_size]

            # 3️⃣ Compute loss
            loss = criterion(outputs, labels)

            # 4️⃣ Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1} | Average Loss: {avg_loss:.4f}")

In [8]:
train_model(model, train_loader, optimizer, criterion)

Epoch 1/3: 100%|██████████| 32/32 [07:14<00:00, 13.59s/it]


Epoch 1 | Average Loss: 0.4678


Epoch 2/3: 100%|██████████| 32/32 [07:10<00:00, 13.46s/it]


Epoch 2 | Average Loss: 0.1693


Epoch 3/3: 100%|██████████| 32/32 [07:11<00:00, 13.49s/it]

Epoch 3 | Average Loss: 0.0660





In [26]:
import torch
from sklearn.metrics import accuracy_score

# Make sure model is in evaluation mode
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # If your model returns logits
        preds = torch.argmax(outputs, dim=1)
        
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute accuracy
acc = accuracy_score(all_labels, all_preds)
print(f'Validation Accuracy: {acc:.4f}')


Validation Accuracy: 0.6120


In [13]:
def predict_text(text, model, tokenizer, max_len=256, device='cpu'): #Change device param if you want either CPU or CUDA (Nvidia GPU)
    # Move model to device
    model.to(device)

    # Tokenize the text
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Make prediction
    with torch.no_grad():
        output = model(input_ids, attention_mask)
        prediction = (output.item() > 0.5)
        print(f"Model predicts that this text is {output.item():.2%} AI generated")
    return "AI" if prediction else "Human"

In [21]:
text = """

The J.M. Smucker Co. is suing Trader Joe’s, alleging the grocery chain’s new frozen peanut butter and jelly sandwiches are too similar to Smucker’s Uncrustables in their design and packaging.

In the lawsuit, which was filed Monday in federal court in Ohio, Smucker said the round, crustless sandwiches Trader Joe’s sells have the same pie-like crimp markings on their edges that Uncrustables do. Smucker said the design violates its trademarks.

Smucker also asserted that the boxes Trader Joe’s PB&J sandwiches come in violate the Orrville, Ohio-based company’s trademarks because they are the same blue color it uses for the lettering on “Uncrustables” packages.

Trader Joe’s boxes also show a sandwich with a bite mark taken out of it, which is similar to the Uncrustables design, Smucker said.
"""

result = predict_text(text, model, tokenizer)  # or 'cpu'
print(result)  # Output: "AI" or "Human"

Model predicts that this text is 93.57% AI generated
AI


In [11]:
def validate_model(model, validation_loader, criterion, device):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():  # Disable gradient calculations
        for inputs, labels in validation_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # Example for classification accuracy
            _, predicted = torch.max(outputs.data, 1)
            total_samples += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

    avg_loss = total_loss / len(validation_loader)
    accuracy = (correct_predictions / total_samples) * 100

    print(f"Validation Loss: {avg_loss:.4f}, Validation Accuracy: {accuracy:.2f}%")
    model.train()  # Set model back to training mode if needed
    return avg_loss, accuracy

In [12]:
validate_model(model, validation_loader, criterion, 'cpu')

NameError: name 'validation_loader' is not defined