In [28]:
import pandas as pd 

# Read the corpus and expose its 'generated' column under the name 'label',
# in a single read-then-rename chain.
df = pd.read_csv('AI_Human.csv').rename(columns={'generated': 'label'})

# Show the first rows as the cell output.
df.head()

Unnamed: 0,text,label
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [29]:
from transformers import BertTokenizer, BertModel
import torch

# Single source of truth for the pretrained checkpoint: the tokenizer and the
# encoder are both loaded from this name, so they cannot drift apart if the
# checkpoint is changed later.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = BertModel.from_pretrained(BERT_CHECKPOINT)



In [30]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Dataset that tokenizes one text per item, on demand.

    Args:
        texts: Indexable collection of texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...,
            max_length=..., truncation=..., padding=..., return_tensors='pt')``
            whose result is indexed with ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Length every item is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'label'`` (a float tensor built from the stored label).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        # Drop only the leading dimension (assumed to be the size-1 batch axis
        # the tokenizer adds for a single text). A bare .squeeze() would also
        # collapse the sequence axis whenever max_len == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),   # shape: [max_len]
            'attention_mask': encoding['attention_mask'].squeeze(0),  # shape: [max_len]
            'label': torch.tensor(label, dtype=torch.float)
        }

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation; the fixed seed keeps the split
# the same from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    random_state=42,
    test_size=0.15,
)

# Training split: batches of 16, drawn in shuffled order.
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)

# Validation split: batches of 16, no shuffling.
val_dataset = TextDataset(val_texts, val_labels, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=16)


In [32]:
import torch.nn as nn

class BertClassifier(nn.Module):
    """Binary classifier: an encoder followed by a linear layer and a sigmoid.

    Args:
        bert_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and its result must expose
            ``pooler_output``.
        hidden_size: Width of ``pooler_output``. When omitted, it is read from
            ``bert_model.config.hidden_size`` if the encoder exposes one, and
            falls back to 768 otherwise.
    """

    def __init__(self, bert_model, hidden_size=None):
        super().__init__()
        self.bert = bert_model
        if hidden_size is None:
            # Size the head from the encoder itself so a wider or narrower
            # checkpoint does not fail with a shape mismatch in forward().
            config = getattr(bert_model, 'config', None)
            hidden_size = getattr(config, 'hidden_size', 768)
        self.classifier = nn.Linear(hidden_size, 1)  # output 1 for binary classification
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid of the linear head applied to the pooled output.

        The result has one value per input row, shape [batch_size, 1].
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # [batch_size, hidden_size]
        logits = self.classifier(pooled_output)
        return self.sigmoid(logits)


In [33]:
import torch_directml

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# DirectML device handle.
# NOTE(review): the model below is placed on `device`, not on `dml`, while
# predict_text defaults to `dml` -- confirm which device is intended.
dml = torch_directml.device()

model = BertClassifier(bert).to(device)
criterion = nn.BCELoss()  # Binary Cross-Entropy for boolean labels
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


In [34]:
def predict_text(text, model, tokenizer, max_len=256, device=dml):
    """Classify a single text as "AI" or "Human" and print the model's score.

    Args:
        text: The text to classify.
        model: Callable module invoked as ``model(input_ids, attention_mask)``;
            its output must hold exactly one value (read with ``.item()``).
        tokenizer: Tokenizer called with the same arguments used for the
            dataset (padding / truncation to ``max_len``, ``return_tensors='pt'``).
        max_len: Length the text is padded / truncated to.
        device: Device the model and inputs are moved to. Defaults to the
            DirectML device; pass a CPU or CUDA device to override.

    Returns:
        "AI" if the model output is above 0.5, otherwise "Human".

    Side effects:
        Moves ``model`` to ``device`` (it stays there) and prints the score.
    """
    # Move model to device
    model.to(device)

    # Run inference in eval mode so layers such as dropout behave
    # deterministically, then restore each module's previous mode exactly
    # (sub-modules may have had different modes from the top-level module).
    previous_modes = [(module, module.training) for module in model.modules()]
    model.eval()
    try:
        # Tokenize the text
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Make prediction; read the score once instead of calling .item() twice.
        with torch.no_grad():
            probability = model(input_ids, attention_mask).item()
    finally:
        for module, was_training in previous_modes:
            module.training = was_training

    print(f"Model predicts that this text is {probability:.2%} AI generated")
    return "AI" if probability > 0.5 else "Human"

In [36]:
# Sample passage to classify.
text = """
Founded in 1999, Gazeta (literally “gazette”) was originally a fairly liberal news outlet headed by journalist Anton Nosik (1966-2017) and soon acquired by Mikhail Khodorkovsky’s Yukos. However, since the mid-2000s, it has been taken over and passed from one billionaire oligarch to another: Alisher Usmanov acquired it in 2006 and sold it in 2012 to Alexander Mamut, who sold it in 2020 to Sberbank, Russia’s largest bank, headed by president and chairman German Gref and former Minister of Finance Anton Siluanov. Mamut remains on the board of directors of Rambler Media Group.
"""

# No device is passed, so predict_text runs on its default device.
# NOTE(review): no training or checkpoint-loading cell is visible before this
# call -- confirm the model has trained weights before trusting the score.
result = predict_text(text=text, model=model, tokenizer=tokenizer)
print(result)  # prints "AI" or "Human"

Model predicts that this text is 60.49% AI generated
AI
