In [None]:

import torch
import os
# Point the Hugging Face hub client at a mirror. This assignment sits before
# the `transformers` import below; presumably the endpoint is read when the
# hub client is first imported, so the order matters (TODO confirm).
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Module-level tokenizer and model; `preprocess_text` below reads `tokenizer`
# as a global rather than taking it as a parameter.
tokenizer = AutoTokenizer.from_pretrained("Hello-SimpleAI/chatgpt-detector-roberta")
model = AutoModelForSequenceClassification.from_pretrained("Hello-SimpleAI/chatgpt-detector-roberta")

# Input preprocessing function
def preprocess_text(text):
    """Tokenize ``text`` into PyTorch tensors using the module-level tokenizer.

    The encoding is truncated to at most 512 tokens and padded up to exactly
    512 tokens.

    Args:
        text: The text to encode; passed to the tokenizer unchanged.

    Returns:
        The tokenizer's output, requested with ``return_tensors="pt"``.
    """
    tokenizer_options = {
        "return_tensors": "pt",
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    return tokenizer(text, **tokenizer_options)


# Fine-tuning setup.
# NOTE: the original first line fused a stray "prediction function" comment
# with `import torch`, which left the import commented out; it is restored here.
import torch
import os
import json

# Select the hub mirror BEFORE importing `transformers`: the hub client reads
# HF_ENDPOINT when it is first imported, so assigning it afterwards has no
# effect if this section is run on its own.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score

# 1. Load the tokenizer and the two-class sequence-classification model
tokenizer = AutoTokenizer.from_pretrained("Hello-SimpleAI/chatgpt-detector-roberta")
model = AutoModelForSequenceClassification.from_pretrained("Hello-SimpleAI/chatgpt-detector-roberta", num_labels=2)

# 2. Build the dataset
class TextDetectDataset(Dataset):
    """Map-style dataset of labelled texts read from a JSON Lines file.

    Every non-blank line of the file must be a JSON object with ``text`` and
    ``label`` keys. A label equal to ``1`` is stored as ``1``; any other
    value is stored as ``0``. Tokenization happens lazily in ``__getitem__``.

    Args:
        json_path: Path to the JSON Lines file (one JSON object per line).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping of tensors with a leading batch dimension.
        max_length: Token length every example is truncated/padded to.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``text`` or ``label``.
    """

    def __init__(self, json_path, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        with open(json_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline at end of file) are not
                # valid JSON; skip them instead of crashing in json.loads.
                if not line.strip():
                    continue
                item = json.loads(line)
                label = 1 if item['label'] == 1 else 0
                self.samples.append({'text': item['text'], 'label': label})

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus a ``labels`` entry."""
        sample = self.samples[idx]
        encoding = self.tokenizer(
            sample['text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # The tokenizer returns tensors with a leading batch dimension of 1;
        # drop it so the default collation can stack examples into a batch.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(sample['label'], dtype=torch.long)
        return item

# 3. Load the dataset and split it 80/20 into training and held-out parts
dataset = TextDetectDataset('../datasets/train.json', tokenizer)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# Seed the split explicitly: it runs before any Trainer is constructed, so
# without a generator the held-out set (and therefore the reported F1 and the
# "best" checkpoint) would change from run to run.
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# 4. Training arguments
# NOTE(review): newer transformers releases appear to have renamed
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # Schedule and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=20,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best "f1" value when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# 5. F1 evaluation metric
def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: A two-element iterable ``(logits, labels)``. ``logits``
            must support ``argmax(-1)``; the result is compared against
            ``labels`` with ``f1_score``.

    Returns:
        A dict with a single ``"f1"`` entry.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score along the last axis.
    return {"f1": f1_score(references, scores.argmax(-1))}

# 6. Train and evaluate
# Wire the model, training arguments, both dataset splits and the metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune, then run one more evaluation pass over `test_dataset` and print
# the resulting metrics.
trainer.train()
eval_result = trainer.evaluate()
print("Test set evaluation:", eval_result)

# 7. Inference function
def detect_ai_generated(text, threshold=0.5):
    """Score a single text with the module-level model and threshold class 1.

    The text is tokenized (truncated/padded to 256 tokens), run through
    ``model`` without gradient tracking, and the softmax probability of
    class index 1 is compared against ``threshold``.

    NOTE(review): this version treats class 1 as "human" (per the original
    comment), while the later redefinition of this function in the same file
    treats class 1 as "AI-generated" — confirm against the training labels.

    Args:
        text: One input string. The ``.item()`` call requires exactly one
            row of logits, so a batch of texts is not supported.
        threshold: Probability above which ``is_human`` is reported True.

    Returns:
        A dict with ``"is_human"`` (bool), ``"probability"`` (class-1
        probability rounded to 4 decimals) and ``"threshold_used"``.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        padding="max_length"
    )
    # After training, the model may live on an accelerator; the inputs must
    # be on the same device or the forward pass raises a device mismatch.
    inputs = inputs.to(model.device)
    # Make sure dropout is disabled for inference regardless of prior state.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=1)
    human_prob = probabilities[:, 1].item()  # class 1 is treated as human text here
    return {
        "is_human": human_prob > threshold,
        "probability": round(human_prob, 4),
        "threshold_used": threshold
    }

# 8. Example run
test_cases = [
    # Human-written text
    "The quick brown fox jumps over the lazy dog",
    "Artificial intelligence systems are designed to perform tasks that typically require human intelligence, such as visual perception, speech recognition, decision-making, and language translation."
]

# Score each sample and print a three-line report followed by a blank line.
for text in test_cases:
    result = detect_ai_generated(text)
    print(
        f"Text: {text[:50]}...",
        f"Human Probability: {result['probability']}",
        f"Is Human: {result['is_human']}\n",
        sep="\n",
    )
def detect_ai_generated(text, threshold=0.5):
    """Score a single text and report whether class 1 exceeds ``threshold``.

    The text is encoded with ``preprocess_text``, run through the
    module-level ``model`` without gradient tracking, and the softmax
    probability of class index 1 is compared against ``threshold``.

    NOTE(review): class 1 is treated as "AI-generated" here (per the original
    comment) — confirm this matches the labels the model was trained on.

    Args:
        text: One input string. The ``.item()`` call requires exactly one
            row of logits, so a batch of texts is not supported.
        threshold: Probability above which ``is_ai_generated`` is True.

    Returns:
        A dict with ``"is_ai_generated"`` (bool), ``"probability"`` (class-1
        probability rounded to 4 decimals) and ``"threshold_used"``.
    """
    # Keep the inputs on the same device as the model; after training the
    # model may have been moved to an accelerator, and a CPU/accelerator
    # mix raises a device-mismatch error in the forward pass.
    inputs = preprocess_text(text).to(model.device)
    # Make sure dropout is disabled for inference regardless of prior state.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=1)
    ai_prob = probabilities[:, 1].item()  # class 1 is treated as AI-generated

    return {
        "is_ai_generated": ai_prob > threshold,
        "probability":     round(ai_prob, 4),
        "threshold_used":  threshold
    }


# Example run
test_cases = [
    # Human-written text
    "The quick brown fox jumps over the lazy dog",
    # AI-generated text
    "Artificial intelligence systems are designed to perform tasks that typically require human intelligence, such as visual perception, speech recognition, decision-making, and language translation."
]

# Score each sample and print a three-line report followed by a blank line.
for text in test_cases:
    result = detect_ai_generated(text)
    print(
        f"Text: {text[:50]}...",
        f"AI Probability: {result['probability']}",
        f"AI Generated: {result['is_ai_generated']}\n",
        sep="\n",
    )

  from .autonotebook import tqdm as notebook_tqdm
2025-06-15 10:52:29.043789: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-15 10:52:29.048244: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-15 10:52:29.057178: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749955949.072294    6227 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749955949.076688    6227 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory 