- We sample French Wikipedia articles as our human corpus, and generate an AI text corpus with Vicuna by prompting it to continue, in French, the starting words of each human text sample.

- Now we have a human corpus and an AI corpus in French.

- tested RADAR, Roberta on these samples 

- We translate the human and AI corpora with NMT to obtain a translated English human corpus and a translated English AI corpus

- tested RADAR, Roberta on these samples 

### Importing Libraries

In [1]:
import transformers
import torch
from datasets import load_dataset
import torch.nn.functional as F
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


### Setting up Generator

In [None]:
# Load the Vicuna generator (tokenizer + causal LM) and the French Wikipedia dump.
GENERATOR_ID = "lmsys/vicuna-7b-v1.1"
tokenizer = transformers.AutoTokenizer.from_pretrained(GENERATOR_ID)
model = transformers.AutoModelForCausalLM.from_pretrained(GENERATOR_ID)
# The '20220301.fr' configuration of the 'wikipedia' dataset; samples are read
# from its 'train' split below.
datawiki_fr = load_dataset(
    'wikipedia',
    '20220301.fr',
)

Please select the appropriate device here.

In [None]:
# Device the generator runs on; change to e.g. torch.device("cuda") if a GPU
# is available.
device = torch.device("cpu")
# Place the model on the selected device so that the choice above takes effect.
# Tensors passed to `model.generate` must live on the same device.
model.to(device)
# Inference only: switch off dropout and other training-time behaviour.
model.eval()

Taking 1000 samples from the French Wikipedia articles

In [None]:
# Human corpus: the 'text' field of the first 1000 records of the train split.
train_split = datawiki_fr['train']
human_fr = [train_split[idx]['text'] for idx in range(1000)]

In [None]:
# Prompt prefix placed in front of every human sample before generation.
instruction="Please complete the given text in the french language : "

# Tokenizer settings used when building the generation prompts:
# truncate over-long prompts at the end, pad on the left, and reuse the EOS
# token as the padding token.
tokenizer.truncation_side = 'right'
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

Generating a French AI text corpus

In [None]:
# Generate one French continuation per human sample with the generator model.
# `ai_text_fr` ends up as a list with one entry per human sample; each entry is
# the list of decoded generations for that sample (a single string here, since
# one prompt is passed per call).
ai_text_fr = []
count = 0
for count, human_text in enumerate(human_fr, start=1):
    # The prompt is "<instruction> <article>" cut/padded to exactly 30 tokens,
    # so the model only sees the instruction plus the first words of the article.
    prompt = tokenizer(
        [f"{instruction} {human_text}"],
        max_length=30,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    # Send the inputs to whatever device the model is on rather than
    # hard-coding "cpu", so the cell keeps working after the model is moved.
    prompt = {k: v.to(model.device) for k, v in prompt.items()}
    outputs = model.generate(
        **prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # The decoded sequence contains the prompt; drop the instruction prefix.
    # Using `instruction` keeps this in sync with the prompt definition.
    AI_texts = [generated.replace(instruction, "") for generated in decoded]
    ai_text_fr.append(AI_texts)
    # Progress indicator: number of samples processed so far.
    print(count)

Saving the AI text corpus for future use

In [None]:
# Append every generated entry to a text file, one "start : ..." line per entry.
file_fr = "ai_text_fr2.txt"
# Explicit UTF-8: the corpus is French text with accented characters, and the
# platform default encoding is not guaranteed to be able to represent it.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# entries again.
with open(file_fr, 'a+', encoding='utf-8') as f:
    f.writelines('start : ' + str(text) + '\n' for text in ai_text_fr)

### Setting up detector

RADAR

In [None]:
# Load the RADAR detector (sequence classifier) and its tokenizer.
# Use the GPU when one is present and fall back to CPU otherwise, so the cell
# does not crash on machines without CUDA. A specific GPU can still be chosen
# by replacing "cuda" with e.g. "cuda:0".
device = "cuda" if torch.cuda.is_available() else "cpu"
detector_path_or_id = "TrustSafeAI/RADAR-Vicuna-7B"
detector = transformers.AutoModelForSequenceClassification.from_pretrained(detector_path_or_id)
detector_tokenizer = transformers.AutoTokenizer.from_pretrained(detector_path_or_id)
# Inference only.
detector.eval()
detector.to(device)

OpenAI's RoBERTa detector

In [None]:
# Second detector: a text-classification pipeline around OpenAI's RoBERTa-large
# detector checkpoint.
ROBERTA_DETECTOR_ID = "openai-community/roberta-large-openai-detector"
pipe = pipeline(
    "text-classification",
    model=ROBERTA_DETECTOR_ID,
)

Detecting over original French texts.

In [None]:
# Score every human-written French article with the RADAR detector.
Text_input = human_fr
error = 0
output_probs_list_fr = []
with torch.no_grad():
    for sample in Text_input:
        # Tokenize with the detector's own tokenizer: the generator's
        # `tokenizer` has a different vocabulary, so its ids would be
        # meaningless to the detector.
        inputs = detector_tokenizer(sample, padding=True, truncation=True, max_length=512, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Softmax probability of class index 0 for each sequence in the batch.
        # NOTE(review): index 0 is presumably the "AI-generated" class per the
        # RADAR model card; confirm.
        output_probs = F.log_softmax(detector(**inputs).logits, -1)[:, 0].exp().tolist()
        # One list per sample, matching the other detection cells and the
        # `text[0]` access used when the scores are written to disk.
        output_probs_list_fr.append(output_probs)

print(len(output_probs_list_fr))

In [None]:
# Classify every human-written French article with the RoBERTa detector.
# Iterate `human_fr` directly instead of binding it to `input`, which would
# shadow the builtin for the rest of the session.
# Each article is cut to its first 512 characters before classification.
prediction = [pipe(text[:512]) for text in human_fr]

In [None]:
# Persist the detector outputs for the human French corpus, one value per line.
file_h_pred = "human_fr_human_pred.csv"
with open(file_h_pred, 'a+') as f:
    for entry in output_probs_list_fr:
        # Entries may be plain floats (flat list) or per-sample lists of
        # probabilities; indexing a float would raise TypeError, so accept both.
        score = entry[0] if isinstance(entry, (list, tuple)) else entry
        f.write(str(score) + '\n')
file_h_roberta_pred = "human_fr_human_roberta_pred.csv"
with open(file_h_roberta_pred, 'a+') as f:
    for result in prediction:
        # `pipe(...)` returned a list per call; keep its first element.
        f.write(str(result[0]) + '\n')

Detecting over AI Generated French articles

In [None]:
# Score every AI-generated French entry with the RADAR detector.
Text_input = ai_text_fr
output_probs_list_fr_ai = []
with torch.no_grad():
    for sample in Text_input:
        encoded = detector_tokenizer(
            sample,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        logits = detector(**encoded).logits
        # Probability of class index 0 for each sequence in the batch.
        # NOTE(review): presumably the "AI-generated" class; confirm against
        # the RADAR model card.
        class0_probs = F.log_softmax(logits, -1)[:, 0].exp().tolist()
        output_probs_list_fr_ai.append(class0_probs)

print(len(output_probs_list_fr_ai))

In [None]:
# Classify every AI-generated French entry with the RoBERTa detector.
# Entries of `ai_text_fr` are lists of generated strings, so slicing the entry
# itself would shorten the list, not the text. Truncate each string to its
# first 512 characters instead. A bare string entry is accepted as well.
# `ai_text_fr` is iterated directly to avoid shadowing the builtin `input`.
prediction = []
for sample in ai_text_fr:
    texts = [sample] if isinstance(sample, str) else list(sample)
    prediction.append(pipe([t[:512] for t in texts]))

In [None]:
# Persist the detector outputs for the AI French corpus, one value per line.
# Both files are opened in append mode.
file_ai_pred = "human_fr_ai_pred.csv"
file_ai_roberta_pred = "human_fr_ai_roberta_pred.csv"

# RADAR: first probability of each per-sample list.
with open(file_ai_pred, 'a+') as radar_out:
    for probs in output_probs_list_fr_ai:
        first_prob = probs[0]
        radar_out.write(str(first_prob) + '\n')

# RoBERTa: first result of each pipeline call.
with open(file_ai_roberta_pred, 'a+') as roberta_out:
    for results in prediction:
        first_result = results[0]
        roberta_out.write(str(first_result) + '\n')

### Performing Translations to detect over English articles

I currently use Opus-MT, which is used as the base model in EasyNMT. Paper link: https://aclanthology.org/2020.eamt-1.61/

In [None]:
# French -> English translation model (Opus-MT) and its tokenizer.
TRANSLATOR_ID = "Helsinki-NLP/opus-mt-fr-en"
translator_tokenizer = transformers.AutoTokenizer.from_pretrained(TRANSLATOR_ID)
translator = transformers.AutoModelForSeq2SeqLM.from_pretrained(TRANSLATOR_ID)

In [None]:
# Translate the human French corpus to English.
# Each entry of `human_fr_tr` is the list of 3 beam-search translations
# returned for one article.
human_fr_tr = []
for text in human_fr:
    # Only the first 512 characters of each article are translated.
    inp = text[:512]
    # Use the translation model and its tokenizer; the generator's
    # `tokenizer`/`model` would continue the text instead of translating it.
    # `truncation=True` guards against inputs longer than the translator accepts.
    input_ids = translator_tokenizer(inp, return_tensors="pt", truncation=True).input_ids
    outputs = translator.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3)
    human_fr_tr.append(translator_tokenizer.batch_decode(outputs, skip_special_tokens=True))


In [None]:
# Translate the AI-generated French corpus to English.
# Each entry of `ai_fr_tr` is the list of beam-search translations
# (3 per source string) returned for one entry of `ai_text_fr`.
ai_fr_tr = []
for text in ai_text_fr:
    # Entries of `ai_text_fr` are lists of generated strings; slicing the entry
    # would cut the list, not the text. Truncate each string to 512 characters.
    sources = [text] if isinstance(text, str) else list(text)
    sources = [source[:512] for source in sources]
    # Use the translation model and its tokenizer, not the generator's.
    encoded = translator_tokenizer(sources, return_tensors="pt", padding=True, truncation=True)
    outputs = translator.generate(**encoded, num_beams=5, num_return_sequences=3)
    ai_fr_tr.append(translator_tokenizer.batch_decode(outputs, skip_special_tokens=True))
    

### Detection over Non-english Texts after translations.

In [None]:
# Score the English translations of the human corpus with the RADAR detector.
# Each item of `human_fr_tr` is passed to the tokenizer as one batch.
Text_input = human_fr_tr
output_probs_list_fr_tr_human = []
with torch.no_grad():
    for translations in Text_input:
        batch = detector_tokenizer(
            translations,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        batch = {key: value.to(device) for key, value in batch.items()}
        log_probs = F.log_softmax(detector(**batch).logits, -1)
        # Keep the class-index-0 probability of every sequence in the batch.
        scores = log_probs[:, 0].exp().tolist()
        output_probs_list_fr_tr_human.append(scores)
print(len(output_probs_list_fr_tr_human))

In [None]:
# Classify the English translations of the human corpus with the RoBERTa detector.
# Entries of `human_fr_tr` are lists of translations, so each string (not the
# list) is truncated to 512 characters. A bare string entry is accepted too.
# `human_fr_tr` is iterated directly to avoid shadowing the builtin `input`.
prediction = []
for sample in human_fr_tr:
    texts = [sample] if isinstance(sample, str) else list(sample)
    prediction.append(pipe([t[:512] for t in texts]))

In [None]:
# Persist the detector outputs for the translated human corpus, one value per line.
file_h_tr_pred = "human_fr_human_tr_pred.csv"
# Write to the translated-results file. Opening `file_h_pred` here would append
# these scores to the file holding the original-French results.
with open(file_h_tr_pred, 'a+') as f:
    for text in output_probs_list_fr_tr_human:
        # First probability of each per-sample list.
        f.write(str(text[0]) + '\n')
file_h_roberta_tr_pred = "human_fr_human_roberta_tr_pred.csv"
with open(file_h_roberta_tr_pred, 'a+') as f:
    for text in prediction:
        # First result of each pipeline call.
        f.write(str(text[0]) + '\n')

In [None]:
# Score the English translations of the AI corpus with the RADAR detector.
# Each item of `ai_fr_tr` is passed to the tokenizer as one batch.
Text_input = ai_fr_tr
output_probs_list_fr_tr_ai = []
with torch.no_grad():
    for translations in Text_input:
        batch = detector_tokenizer(
            translations,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        batch = {key: value.to(device) for key, value in batch.items()}
        log_probs = F.log_softmax(detector(**batch).logits, -1)
        # Keep the class-index-0 probability of every sequence in the batch.
        scores = log_probs[:, 0].exp().tolist()
        output_probs_list_fr_tr_ai.append(scores)
print(len(output_probs_list_fr_tr_ai))

In [None]:
# Classify the English translations of the AI corpus with the RoBERTa detector.
# Entries of `ai_fr_tr` are lists of translations, so each string (not the
# list) is truncated to 512 characters. A bare string entry is accepted too.
# `ai_fr_tr` is iterated directly to avoid shadowing the builtin `input`.
prediction = []
for sample in ai_fr_tr:
    texts = [sample] if isinstance(sample, str) else list(sample)
    prediction.append(pipe([t[:512] for t in texts]))

In [None]:
# Persist the detector outputs for the translated AI corpus, one value per line.
file_ai_tr_pred = "human_fr_ai_tr_pred.csv"
# Write to the translated-results file. Opening `file_ai_pred` here would append
# these scores to the file holding the original-French AI results.
with open(file_ai_tr_pred, 'a+') as f:
    for text in output_probs_list_fr_tr_ai:
        # First probability of each per-sample list.
        f.write(str(text[0]) + '\n')

file_ai_roberta_tr_pred = "human_fr_ai_roberta_tr_pred.csv"
with open(file_ai_roberta_tr_pred, 'a+') as f:
    for text in prediction:
        # First result of each pipeline call.
        f.write(str(text[0]) + '\n')