In [1]:
import numpy as np
import os
import sys
import datasets
import argparse
from typing import Tuple
import transformers
import torch
from torch.utils.data import Dataset
from sklearn import metrics
import matplotlib as plt
import random
from tqdm import tqdm
import pandas as pd
from torch.optim import lr_scheduler
from typing import Callable, Dict, List, Tuple, Union
import csv
from timeit import default_timer as timer
import warnings
warnings.filterwarnings('ignore')
#first data exploration script for datamining phase 








class stylometer_classifier(torch.nn.Module):
    """Human-vs-LLM code classifier: a pretrained encoder plus a small MLP head.

    The head is ``Linear(dimensionality, 768) -> ReLU -> Dropout(0.2) ->
    Linear(768, 1)`` and is applied to the encoder output at the first
    sequence position.

    Args:
        pretrained_encoder: Module called as
            ``encoder(input_ids=..., attention_mask=...)``; element ``[0]`` of
            its output is used as the hidden states
            (presumably ``(batch, seq_len, dimensionality)`` -- confirm for
            encoders other than the one built in ``inizialize``).
        dimensionality: Size of the encoder's hidden vectors, i.e. the input
            width of the first head layer.
    """

    # Decision threshold applied to the raw head output (no sigmoid is applied
    # anywhere in this module).
    HUMAN_THRESHOLD = 0.07
    HUMAN_LABEL = "It's a Human!"
    LLM_LABEL = "It's an LLM!"

    def __init__(self, pretrained_encoder, dimensionality):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys; keep them.
        self.modelBase = pretrained_encoder
        self.pre_classifier = torch.nn.Linear(dimensionality, 768, dtype=torch.bfloat16)
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1, dtype=torch.bfloat16)

    def forward(self, input_ids, padding_mask):
        """Classify each sequence in the batch as human- or LLM-written.

        Args:
            input_ids: Token ids with a leading batch axis.
            padding_mask: Attention mask matching ``input_ids``.

        Returns:
            A dict with two keys:

            * ``"my_class"`` -- the label string when the batch holds exactly
              one sequence, otherwise a list with one label per sequence.
            * ``"prob"`` -- the raw head output tensor, one value per
              sequence. Despite the key name this is not squashed to [0, 1].
        """
        encoder_out = self.modelBase(input_ids=input_ids, attention_mask=padding_mask)
        hidden_state = encoder_out[0]
        # Only the first-token (CLS-style) representation feeds the head.
        cls_output = hidden_state[:, 0]
        # The head is bfloat16 while the encoder may run in another dtype
        # (e.g. float32); cast so the matmul does not fail on mixed dtypes.
        cls_output = cls_output.to(self.pre_classifier.weight.dtype)
        pooled = self.pre_classifier(cls_output)
        pooled = self.dropout(self.activation(pooled))
        output = self.classifier(pooled)

        # Label row by row: a bare `if tensor:` only works for one element.
        human_flags = (output >= self.HUMAN_THRESHOLD).reshape(-1).tolist()
        labels = [self.HUMAN_LABEL if flag else self.LLM_LABEL for flag in human_flags]
        my_class = labels[0] if len(labels) == 1 else labels
        return {"my_class": my_class, "prob": output}


def adapt_model(model: object, dim: int = 1024) -> object:
    """
    Wrap ``model`` in a ``stylometer_classifier`` and return the wrapper.

    ``dim`` is forwarded as the classifier's ``dimensionality`` argument,
    i.e. the input width of the classification head.
    """
    return stylometer_classifier(model, dimensionality=dim)






def inizialize(cache_dir="./Methods/IsThisYou/cache",
               model_name="Salesforce/codet5p-770m",
               path_checkpoint="./Methods/IsThisYou/checkpoint.bin"):
    """Build the classifier and its tokenizer, ready for inference.

    Loads the tokenizer and the T5 encoder for ``model_name``, wraps the
    encoder with the classification head, restores the weights stored at
    ``path_checkpoint`` and switches the model to eval mode. Uses CUDA when
    available, otherwise the CPU. Prints the tool's welcome banner.

    Args:
        cache_dir: Directory handed to ``from_pretrained`` as the download
            cache for the tokenizer and encoder files.
        model_name: Hugging Face model identifier (or local path).
        path_checkpoint: File holding the ``state_dict`` of the full
            classifier (encoder + head).

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load the tokenizer; cache_dir was previously accepted but never used.
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_name, token=None, cache_dir=cache_dir
    )

    # Load the pretrained encoder that the classification head sits on.
    encoder = transformers.T5EncoderModel.from_pretrained(model_name, cache_dir=cache_dir)

    # Add the classification head, sized to the encoder's embedding width,
    # and move the whole model to the target device once.
    model = adapt_model(encoder, dim=encoder.shared.embedding_dim).to(DEVICE)

    # NOTE(security): torch.load unpickles the file -- only load checkpoints
    # that come from a trusted source.
    state_dict = torch.load(path_checkpoint, map_location=DEVICE)
    model.load_state_dict(state_dict)
    model = model.eval()

    print("Welcome to the Human-AI stylomety tool, insert the code you want to inspect here, \n you can end input with Ctrl+D (linux or mac) or Ctrl+Z and enter for windows, to exit the tool enter Ctl+C:  \n")

    return model, tokenizer


def run(code, model, tokenizer):
    """Classify one code snippet and print the predicted label.

    Args:
        code: Source code to inspect, as a single string.
        model: Classifier called as ``model(input_ids, attention_mask)`` and
            returning a dict with a ``"my_class"`` entry.
        tokenizer: Callable tokenizer that supports ``return_tensors="pt"``
            and yields ``"input_ids"`` and ``"attention_mask"`` entries.

    Returns:
        None. The label is printed to stdout.
    """
    # return_tensors="pt" yields batched 2-D tensors; wrapping the plain id
    # list in torch.tensor() gave a 1-D tensor with no batch axis.
    encoded = tokenizer(code, return_tensors="pt")

    # Feed the inputs from the same device the model's weights live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        out = model(input_ids, attention_mask)
    print("\n", out["my_class"], "\n")



In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Stand-alone load of the seq2seq CodeT5+ checkpoint and its tokenizer.
# Both names are rebound when the `inizialize()` cell further down runs.
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5p-770m")
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-770m")

: 

In [None]:
# Build the classifier (encoder + head with checkpoint weights restored) and
# its tokenizer, using the default model name and checkpoint path.
model, tokenizer = inizialize()

In [None]:
# Classify a tiny sample function and print the predicted label.
run(code="def somma(a, b): return a + b", model=model, tokenizer=tokenizer)

In [None]:
# Manual inference on one snippet, printing the classifier's full result dict.
code_snippet = "def somma(a, b): return a + b"
# The model may live on the GPU; the tokenizer output is created on the CPU,
# so move the encoded batch to the model's device before the forward pass.
device = next(model.parameters()).device
inputs = tokenizer([code_snippet], return_tensors="pt").to(device)
with torch.no_grad():
    out = model(inputs["input_ids"], inputs["attention_mask"])
print(out)  # {'my_class': "It's a Human!" / "It's an LLM!", 'prob': <raw head output tensor>}
