In [None]:
!rm -r ~/.cache/huggingface
!unzip -o /kaggle/input/save-models/modules.zip  -d /
!tree ~/.cache/huggingface -lh

In [None]:
# Wipe the interactive namespace (no confirmation prompt) so everything below
# is defined from scratch by this notebook.
%reset -f
import os
# Root directory under which the input datasets are read.
path_input = '/kaggle/input/'
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# NOTE: in the visible notebook logsumexp is only referenced by a
# commented-out entry of list_op.
from scipy.special import logsumexp
# Install einops from a wheel file shipped with the input dataset.
# NOTE(review): presumably needed by the phi-2 remote model code -- confirm.
!pip install /kaggle/input/save-models/einops-0.7.0-py3-none-any.whl

In [None]:
# Number of training essays kept for this run.
# NOTE(review): 12 rows looks like a debugging limit -- raise it for a real fit.
N_TRAIN_ROWS = 12

tab_train = pd.read_csv(f'{path_input}/llm-detect-ai-generated-text/train_essays.csv')
tab_train['set'] = 'train'
# .copy() turns the subset into an independent frame: feature columns are added
# to it later, and writing into a bare .iloc slice triggers pandas'
# SettingWithCopyWarning.
tab_train = tab_train.iloc[:N_TRAIN_ROWS].copy()
# Optional extra training data (disabled):
# tab_daigt = pd.read_csv(f'{path_input}/daigt-v2-train-dataset/train_v2_drcat_02.csv')
# tab_daigt = tab_daigt[:5000]
# tab_daigt.rename(columns = {"label":"generated"}, inplace=True)
# tab_daigt['set'] = 'daigt'
# tab_train = pd.concat([tab_train[['text', 'generated', 'set']],
#                       tab_daigt[['text', 'generated', 'set']]]).reset_index(drop=True)
# Class balance of the rows that were kept.
tab_train['generated'].value_counts()

In [None]:
# Display the training table that will be passed to feature extraction.
tab_train

In [None]:
import torch
import tqdm
# Use the first CUDA device in half precision when one is available;
# otherwise fall back to the CPU in single precision.
if torch.cuda.is_available():
    device, dtype = "cuda:0", torch.float16
else:
    device, dtype = "cpu", torch.float32
print(device, dtype, flush=True)

In [None]:
from transformers import CodeGenTokenizer, GPT2LMHeadModel, OPTForCausalLM, BertLMHeadModel, AutoModelForCausalLM
print(device, dtype, flush=True)

# Registry of language models to load. Each entry maps a short name to
#   (checkpoint directory, model class, max_length used when tokenizing,
#    extra keyword arguments forwarded to from_pretrained).
dict_llm = {
   #'gpt2' : ('/kaggle/input/save-models/models/gpt2-xl', GPT2LMHeadModel, 1024, dict()),
   #'opt'  : ("/kaggle/input/save-models/models/facebook/opt-2.7b", OPTForCausalLM, 2048, dict()),
   # 'bert' : ("/kaggle/input/save-models/models/bert-base-uncased", BertLMHeadModel, 2048, {'is_decoder':True}),
   'phi2' : ("/kaggle/input/save-models/models/microsoft/phi-2", AutoModelForCausalLM, 2048,  {'flash_attn':True, 'flash_rotary':True, 'fused_dense':True, 'trust_remote_code':True}),
}

# Tokenizers and models, keyed by the same short names as dict_llm.
llm_tokenizer = dict()
llm_model = dict()
for llm_name, (llm_path, llm_class, _llm_max_length, llm_kwargs) in dict_llm.items():
    tokenizer = CodeGenTokenizer.from_pretrained(llm_path, add_bos_token = True)
    # Batched calls pad the inputs, so fall back to EOS when the checkpoint
    # defines no dedicated padding token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    llm_tokenizer[llm_name] = tokenizer
    llm_model[llm_name] = llm_class.from_pretrained(
        llm_path, torch_dtype=dtype, device_map=device, **llm_kwargs
    )
    print(llm_name, flush=True)

print('done', flush=True)

In [None]:
def compute_entropy(input_ids, logits, attention_mask, token_type_ids=None):
    """Per-position entropy and token surprisal from language-model logits.

    Args:
        input_ids: integer tensor (batch, seq) of token ids.
        logits: tensor (batch, seq, vocab) of unnormalised scores.
        attention_mask: tensor (batch, seq); 0 marks padding.
        token_type_ids: accepted so a tokenizer output can be splatted into
            the call; not used.

    Returns:
        (entD, entL), both float32 tensors of shape (batch, seq - 1):
        entD[:, i] is the entropy of the predicted distribution at position
        i + 1; entL[:, i] is minus the log-probability that the logits at
        position i assign to the token at position i + 1. Entries where
        attention_mask[:, i + 1] == 0 are NaN.
    """
    with torch.no_grad():
        log_probs = torch.log_softmax(logits.float(), dim=-1)

        next_tokens = input_ids[:, 1:]
        is_padding = attention_mask[:, 1:] == 0

        # sum_v p(v) * log p(v), i.e. the negated entropy of each distribution.
        neg_entropy = (log_probs.exp() * log_probs).sum(dim=-1)[:, 1:]
        # log p(actual next token) read from the preceding position's logits.
        next_token_logp = (
            log_probs[:, :-1, :].gather(-1, next_tokens.unsqueeze(-1)).squeeze(-1)
        )

        nan = float("nan")
        entD = (-neg_entropy).masked_fill(is_padding, nan)
        entL = (-next_token_logp).masked_fill(is_padding, nan)

    return entD, entL


def generate_logprob(llm_model, llm_tokenizer, prompt, max_length=None, add_special_tokens = True, padding=False):
    """Tokenize `prompt`, run the model once and score every position.

    Args:
        llm_model: model whose call returns an object with a `.logits` field.
        llm_tokenizer: callable tokenizer returning a mapping of tensors.
        prompt: text (or list of texts) to score.
        max_length: token limit; truncation is enabled only when it is set.
        add_special_tokens: forwarded to the tokenizer.
        padding: forwarded to the tokenizer (needed for batches of texts).

    Returns:
        Whatever `compute_entropy` returns for the model logits and the
        tokenizer outputs, i.e. the pair (entD, entL).
    """
    with torch.no_grad():
        # Run on whichever device the model weights live on.
        device = next(llm_model.parameters()).device
        tokens = llm_tokenizer(
            prompt, return_tensors="pt",
            max_length=max_length,
            truncation=max_length is not None,
            # NOTE(review): legacy keyword kept as-is -- verify it is still
            # accepted by the installed tokenizer version.
            truncation_strategy = 'longest_first',
            # Was passed as `abs=...`, so the caller's choice never reached
            # the tokenizer.
            add_special_tokens=add_special_tokens,
            padding=padding
        )
        tokens = {key: tensor.to(device) for key, tensor in tokens.items()}
        logits = llm_model(**tokens).logits
        return compute_entropy(logits=logits, **tokens)


class Batch:
    """Re-iterable view of a sliceable sequence in consecutive chunks.

    Iterating yields `iterable[start:stop]` slices of at most `size` items;
    `len()` is the number of chunks, computed once at construction.
    """

    def __init__(self, iterable, size=1):
        self.iterable = iterable
        self.size = size
        # One chunk per start offset 0, size, 2*size, ... below len(iterable).
        self.len = len(range(0, len(self.iterable), self.size))

    def __len__(self):
        return self.len

    def __iter__(self):
        total, step = len(self.iterable), self.size
        yield from (
            self.iterable[start:min(start + step, total)]
            for start in range(0, total, step)
        )

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score

# Feature columns fed to the classifier. Names follow the scheme built in
# feature_extraction: '<D|L><op>_<model>' plus 'meanchr_<model>'.
feats_list = [
    'Dmed_phi2',
    'Lmed_phi2',
    'Dp05_phi2',
    'Lstd_phi2',
    'meanchr_phi2',
]
# Previously tried setting:
# classifier = OneClassSVM(verbose=1,  kernel='rbf', gamma='auto',nu=0.05);
classifier = OneClassSVM(kernel='rbf', gamma=0.1, nu=0.01, verbose=1)

def _nan_percentile(q):
    """Return an op(a, axis) computing the q-th percentile while ignoring NaNs."""
    def op(a, axis):
        return np.nanpercentile(a, q, axis=axis)
    return op


def _count_finite(a, axis):
    """Number of finite (non-NaN, non-inf) entries along `axis`."""
    return np.sum(np.isfinite(a), axis)


# NaN-aware reductions, keyed by the short tag used inside feature names.
# Every entry is called as op(array, axis=...).
list_op = {
    'len': _count_finite,
    'med': np.nanmedian,
    'max': np.nanmax,
    'mean': np.nanmean,
    'std': np.nanstd,
    **{f'p{q:02d}': _nan_percentile(q) for q in (5, 80, 90, 95, 98)},
    # 'lse': logsumexp,  (disabled)
}

In [None]:
# Number of essays tokenized and scored together in one model call
# (read by feature_extraction).
batch_size = 3
print(batch_size)
# Disabled warm-up that pushes full-length dummy inputs through each model.
# NOTE(review): stale -- generate_logprob returns a pair of tensors, so the
# trailing .cpu() call below would fail if this were re-enabled.
#for _ in dict_llm:
#    print(_, flush=True)
#    texts = [' '.join(['hello',]*dict_llm[_][2]) for i in range(batch_size)]
#    vet = generate_logprob(llm_model[_], llm_tokenizer[_], texts, max_length=dict_llm[_][2], padding=True).cpu().numpy()
#print('done', flush=True)

In [None]:
def feature_extraction(tab):
    """Add language-model statistics for every essay to `tab`.

    Uses the module-level `dict_llm`, `llm_model`, `llm_tokenizer`,
    `batch_size`, `list_op` and `feats_list`.

    Args:
        tab: DataFrame with a 'text' column. It is modified in place.

    Returns:
        The same DataFrame, with these columns added:
        - 'len_chr': character length of the text;
        - 'meanchr_<model>': 'len_chr' divided by the number of scored
          (finite) token positions;
        - 'D<op>_<model>' / 'L<op>_<model>': `list_op[op]` applied to the
          per-position entropy / token surprisal, only for names listed in
          `feats_list`.
    """
    # Character length of each essay, computed in one vectorised step.
    tab['len_chr'] = tab['text'].str.len()

    for name in dict_llm:
        print(name, flush=True)
        max_length = dict_llm[name][2]
        for index_list in tqdm.tqdm(Batch(tab.index, batch_size)):
            texts = tab.loc[index_list, 'text'].tolist()
            vetD, vetL = generate_logprob(llm_model[name], llm_tokenizer[name], texts, max_length=max_length, padding=True)
            vetD = vetD.cpu().numpy()
            vetL = vetL.cpu().numpy()

            # NOTE(review): 'len_chr' is the full text length while the token
            # count is taken after truncation to max_length -- confirm this
            # is the intended ratio for long essays.
            n_scored = np.sum(np.isfinite(vetL), -1)
            tab.loc[index_list, 'meanchr_' + name] = tab.loc[index_list, 'len_chr'].values / n_scored

            for op, reduce_fn in list_op.items():
                for prefix, values in (('D', vetD), ('L', vetL)):
                    key = prefix + op + '_' + name
                    if key in feats_list:
                        # One value per essay of the batch, written in a
                        # single assignment.
                        tab.loc[index_list, key] = reduce_fn(values, axis=-1)
    return tab

In [None]:
# Compute the language-model features of the training essays.
tab_train = feature_extraction(tab_train)
print(tab_train.columns)
print('done training feature extraction', flush=True)

# The one-class model is fitted only on rows labelled generated == 0.
mask_not_generated = tab_train['generated'] == 0
train_feats = tab_train.loc[mask_not_generated, feats_list].to_numpy()

# Per-feature standardisation constants; the floor on the standard deviation
# avoids dividing by (almost) zero for a constant feature.
z_mean = train_feats.mean(axis=0, keepdims=True)
z_std = np.maximum(train_feats.std(axis=0, keepdims=True), 1e-4)

classifier.fit((train_feats - z_mean) / z_std)

print('done training', flush=True)

print(z_mean, z_std)

In [None]:
# Load the essays to score and attach the same features as for training.
test_csv = f'{path_input}/llm-detect-ai-generated-text/test_essays.csv'
tab_test = feature_extraction(pd.read_csv(test_csv))
print('done test feature extraction', flush=True)

In [None]:
# Standardise the test features with the training statistics, then negate the
# decision function so that a larger 'generated' value means the essay lies
# further outside the region learned from the training rows.
test_feats = tab_test[feats_list].to_numpy()
test_scaled = (test_feats - z_mean) / z_std
tab_test['generated'] = -classifier.decision_function(test_scaled)

In [None]:
# Keep only the identifier and the score column.
submission = tab_test[['id','generated']]

# Save the DataFrame to a CSV file
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
# Read back exactly the file that was just written; the previous hard-coded
# absolute path only matched when the working directory was /kaggle/working.
pd.read_csv(submission_path)