In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertModel, DistilBertTokenizer
import torch


In [14]:
df_complete= pd.read_csv('dataset.csv',index_col=0)

In [15]:
# Crear un diccionario de mapeo
mapping = {'Human-Generated-Text': 0, 'AI-Generated-Text': 1}

In [16]:
#Reemplazar las clases en la columna 'class' con el mapeo

df_complete['class'] = df_complete['class'].map(mapping)

In [30]:
df_complete

Unnamed: 0,text,class
0,Sekhukhune I (Matsebe; circa 1814 – 13 Septemb...,1
1,Mount Washington is a peak in the White Mount...,1
2,Acer hillsi is an extinct maple species that w...,1
3,Derrick George Sherwin (16 April 1936 – 17 Oct...,0
4,The Windows shell is the graphical user interf...,0
...,...,...
299995,OutServe Magazine was a bi-monthly digital and...,1
299996,Eastern Armenia ( Arevelyan Hayastan) is the e...,0
299997,Infinity Group is a private equity fund manage...,1
299998,Kattinakere (ಕಟ್ಟಿನಕೆರೆ) also called B Sagadde...,1


In [31]:
# Suponiendo que tu dataframe se llama df
# 1. Filtrar por cada clase
ai_data = df_complete[df_complete['class'] == 1]
human_data = df_complete[df_complete['class'] == 0]

In [32]:
ai_data

Unnamed: 0,text,class
0,Sekhukhune I (Matsebe; circa 1814 – 13 Septemb...,1
1,Mount Washington is a peak in the White Mount...,1
2,Acer hillsi is an extinct maple species that w...,1
5,"The Hetoimasia, Etimasia (Greek ἑτοιμασία, ""pr...",1
7,Sister Elizabeth Kenny (20 September 1880 – 31...,1
...,...,...
299994,"The Sentinel-class cutter, also known as Fast ...",1
299995,OutServe Magazine was a bi-monthly digital and...,1
299997,Infinity Group is a private equity fund manage...,1
299998,Kattinakere (ಕಟ್ಟಿನಕೆರೆ) also called B Sagadde...,1


In [33]:
# 2. Seleccionar 500 filas aleatorias de cada clase
ai_sample = ai_data.sample(n=500, random_state=42)  # 500 filas de 'ai'
human_sample = human_data.sample(n=500, random_state=42)  # 500 filas de 'human'

In [34]:
# 3. Concatenar las dos muestras para obtener un total de 1000 filas
balanced_sample = pd.concat([ai_sample, human_sample])


In [35]:

# 4. (Opcional) Barajar las filas para mezclar las clases
balanced_sample = balanced_sample.sample(frac=1, random_state=42).reset_index(drop=True)

In [36]:


# Ver el dataframe resultante
print(balanced_sample)


                                                  text  class
0    Duck Creek Aqueduct, also known as the Metamor...      0
1    Jonathan Shapiro is a writer, producer, attorn...      0
2    The Made V.I.P Tour is the first Chinese fan m...      0
3    Johann Geusendam (24 February 1886 – 6 April 1...      0
4    The discography of American ambient musician B...      1
..                                                 ...    ...
995  Mitică  is a fictional character who appears i...      1
996  The 2019 MSBL season was the 31st season of th...      1
997  Viral synapse (or virological synapse) is a  m...      0
998  Werk-en-rust, also Werken-Rust, (Work and Rest...      1
999  David Edward Gatten (Born February 11, 1971) i...      1

[1000 rows x 2 columns]


In [5]:
# Inicializar el tokenizer y el modelo
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


In [38]:
# Función para obtener embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy()

# Aplicar la función a la columna de texto
df_complete['Embeddings'] = df_complete['text'].apply(get_embeddings)

# Ver el DataFrame resultante


: 

In [None]:
# Convert the column embedding into columns with names emb_1, emb_2, ..., emb_768
df = pd.concat([df_complete.drop(['Embeddings'], axis=1), df_complete['Embeddings'].apply(lambda x: pd.Series(x.flatten()))], axis=1)


In [5]:
import torch
import tiktoken
from transformers import BartTokenizer, AlbertForSequenceClassification

In [6]:

tokenizer = BartTokenizer.from_pretrained("textattack/albert-base-v2-imdb")
model = AlbertForSequenceClassification.from_pretrained("textattack/albert-base-v2-imdb")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BartTokenizer'.


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [None]:



inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

# To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
num_labels = len(model.config.id2label)
model = AlbertForSequenceClassification.from_pretrained("textattack/albert-base-v2-imdb", num_labels=num_labels)

labels = torch.tensor([1])
loss = model(**inputs, labels=labels).loss
round(loss.item(), 2)