In [None]:
! python3 -m pip install pandas torch pytorch_transformers transformers scikit-learn nltk accelerate --upgrade

import re
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from torch import nn, optim
from transformers import RobertaTokenizer, RobertaModel, Trainer, TrainingArguments
from torch.nn.utils.rnn import pad_sequence
import numpy as np

In [None]:
# Every tensor and the model are placed on this device; the notebook is CPU-only.
device = torch.device("cpu")

# Preprocessing plan for the raw text (in the order the cells below apply it):
#   1. lowercase it all
#   2. remove the markup tags and special characters
#   3. stem the words
#   4. finally tokenize the text


# Load data
friendspersona_full = pd.read_csv('friends-personality.csv')

# Text cleaning function
def clean_text(X_text):
    """Return ``X_text`` without ``<...>`` tags and with whitespace normalised.

    Anything between a ``<`` and the next ``>`` on the same line is dropped,
    every run of whitespace becomes a single space, and leading/trailing
    whitespace is trimmed.
    """
    without_tags = re.compile(r'<.*?>').sub('', X_text)
    single_spaced = re.compile(r'\s+').sub(' ', without_tags)
    return single_spaced.strip()

# Lowercase the raw text first so the markup tags below match in a single form.
friendspersona_full["text_lower"] = friendspersona_full["text"].str.lower()

# Drop the bold and line-break markup from every utterance.
replaceBArrows = [
    sentence.replace('<b>', '').replace('</b>', '').replace('<br>', '')
    for sentence in friendspersona_full["text_lower"]
]
friendspersona_full["text_lower"] = replaceBArrows

# Remove special characters: each run of non-alphanumeric characters becomes one space.
no_special_char = []
for sentence in friendspersona_full["text_lower"]:
    no_special_char.append(re.sub('[^A-Za-z0-9]+', ' ', sentence))
friendspersona_full["no_special_char"] = no_special_char

from nltk.stem.porter import PorterStemmer

# One shared stemmer instance, reused for every call to stem_words().
ps = PorterStemmer()


def stem_words(text):
    """Stem each whitespace-separated word of ``text`` and re-join with single spaces."""
    stemmed_tokens = map(ps.stem, text.split())
    return " ".join(stemmed_tokens)

# Stem the special-character-free text, row by row.
friendspersona_full["text_stemmed"] = friendspersona_full["no_special_char"].apply(stem_words)

# Initialize RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize_text(text, max_length=512):
    """Encode ``text`` with the notebook-level RoBERTa ``tokenizer``.

    The text is wrapped in special tokens, truncated to ``max_length`` and padded
    up to ``max_length``; the tokenizer is asked for PyTorch tensors.

    Args:
        text: the string to encode.
        max_length: length used for both truncation and padding (default 512).

    Returns:
        Whatever the tokenizer call returns (indexed downstream by
        'input_ids' and 'attention_mask').
    """
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_tensors='pt',
    )
    return encoding

# Final whitespace/tag cleanup on the stemmed text, then tokenize every row.
friendspersona_full['clean_text'] = friendspersona_full['text_stemmed'].apply(clean_text)
friendspersona_full['tokenized_text'] = friendspersona_full['clean_text'].apply(tokenize_text)

# Text features: one tokenizer output per row.
X_text = friendspersona_full['tokenized_text'].tolist()
print(f"X_text sample: {X_text[:5]}")

# Character features: one-hot encoding of the 'character' column as a dense array.
onehot_encoder = OneHotEncoder()
X_character = (
    onehot_encoder
    .fit_transform(friendspersona_full[['character']])
    .toarray()
)

# Targets: the five trait columns, in this fixed order.
y = friendspersona_full.loc[:, ['cAGR', 'cCON', 'cEXT', 'cOPN', 'cNEU']].to_numpy()


In [None]:

# The three parallel collections must describe the same rows before splitting.
assert len(X_text) == len(X_character) and len(X_character) == len(y), "Mismatch in data lengths"

# Train-test split: 80% / 20%, with a fixed seed so the split is reproducible.
(
    X_text_train, X_text_test,
    X_character_train, X_character_test,
    y_train, y_test,
) = train_test_split(X_text, X_character, y, random_state=42, test_size=0.2)

class PersonalityDataset(Dataset):
    """Dataset pairing tokenized text with a character vector and trait labels.

    The three constructor arguments are parallel, index-aligned sequences; the
    dataset length is the length of ``text_data``.
    """

    def __init__(self, text_data, character_data, labels):
        self.text_data, self.character_data, self.labels = text_data, character_data, labels

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        """Build the sample dictionary for position ``idx``.

        The tokenizer tensors have their first dimension squeezed away
        (presumably a batch dimension of size 1 -- confirm against
        tokenize_text), and the character vector and labels are converted to
        float tensors. Everything is moved to the notebook-level ``device``.
        """
        encoding = self.text_data[idx]
        input_ids = encoding['input_ids'].squeeze(0).to(device)
        attention_mask = encoding['attention_mask'].squeeze(0).to(device)
        character_input = torch.tensor(self.character_data[idx], dtype=torch.float).to(device)
        labels = torch.tensor(self.labels[idx], dtype=torch.float).to(device)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'character_input': character_input,
            'labels': labels,
        }
    
    
# Create datasets
train_dataset = PersonalityDataset(
    text_data=X_text_train,
    character_data=X_character_train,
    labels=y_train,
)
test_dataset = PersonalityDataset(
    text_data=X_text_test,
    character_data=X_character_test,
    labels=y_test,
)

# Print a sample from the dataset
print("Sample from train_dataset:", train_dataset[0])
print("Sample from test_dataset:", test_dataset[0])

# Batches of 16; only the training loader shuffles.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)


In [None]:

# Define the model
class CustomRobertaModel(nn.Module):
    """RoBERTa text encoder combined with a character vector, regressing trait scores.

    The second output of the RoBERTa encoder is concatenated with a dense
    projection of ``character_input`` and passed through two hidden layers to
    produce ``num_labels`` outputs. When labels are supplied, the mean squared
    error against them is computed inside ``forward`` so that the Hugging Face
    ``Trainer`` receives a loss.

    Args:
        num_labels: number of values predicted per sample (default 5).
        character_input_dim: width of the character vector (default 106).
    """

    def __init__(self, num_labels=5, character_input_dim=106):
        super().__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        # Not applied in forward(); kept so the module's attributes are unchanged.
        self.dropout = nn.Dropout(0.5)
        self.character_dense = nn.Linear(character_input_dim, 128)
        # Read the encoder width from its config instead of hard-coding 768.
        encoder_width = self.roberta.config.hidden_size
        self.dense_1 = nn.Linear(encoder_width + 128, 256)
        self.dropout_1 = nn.Dropout(0.5)
        self.dense_2 = nn.Linear(256, 128)
        self.dropout_2 = nn.Dropout(0.5)
        # Honour num_labels (the output width used to be fixed at 5 regardless).
        self.output = nn.Linear(128, num_labels)
        self.loss_fn = nn.MSELoss()
        self.to(device)

    def forward(self, input_ids, character_input, attention_mask, labels = None):
        """Run the model on one batch.

        Args:
            input_ids: token ids fed to the RoBERTa encoder.
            character_input: character vectors, one row per sample.
            attention_mask: attention mask fed to the RoBERTa encoder.
            labels: optional targets; when given, the MSE loss is computed.

        Returns:
            ``(loss, logits)`` when ``labels`` is provided, otherwise ``logits`` alone.
        """
        # Index 1 of the encoder output is used as the sentence representation
        # (presumably the pooled output -- confirm against the transformers docs).
        roberta_output = self.roberta(input_ids, attention_mask=attention_mask)[1]
        character_output = torch.relu(self.character_dense(character_input))
        combined = torch.cat((roberta_output, character_output), dim=1)
        x = torch.relu(self.dense_1(combined))
        x = self.dropout_1(x)
        x = torch.relu(self.dense_2(x))
        x = self.dropout_2(x)
        logits = self.output(x)

        loss = None
        if labels is not None:
            # Follow the logits' device rather than a notebook-level global.
            labels = labels.to(logits.device)
            loss = self.loss_fn(logits, labels)

        return (loss, logits) if loss is not None else logits
# Initialize the model. The width of the character branch is taken from the one-hot
# matrix so it always matches the number of distinct characters in the data,
# instead of relying on the constructor's hard-coded default.
roberta_model = CustomRobertaModel(character_input_dim=X_character.shape[1]).to(device)


In [None]:
# Define the data collator function

# All the collator does is pad the data and assemble it into batches.
# input_ids is the tokenized text_data.

# i think we ran into issues when we tried 

# Define the data collator function
# inside this data collator features parameter, contains the features, and it has
# a dictionary with these 4 values

#where is the dictionary defined?           
# To debug this issue, we need to ensure that the dataset correctly returns dictionaries  
# with the expected keys ('input_ids', 'character_input', 'attention_mask', and 'labels').
# Let's go through the process step-by-step. 

# if dictionary is defined in __getitem__, i have a hunch that we don't actually
# use the __getitem__ function for some reason

# okay so there's no issue with the __getitem___ function...
# but there is a problem with what we pass into features



# Apparently `features` is a list of dictionaries, and each dictionary corresponds to a
# sample from the dataset, as returned by __getitem__.


# explanation: train dataset has 3 sample datapoints
# dataloader splits the train dataset into batches of size 2
# dataloader sends a batch over to data_collator in the form of features parameter

def data_collator(features):
    """Turn a list of per-sample dictionaries into a single batch dictionary.

    Args:
        features: list of dictionaries, one per sample, each holding the keys
            'input_ids', 'attention_mask', 'character_input' and 'labels'.

    Returns:
        dict with the same four keys. 'input_ids' and 'attention_mask' are
        padded to a common length (with the tokenizer's pad id and 0
        respectively); 'character_input' and 'labels' are stacked. All tensors
        are moved to the notebook-level ``device``.

    Raises:
        KeyError: if any sample lacks one of the required keys. The message
            names the offending sample and every key it is missing.
    """
    required_keys = ('input_ids', 'attention_mask', 'character_input', 'labels')

    # Fail once, up front, with a message that lists every missing key, rather
    # than printing diagnostics and then crashing on the first lookup.
    for i, feature in enumerate(features):
        missing = [key for key in required_keys if key not in feature]
        if missing:
            raise KeyError(
                f"Feature {i} is missing required keys {missing}; "
                f"available keys: {list(feature)}"
            )

    input_ids = [f['input_ids'] for f in features]
    attention_mask = [f['attention_mask'] for f in features]
    character_input = [f['character_input'] for f in features]
    labels = [f['labels'] for f in features]

    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id).to(device)
    padded_attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0).to(device)

    character_input = torch.stack(character_input).to(device)
    labels = torch.stack(labels).to(device)

    return {
        'input_ids': padded_input_ids,
        'character_input': character_input,
        'attention_mask': padded_attention_mask,
        'labels': labels,
    }



# When the model receives inputs that include the labels, it's supposed to produce a
# tuple of (loss, predictions), where the loss is a scalar. The trainer then uses the
# loss to calculate the gradients. In this case (or at least in my case when I get a
# similar error) the trainer appears to be trying to use the predictions, not the
# loss, to calculate the gradient. This appears to be because the model is not receiving
# the 'labels' as input and so is only producing a one-tuple of (predictions).
# You should be able to fix it by passing a value for "labels" in your collator.
# See for example transformers.DataCollatorForLanguageModeling.





# Training arguments
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Run length and batch sizes
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimisation schedule
    warmup_steps=50,
    weight_decay=0.01,
    # Log every 10 steps; evaluate and checkpoint every 50 steps
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
    # Do not let Trainer drop inputs it does not recognise
    remove_unused_columns=False,
    # CPU only
    no_cuda=True,
)

# Initialize Trainer
trainer = Trainer(
    model=roberta_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # i will change compute_metrics later if I want to track specific metrics
    compute_metrics=None,
)

# Train the model
trainer.train()



# okay so the problem is definitely 100% what dataset we are passing into data_collator
# it is missing some keys in the dictionary and I have no idea why
# hypothesis: some features are missing keys, some aren't?
# add a console.err debug that outputs the current features.
# or just output the current features everytime
# apparently [0] already has problems

# from transformers import DataCollatorForTokenClassification
# data_collator = DataCollatorForTokenClassification(tokenizer)



# ohhh, features from data_collator is a batch!


# # Print sample from the dataset to debug
# sample = train_dataset[0]
# print("Sample from dataset:", sample)

# Train the model







# Either it is the "grad can be implicitly created ..." problem (https://github.com/huggingface/transformers/issues/6749),
# or it is an issue with keeping labels in the feature dictionary, so should we just remove the labels key because it's handled separately in train?
    # Tried just removing the labels key everywhere; that doesn't work.

# Need to figure this part out.
# Apparently it's because of the model. Basically, my loss calculation needs to be in my model:
# it was not returning a loss value, which is necessary for Trainer to compute gradients.

# you need to return a loss value????


In [None]:
# Directory that trainer.save_model() writes into. It must be the same directory
# the loading cell reads `model.safetensors` and `training_args.bin` from; the
# previous value ("roberta_model.pt") did not match it.
PATH = "roberta_model"

# Save
trainer.save_model(PATH)

In [None]:
! python3 -m pip install safetensors

from transformers import Trainer, TrainingArguments
from safetensors.torch import load_file

In [None]:
PATH = "roberta_model"  # This is the directory containing your saved model

# Restore the trained weights into the existing model object and switch it to
# evaluation mode.
state_dict = load_file(f"{PATH}/model.safetensors")
roberta_model.load_state_dict(state_dict)
roberta_model.eval()
roberta_model.to(device)

# Load training arguments.
# training_args.bin is a pickled arguments object rather than a tensor file, so it
# cannot be read with weights_only=True (the default in recent PyTorch releases).
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
training_args = torch.load(f"{PATH}/training_args.bin", weights_only=False)

# Recreate the Trainer with the loaded model
trainer = Trainer(
    model=roberta_model,
    args=training_args,
    data_collator=data_collator,  # Use the same data_collator you used for training
    eval_dataset=test_dataset,
)

trainer.evaluate()


In [None]:
# character_encoded = onehot_encoder.transform([["okay"]]).toarray()

# # [0. 0. 0. 0. 0. 0.]

def prepare_input(text, character):
    """Build a one-sample PersonalityDataset for inference.

    The text goes through the same steps, in the same order, as the training
    data: lowercase, strip the ``<b>``/``</b>``/``<br>`` markup, replace runs of
    non-alphanumeric characters with a space, stem, ``clean_text``, tokenize.
    (Previously the stemming step was skipped here, so the model saw different
    text at inference time than it was trained on.)

    Args:
        text: raw utterance text, markup included.
        character: character name, encoded with the fitted ``onehot_encoder``
            (presumably must be a name seen when the encoder was fitted --
            verify against the encoder's settings).

    Returns:
        PersonalityDataset holding the single prepared sample with all-zero
        dummy labels.
    """
    # Mirror the training-time preprocessing exactly.
    lowered = text.lower()
    no_marks = lowered.replace('<b>', '').replace('</b>', '').replace('<br>', '')
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', no_marks)
    stemmed = stem_words(alnum_only)
    cleaned_text = clean_text(stemmed)
    tokenized = tokenize_text(cleaned_text)

    # One-hot encode the character
    character_encoded = onehot_encoder.transform([[character]]).toarray()

    # Create a dataset with a single item; the labels are placeholders only.
    dataset = PersonalityDataset([tokenized], [character_encoded[0]], [[0, 0, 0, 0, 0]])  # Dummy labels

    return dataset

def predict(trainer, dataset):
    """Run ``trainer.predict`` on ``dataset`` and return the first prediction row.

    Args:
        trainer: object exposing ``predict(dataset)`` whose result has a
            ``predictions`` attribute.
        dataset: the dataset to predict on (expected to hold a single sample).

    Returns:
        The element at index 0 of the result's ``predictions``.
    """
    output = trainer.predict(dataset)
    first_row = output.predictions[0]
    return first_row

# Example usage: one raw scene (markup included) and the character to score.
new_text = "<b>s01_e01_c01(1) for Joey Tribbiani</b><br><br><b>Ross Geller</b>: No!! Okay?! Why does everyone keep fixating on that? She didn't know, how should I know?<br><br><b>Chandler Bing</b>: Sometimes I wish I was a lesbian... (They all stare at him.) Did I say that out loud?<br><br><b>Ross Geller</b>: I told mom and dad last night, they seemed to take it pretty well.<br><br><b>Monica Geller</b>: Oh really, so that hysterical phone call I got from a woman at sobbing 3:00 A.M., 'I'll never have grandchildren, I'll never have grandchildren.' was what? A wrong number?<br><br><b>Ross Geller</b>: Sorry.<br><br><b>Joey Tribbiani</b>: Alright Ross, look. You're feeling a lot of pain right now. You're angry. You're hurting. Can I tell you what the answer is?<br><br>(Ross gestures his consent.)<br><br><b>Joey Tribbiani</b>: Strip joint! C'mon, you're single! Have some hormones!<br><br><b>Ross Geller</b>: I don't want to be single, okay? I just... I just- I just wanna be married again!<br><br>(Rachel enters in a wet wedding dress and starts to search the room.)<br><br>"
new_character = "Joey Tribbiani"
prediction_dataset = prepare_input(new_text, new_character)

# Make prediction
predictions = predict(trainer, prediction_dataset)

# Print the predictions, one trait per line, in the order of the label columns.
print("Predictions:")
for trait_position, trait_name in enumerate(("cAGR", "cCON", "cEXT", "cOPN", "cNEU")):
    print(f"{trait_name}:", predictions[trait_position])

# the real output of [0] is supposed to be 1,1,0,0,1

In [None]:
# Show the tokenizer output stored for the row labelled 0.
first_tokenized_text = friendspersona_full['tokenized_text'][0]
print(first_tokenized_text)