<a href="https://colab.research.google.com/github/ElinDillen/activism-and-politics-in-Oscar-Speeches/blob/main/Models_to_test_out.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Roberta and sentence-transformers

In [None]:
# This cell embeds the text automatically with roberta-base (or another encoder) and classifies it together with our linguistic features.
# Simplified code for trying things out.

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# NOTE: `df` must already exist before this cell runs; the split section below
# reads a text column, four linguistic feature columns and a label column
# from it. Load it first, e.g.:
# df = pd.read_csv("your_data.csv")

# Choose a pre-trained encoder checkpoint; the tokenizer is loaded from the same name
model_name = "roberta-base" # Alternative: sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a dataset that combines text and features
class CombinedFeaturesDataset(Dataset):
    """Pairs each text with its linguistic feature vector and class label.

    Indexing returns a dict holding every tensor produced by the tokenizer
    (with the leading batch dimension removed), plus ``linguistic_features``
    (float tensor) and ``labels`` (long tensor).
    """

    def __init__(self, texts, linguistic_features, labels, tokenizer, max_length=128):
        self.texts, self.linguistic_features, self.labels = texts, linguistic_features, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One example per text entry.
        return len(self.texts)

    def __getitem__(self, idx):
        # Every example is padded/truncated to exactly `max_length` tokens.
        tokenized = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer returns tensors with a batch dimension of 1; drop it.
        sample = {}
        for field_name, tensor in tokenized.items():
            sample[field_name] = tensor.squeeze(0)

        sample["linguistic_features"] = torch.tensor(
            self.linguistic_features[idx], dtype=torch.float
        )
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Split your data
# Text, hand-crafted linguistic features and class labels are pulled out as
# parallel arrays so that one split call keeps the rows aligned.
texts = df['text'].values
features = df[['verb_count', 'hedge_count', 'article_count', 'sentiment_score']].values
labels = df['personality_label'].values

# Hold out 20% for evaluation. The seed is fixed so the partition (and
# therefore the reported metrics) is reproducible from run to run.
X_train_texts, X_test_texts, X_train_features, X_test_features, y_train, y_test = train_test_split(
    texts, features, labels, test_size=0.2, random_state=42
)

# Create datasets
train_dataset = CombinedFeaturesDataset(X_train_texts, X_train_features, y_train, tokenizer)
test_dataset = CombinedFeaturesDataset(X_test_texts, X_test_features, y_test, tokenizer)

# Custom model that combines text embeddings with linguistic features
class CombinedPersonalityModel(torch.nn.Module):
    """Classifier that fuses a transformer text embedding with linguistic features.

    The hidden state at sequence position 0 is concatenated with the
    linguistic feature vector and passed through a two-layer MLP head.
    """

    def __init__(self, model_name, linguistic_feature_size, num_labels):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained(model_name)
        self.text_encoder_dim = self.text_encoder.config.hidden_size

        # Fusion head: [text embedding ; linguistic features] -> 256 -> num_labels
        fused_width = self.text_encoder_dim + linguistic_feature_size
        self.fusion = torch.nn.Sequential(
            torch.nn.Linear(fused_width, 256),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(256, num_labels),
        )

    def forward(self, input_ids, attention_mask, linguistic_features, labels=None):
        """Run the fused classifier.

        Returns softmax class probabilities when ``labels`` is None, otherwise
        a ``(loss, logits)`` tuple where the loss is cross-entropy.
        """
        encoded = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        first_token = encoded.last_hidden_state[:, 0, :]

        fused = torch.cat([first_token, linguistic_features], dim=1)
        logits = self.fusion(fused)

        if labels is None:
            return torch.nn.functional.softmax(logits, dim=1)

        criterion = torch.nn.CrossEntropyLoss()
        return criterion(logits, labels), logits

# Build the fusion model: the feature width comes from the training matrix
# and the number of classes from the distinct label values.
model = CombinedPersonalityModel(
    model_name,
    X_train_features.shape[1],
    len(np.unique(labels)),
)

# Define a custom Trainer to handle your combined inputs
class CombinedTrainer(Trainer):
    """Trainer that forwards the extra ``linguistic_features`` input to the model."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training loss for one batch.

        Args:
            model: The combined model; called with labels so it returns
                ``(loss, logits)``.
            inputs: Batch dict with ``input_ids``, ``attention_mask``,
                ``linguistic_features`` and ``labels``.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted because newer transformers Trainers
                pass it to ``compute_loss``; unused here since the model
                computes its own mean loss.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true,
            where ``outputs`` is the model's ``(loss, logits)`` tuple.
        """
        # Read the labels without mutating the batch dict.
        labels = inputs["labels"]
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            linguistic_features=inputs["linguistic_features"],
            labels=labels,
        )

        # With labels supplied the model always returns a (loss, logits) tuple.
        loss = outputs[0]
        return (loss, outputs) if return_outputs else loss

# Training arguments
# The keyword that schedules evaluation was renamed from `evaluation_strategy`
# to `eval_strategy` in newer transformers releases. Pick whichever name the
# installed TrainingArguments dataclass declares so this cell runs on both.
import dataclasses

_eval_strategy_key = (
    "eval_strategy"
    if any(f.name == "eval_strategy" for f in dataclasses.fields(TrainingArguments))
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./combined_personality_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},  # evaluate on the held-out set after every epoch
)

# Set up trainer
# The CombinedTrainer subclass is used so that the extra `linguistic_features`
# input is passed to the model alongside the tokenized text.
trainer = CombinedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()
# Weights and tokenizer are written to the same directory so they can be
# reloaded together later.
# NOTE(review): `model` is a plain torch.nn.Module rather than a transformers
# PreTrainedModel — presumably only its state dict is saved here; confirm
# what the loading code needs in order to rebuild the model.
model_save_path = "./combined_personality_model_final"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")

In [None]:
#to load and use the model

from transformers import AutoTokenizer
import torch

# Load the saved model and tokenizer
import os

from transformers.modeling_utils import load_state_dict

model_path = "./combined_personality_model_final"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# CombinedPersonalityModel is a plain torch.nn.Module, so it has no
# `from_pretrained`. Rebuild the architecture with the same hyperparameters
# that were used for training (taken from the training cell's variables),
# then restore the weights that `trainer.save_model` wrote.
model = CombinedPersonalityModel(
    model_name=model_name,
    linguistic_feature_size=X_train_features.shape[1],
    num_labels=len(np.unique(labels)),
)

# Trainer writes `model.safetensors` by default and `pytorch_model.bin` when
# safetensors saving is disabled (or on older versions); accept either.
_weights_file = os.path.join(model_path, "model.safetensors")
if not os.path.exists(_weights_file):
    _weights_file = os.path.join(model_path, "pytorch_model.bin")
model.load_state_dict(load_state_dict(_weights_file))

# Set to evaluation mode
model.eval()

# Function to make predictions with the loaded model
def predict_personality(text, linguistic_features):
    """Predict the class index for a single text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The raw text to classify.
        linguistic_features: Feature values for this text; they are wrapped
            in a batch of size one.

    Returns:
        The index of the highest-scoring class as a Python int.
    """
    # Same tokenization settings as the dataset class: fixed length of 128.
    encoded = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Add a batch dimension so the features line up with the tokenized text.
    feature_batch = torch.tensor([linguistic_features], dtype=torch.float)

    # Inference only: no gradients needed.
    with torch.no_grad():
        scores = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            linguistic_features=feature_batch,
            labels=None,
        )

    return torch.argmax(scores, dim=1).item()

# Minej/bert-base-personality

https://huggingface.co/Minej/bert-base-personality

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments

# Load tokenizer and encoder
model_name = "Minej/bert-base-personality"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_encoder = BertModel.from_pretrained(model_name)

# --- Per-word normalization of linguistic features ---
# Whitespace-separated token count of each text.
df["word_count"] = df["text"].apply(lambda x: len(str(x).split()))
# List of raw features
raw_feats = ["hedge_count", "verb_count", "article_count", "sentiment_score"]
# Create per-word versions; the divisor is clipped to >= 1 so empty texts do
# not cause a division by zero.
for col in raw_feats:
    df[col + "_per_word"] = df[col] / df["word_count"].clip(lower=1)

# Final features to use
feature_cols = [f + "_per_word" for f in raw_feats]
linguistic_features = df[feature_cols].values

# Scale features for stability.
# The scaler is fitted on the training rows only, so no test-set statistics
# leak into training. Splitting row indices with the same `test_size` and
# `random_state` as the data split further down selects the same training
# rows — keep the two calls in sync.
train_rows, _ = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(linguistic_features[train_rows])
linguistic_features = scaler.transform(linguistic_features)

# Labels: multi-label binarized columns
label_cols = ['Extroversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']
labels = df[label_cols].values

# Text
texts = df["text"].values

# --- Dataset class ---
class CombinedPersonalityDataset(Dataset):
    """Pairs each text with its linguistic features and multi-label trait targets.

    Indexing returns a dict with the tokenizer's tensors (batch dimension
    removed), ``linguistic_features`` and ``labels``; both extras are float
    tensors.
    """

    def __init__(self, texts, linguistic_features, labels, tokenizer, max_length=128):
        self.texts, self.features, self.labels = texts, linguistic_features, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One example per text entry.
        return len(self.texts)

    def __getitem__(self, idx):
        # Fixed-length encoding: pad or truncate to `max_length` tokens.
        tokenized = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Strip the size-1 batch dimension added by return_tensors="pt".
        sample = {}
        for field_name, tensor in tokenized.items():
            sample[field_name] = tensor.squeeze(0)

        sample["linguistic_features"] = torch.tensor(self.features[idx], dtype=torch.float)
        # Float targets, as required by the BCE-with-logits loss used by the model.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# --- Model class ---
class CombinedPersonalityModel(torch.nn.Module):
    """Multi-label trait classifier fusing a BERT embedding with linguistic features.

    Args:
        bert_model: An already-loaded encoder exposing ``config.hidden_size``
            and returning an object with ``last_hidden_state``.
        linguistic_feature_size: Width of the linguistic feature vector.
        num_traits: Number of independent binary outputs.
        freeze_encoder: When true (the default, matching the previous
            hard-coded behavior) the encoder runs without gradient tracking,
            so only the fusion head is trained. Pass ``False`` to fine-tune
            the encoder as well.
    """

    def __init__(self, bert_model, linguistic_feature_size, num_traits=5, freeze_encoder=True):
        super().__init__()
        self.text_encoder = bert_model
        self.text_dim = self.text_encoder.config.hidden_size
        self.freeze_encoder = freeze_encoder

        # Fusion head: [CLS embedding ; linguistic features] -> 256 -> num_traits
        self.fusion = torch.nn.Sequential(
            torch.nn.Linear(self.text_dim + linguistic_feature_size, 256),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(256, num_traits)
        )

    def _encode(self, input_ids, attention_mask):
        """Return the hidden state at position 0 ([CLS]) for each sequence."""
        outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

    def forward(self, input_ids, attention_mask, linguistic_features, labels=None):
        """Run the fused classifier.

        Returns:
            ``(loss, logits)`` with a BCE-with-logits loss when ``labels`` is
            given, otherwise the per-trait sigmoid probabilities.
        """
        if self.freeze_encoder:
            # No graph is built through the encoder, so it receives no gradients.
            with torch.no_grad():
                cls_embedding = self._encode(input_ids, attention_mask)
        else:
            # Leave the ambient grad mode untouched so evaluation under
            # torch.no_grad() still skips graph construction.
            cls_embedding = self._encode(input_ids, attention_mask)

        combined = torch.cat([cls_embedding, linguistic_features], dim=1)
        logits = self.fusion(combined)

        if labels is None:
            return torch.sigmoid(logits)

        loss_fn = torch.nn.BCEWithLogitsLoss()
        return loss_fn(logits, labels), logits

# --- Split data ---
# Texts, scaled linguistic features and multi-label targets are split in one
# call so their rows stay aligned; 20% is held out and random_state=42 makes
# the partition reproducible.
X_train, X_test, F_train, F_test, Y_train, Y_test = train_test_split(
    texts, linguistic_features, labels, test_size=0.2, random_state=42
)

# Wrap each partition in the dataset class (default max_length of 128 tokens).
train_dataset = CombinedPersonalityDataset(X_train, F_train, Y_train, tokenizer)
test_dataset = CombinedPersonalityDataset(X_test, F_test, Y_test, tokenizer)

# --- Trainer setup ---
class CombinedTrainer(Trainer):
    """Trainer that forwards the extra ``linguistic_features`` input to the model."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the multi-label loss for one batch.

        Args:
            model: The combined model; called with labels so it returns
                ``(loss, logits)``.
            inputs: Batch dict with ``input_ids``, ``attention_mask``,
                ``linguistic_features`` and ``labels``.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted because newer transformers Trainers
                pass it to ``compute_loss``; unused here since the model
                computes its own mean loss.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Read the labels without mutating the batch dict.
        labels = inputs["labels"]
        loss, logits = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            linguistic_features=inputs["linguistic_features"],
            labels=labels,
        )
        # During evaluation the Trainer drops the first element of a tuple
        # output (assumed to be the loss) and treats the rest as predictions.
        # Returning (logits, labels) here would therefore hand the *labels* to
        # compute_metrics as predictions; the tuple must lead with the loss.
        outputs = (loss, logits)
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Turn raw evaluation output into macro-F1 and accuracy.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Logits are passed through a
            sigmoid and thresholded at 0.5 to obtain binary predictions.

    Returns:
        A dict with the keys ``"f1"`` (macro-averaged) and ``"accuracy"``.
    """
    from sklearn.metrics import f1_score, accuracy_score

    logits, labels = eval_pred
    probabilities = torch.sigmoid(torch.tensor(logits))
    preds = (probabilities > 0.5).int().numpy()

    macro_f1 = f1_score(labels, preds, average="macro")
    accuracy = accuracy_score(labels, preds)
    return {"f1": macro_f1, "accuracy": accuracy}

# The keyword that schedules evaluation was renamed from `evaluation_strategy`
# to `eval_strategy` in newer transformers releases. Pick whichever name the
# installed TrainingArguments dataclass declares so this cell runs on both.
import dataclasses

_eval_strategy_key = (
    "eval_strategy"
    if any(f.name == "eval_strategy" for f in dataclasses.fields(TrainingArguments))
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./minej_personality_model",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    **{_eval_strategy_key: "epoch"},  # evaluate on the held-out set after every epoch
)

# --- Train ---
# The pre-loaded `bert_encoder` is handed to the model; the feature width and
# number of traits are read from the data instead of being hard-coded.
model = CombinedPersonalityModel(
    bert_model=bert_encoder,
    linguistic_feature_size=F_train.shape[1],
    num_traits=labels.shape[1]
)

# CombinedTrainer passes the extra `linguistic_features` input to the model;
# compute_metrics reports macro-F1 and accuracy at each evaluation.
trainer = CombinedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# --- Save ---
# Model weights and tokenizer are written to the same directory.
trainer.save_model("./minej_personality_combined_model")
tokenizer.save_pretrained("./minej_personality_combined_model")


KeyboardInterrupt: 

# Flan-T5

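I saw a comment claiming that Flan-T5 works well for this task and wanted to try it out. It does not work without fine-tuning: given the prompt below, the model simply returns the five trait names (see the output at the end of this section).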

In [None]:
!pip install transformers



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the Flan-T5 base checkpoint and its tokenizer for prompt-based use
# (no fine-tuning is done in this notebook section).
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def prompt_personality(text):
    """Prompt the module-level seq2seq model for a Big Five reading of ``text``.

    The text is embedded in a fixed instruction, tokenized with the
    module-level ``tokenizer`` and passed to ``model.generate`` with a limit
    of 50 new tokens.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    prompt = f"""Given this text: \"{text}\", classify the user's personality based on the Big Five traits (Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism). Provide a list of the top 5 most prominent traits with a brief explanation for each."""
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    generated = model.generate(**encoded_prompt, max_new_tokens=50)
    # Only one prompt was submitted, so the answer is the first sequence.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:
prompt_personality("I have been presented with many situations that put me outside of my comfort zone while living abroad. One of it's key aspects is the inevitable confrontation with language conflict. Even if you speak a bit of language, encountering and interacting with the locals will remain challenging. These situations have made me more adaptable, more receptive to what words and explanations are more easily understood. One of the ways to overcome this barrier was getting to know the key cultural concepts or elements that people from a specific country/region/city know, may it be a specific word, a song, a movie, a saying, random fact about the local football team - all these things seem to create a connection, as it shows that you care and are curious about the people surrounding you. I have recently started volunteering at a cultural center in which my tasks include bartending. I have no previous experience of bartending and the majority of visitors speak a language that I do not know. Even when I applied for this position I knew that I will inevitably makes mistakes, just as one always does when starting a new thing. During my first shifts, I miscalculated prices, poured drinks to the wrong cups, had to deal with foamy and fussy beers and had to learn how to deal with less satisfied visitors.This experience, together with many other volunteering opportunities that I had done in the past, mainly taught to not be afraid of making mistakes, as it is the only way to learn. Additionally, I have noticed is that in such instances, I always have to slow myself down, calm my mind and pay attention to what I am doing, this way not letting the stress in and helping me learn and perform better. Finally and perhaps most importantly, the place and the people determine the actual experience and it's perception. 
If you are in a good place, that is welcoming and understanding, where you feel appreciated, you are less likely to make mistakes and even if you do, the lesson will be learned much more quickly. Perhaps the most challenging leadership role I had to take on was becoming a curator for my previous university's student representations group. My task included finding students who would represent their courses and would try to improve the study experience in the meetings with professors. The main issue I encountered the initial fear of reaching out to people and the perception of my role itself. The student group I was in, was often acting as a very bureaucratic entity that would not make any impactful change and I felt that some of the students had the same view of me.")

'Openness Conscientiousness Extraversion Agreeableness Neuroticism'