## Models experiment

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import torch
import torch.nn as nn
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from transformers import (
    AutoModel, AutoConfig, AutoTokenizer,
    AdamW, get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    Trainer, TrainingArguments
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data = pd.read_csv('preprocessed_text_df.csv')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3006566 entries, 0 to 3006565
Data columns (total 18 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   author                    object 
 1   body                      object 
 2   word_count                float64
 3   word_count_quoteless      float64
 4   lang                      object 
 5   N_exaggeration_count      int64  
 6   A_hedges                  float64
 7   agreeableness             float64
 8   openness                  float64
 9   conscientiousness         float64
 10  extraversion              float64
 11  neuroticism               float64
 12  agreeableness_scaled      int64  
 13  openness_scaled           int64  
 14  conscientiousness_scaled  int64  
 15  extraversion_scaled       int64  
 16  neuroticism_scaled        int64  
 17  clean_text                object 
dtypes: float64(8), int64(6), object(4)
memory usage: 412.9+ MB


In [7]:
# Convert columns to the correct data types
data["author"] = data["author"].astype(str)
data["clean_text"] = data["clean_text"].astype(str)
data["agreeableness"] = data["agreeableness"].astype(float)
data["openness"] = data["openness"].astype(float)
data["conscientiousness"] = data["conscientiousness"].astype(float)
data["extraversion"] = data["extraversion"].astype(float)
data["neuroticism"] = data["neuroticism"].astype(float)

# Remove rows with missing values
data = data.dropna(subset=["author", "clean_text", "agreeableness", "openness", "conscientiousness", "extraversion", "neuroticism"])

# # Group comments by user and calculate average personality scores
# user_data = df[df["author"].isin(valid_users)].groupby("author").agg({
#     "clean_text": " ".join,
#     "agreeableness": "mean",
#     "openness": "mean",
#     "conscientiousness": "mean",
#     "extraversion": "mean",
#     "neuroticism": "mean"
# }).reset_index()


In [8]:
# Set the desired number of comments to be sampled per user
comments_per_user = 100 #400

# Set the minimum number of words threshold
min_comment_length = 5

# Group the data by user_id
grouped_data = data.groupby('author')

# Initialize an empty DataFrame to store the sampled data
sampled_data = pd.DataFrame(columns=data.columns)

# Iterate over each user group
for user_id, group in grouped_data:
    # Filter comments based on the minimum number of words threshold
    filtered_comments = group[group['clean_text'].apply(lambda x: len(x.split())) >= min_comment_length]
    # Sample up to 'comments_per_user' comments
    sampled_group = filtered_comments.sample(min(len(filtered_comments), comments_per_user))
    # Append the sampled group to the sampled data
    sampled_data = sampled_data.append(sampled_group, ignore_index=True)

# Shuffle the sampled data to mix comments from different users
sampled_data = sampled_data.sample(frac=1).reset_index(drop=True)


  sampled_data = sampled_data.append(sampled_group, ignore_index=True)


In [9]:
sampled_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124464 entries, 0 to 124463
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   author                    124464 non-null  object 
 1   body                      124464 non-null  object 
 2   word_count                124464 non-null  float64
 3   word_count_quoteless      124464 non-null  float64
 4   lang                      124359 non-null  object 
 5   N_exaggeration_count      124464 non-null  object 
 6   A_hedges                  124464 non-null  float64
 7   agreeableness             124464 non-null  float64
 8   openness                  124464 non-null  float64
 9   conscientiousness         124464 non-null  float64
 10  extraversion              124464 non-null  float64
 11  neuroticism               124464 non-null  float64
 12  agreeableness_scaled      124464 non-null  object 
 13  openness_scaled           124464 non-null  o

### First experiment with Bert model

In [10]:
# Define the BigFiveDataset class
class BigFiveDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.loc[idx, 'clean_text']
        labels = self.data.loc[idx, ['agreeableness', 'openness', 'conscientiousness', 'extraversion', 'neuroticism']].values.astype(float)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        sample = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

        return sample


In [11]:
# Set up the model, tokenizer, and configuration
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, config=config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
train_data, test_data = train_test_split(sampled_data, test_size=0.2, random_state=42)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)


In [13]:
test_data

Unnamed: 0,author,body,word_count,word_count_quoteless,lang,N_exaggeration_count,A_hedges,agreeableness,openness,conscientiousness,extraversion,neuroticism,agreeableness_scaled,openness_scaled,conscientiousness_scaled,extraversion_scaled,neuroticism_scaled,clean_text
0,Fearless_Prince,I avoid mirrors like the goddamn plague becaus...,207.0,161.0,en,0,0.006211,50.0,87.0,89.0,49.0,4.0,3,5,5,3,1,i avoid mirror like the goddamn plague because...
1,Lifeisfallingapart,You can't see much of anything. Just BJ and a...,14.0,14.0,en,0,0.000000,65.0,61.0,8.0,4.0,98.0,4,4,1,1,5,you cannot see much of anything just bj and al...
2,5il3nc3r,Pretty sure Ninja is the fastest. Swashbuckler...,12.0,12.0,en,0,0.000000,77.0,9.0,54.0,3.0,32.0,4,1,3,1,2,pretty sure ninja is the fastest swashbuckler ...
3,laidymondegreen,Maybe they do it after you show yourself out.,9.0,9.0,en,0,0.111111,91.0,61.0,21.0,42.0,32.0,5,4,2,3,2,maybe they do it after you show yourself out
4,9Hero,A SJW in its natural habitat.,6.0,6.0,en,0,0.000000,8.0,75.0,1.0,14.0,95.0,1,4,1,1,5,a sjw in it natural habitat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24888,turncloak471,I agree that trying to be tactful or decisive ...,68.0,68.0,en,0,0.029412,6.0,91.0,88.0,15.0,4.0,1,5,5,1,1,i agree that trying to be tactful or decisive ...
24889,nrgstorm,The last time I was up there was shortly after...,38.0,37.0,en,0,0.000000,28.0,57.0,69.0,2.0,72.0,2,3,4,1,4,the last time i wa up there wa shortly after t...
24890,AbstractStateMachine,Oh shit did I forget to charge the spare batte...,11.0,11.0,en,0,0.000000,80.0,92.0,25.0,84.0,78.0,4,5,2,5,4,oh shit did i forget to charge the spare batte...
24891,d4m1t,I love your wallpaper! Link please?,6.0,6.0,en,0,0.000000,34.0,84.0,5.0,82.0,27.0,2,5,1,5,2,i love your wallpaper link please


In [14]:
max_length = 100

In [15]:
# Create the custom datasets
train_dataset = BigFiveDataset(train_data, tokenizer, max_length)
test_dataset = BigFiveDataset(test_data, tokenizer, max_length)

In [16]:
# Create the data loaders
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False
)

In [17]:
# # # !pip install --upgrade accelerate
# !pip uninstall -y transformers accelerate
# !pip install transformers accelerate

In [18]:
# !pip install git+https://github.com/huggingface/accelerate
# !pip install transformers==4.28.0

In [19]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch"
)

In [20]:
# Define the regression model
class BigFiveModel(nn.Module):
    def __init__(self, model):
        super(BigFiveModel, self).__init__()
        self.model = model
        self.regression = nn.Linear(config.hidden_size, 5)  # 5 traits

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.regression(pooled_output)
        return logits


In [21]:
# Instantiate the regression model
regression_model = BigFiveModel(model)

# Define the MSE loss function
loss_fn = nn.MSELoss()

# Create the optimizer and scheduler
optimizer = AdamW(regression_model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=len(train_dataloader) * training_args.num_train_epochs
)




In [22]:
# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
regression_model.to(device)

# Training loop
regression_model.train()

for epoch in range(training_args.num_train_epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device).long()  # Convert to Long
        attention_mask = batch['attention_mask'].to(device).float()
        labels = batch['labels'].to(device).float()

        optimizer.zero_grad()

        logits = regression_model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()



KeyboardInterrupt



In [None]:
# Evaluation loop
regression_model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        logits = regression_model(input_ids, attention_mask)
        predictions.extend(logits.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

mse = mean_squared_error(true_labels, predictions)
r2 = r2_score(true_labels, predictions)

print("MSE:", mse)
print("R2:", r2)

#### Second experiment with Glove, SVD and Bert, Multi regression model

In [None]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(sampled_data, test_size=0.2, random_state=42)

In [None]:
# Calculate the percentile sequence length that covers 80% of the comment lengths
percentile = 90  # Set the desired percentile value
sequence_lengths = sampled_data['clean_text'].apply(lambda x: len(x.split()))
sequence_length_threshold = np.percentile(sequence_lengths, percentile)

# Determine the maximum sequence length needed to cover the specified percentile
max_sequence_length = int(sequence_length_threshold)

In [None]:
max_sequence_length

112

In [None]:
# Decided to go on fixed param
# Define constants
max_sequence_length = 100  # Maximum sequence length for padding/truncation
embedding_dim = 100  # Dimensionality of GLOVE word embeddings
num_components = 50  # Number of components for SVD

In [None]:
# Tokenize text using GLOVE word embeddings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data["clean_text"])

In [None]:
# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_data["clean_text"])
test_sequences = tokenizer.texts_to_sequences(test_data["clean_text"])

In [None]:
# Pad sequences to the maximum sequence length
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

In [None]:
# Load GLOVE word embeddings
glove_embeddings = {}
with open("glove.6B.100d.txt", encoding="utf8") as file:  # Replace with path to your GLOVE file
    for line in file:
        values = line.split()
        word = values[0]
        embedding = np.asarray(values[1:], dtype="float32")
        glove_embeddings[word] = embedding


In [None]:
# Create an embedding matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.046539  ,  0.61966002,  0.56647003, ..., -0.37616   ,
        -0.032502  ,  0.80620003],
       ...,
       [-0.13805   ,  0.55756003, -0.80915999, ...,  0.14061999,
         0.33471999, -0.18887   ],
       [ 0.0022    , -0.29567999,  0.77519   , ...,  1.12779999,
         1.76489997,  1.07439995],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [None]:
# Apply SVD for dimensionality reduction
svd = TruncatedSVD(n_components=num_components, random_state=42)
train_features = svd.fit_transform(train_sequences)
test_features = svd.transform(test_sequences)

In [None]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Define batch size and maximum sequence length per batch
batch_size = 32
max_batch_sequence_length = 128

In [None]:
train_bert_outputs = []
num_train_batches = int(np.ceil(len(train_data) / batch_size))
for i in range(num_train_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(train_data))
    train_batch = train_data.iloc[start_idx:end_idx]
    train_encoded_batch = tokenizer.batch_encode_plus(
        train_batch["clean_text"].tolist(),
        truncation=True,
        padding=True,
        max_length=max_batch_sequence_length,
        return_tensors="tf",
        return_token_type_ids=True  # Include token_type_ids for segment differentiation
    )
    train_bert_inputs_batch = {
        "input_ids": train_encoded_batch["input_ids"],
        "attention_mask": train_encoded_batch["attention_mask"],
        "token_type_ids": train_encoded_batch["token_type_ids"]
    }
    train_bert_outputs_batch = bert_model(train_bert_inputs_batch)[0]
    train_bert_outputs.append(train_bert_outputs_batch)


KeyboardInterrupt



In [None]:
train_bert_outputs = np.concatenate(train_bert_outputs, axis=0)

In [None]:
# Process test data in batches
test_bert_outputs = []
num_test_batches = int(np.ceil(len(test_data) / batch_size))
for i in range(num_test_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(test_data))
    test_batch = test_data.iloc[start_idx:end_idx]
    test_encoded_batch = tokenizer.batch_encode_plus(
        test_batch["clean_text"].tolist(),
        truncation=True,
        padding=True,
        max_length=max_batch_sequence_length,
        return_tensors="tf",
        return_token_type_ids=True  # Include token_type_ids for segment differentiation
    )
    test_bert_inputs_batch = {
        "input_ids": test_encoded_batch["input_ids"],
        "attention_mask": test_encoded_batch["attention_mask"],
        "token_type_ids": test_encoded_batch["token_type_ids"]
    }
    test_bert_outputs_batch = bert_model(test_bert_inputs_batch)[0]
    test_bert_outputs.append(test_bert_outputs_batch)


In [None]:
test_bert_outputs = np.concatenate(test_bert_outputs, axis=0)

In [None]:
# Get BERT embeddings
train_bert_inputs = [train_encoded["input_ids"], train_encoded["attention_mask"]]
train_bert_outputs = bert_model(train_bert_inputs)[0]
test_bert_inputs = [test_encoded["input_ids"], test_encoded["attention_mask"]]
test_bert_outputs = bert_model(test_bert_inputs)[0]

In [None]:
# Define the model architecture
input_glove = Input(shape=(num_components,), name="glove_input")
input_bert = Input(shape=(max_sequence_length, 768), name="bert_input")

In [None]:
# GLOVE branch
glove_layer = Dense(128, activation="relu")(input_glove)

# BERT branch
bert_lstm = LSTM(128, return_sequences=True)(input_bert)
bert_layer = LSTM(64)(bert_lstm)

# Merge branches
merged_layer = concatenate([glove_layer, bert_layer])
output_layer1 = Dense(1, name="output_openness")(merged_layer)
output_layer2 = Dense(1, name="output_conscientiousness")(merged_layer)
output_layer3 = Dense(1, name="output_extraversion")(merged_layer)
output_layer4 = Dense(1, name="output_agreeableness")(merged_layer)
output_layer5 = Dense(1, name="output_neuroticism")(merged_layer)

# Create the model
model = Model(inputs=[input_glove, input_bert], outputs=[output_layer1, output_layer2, output_layer3,
                                                          output_layer4, output_layer5])

# Compile the model
model.compile(optimizer=Adam(), loss=MeanSquaredError())

# Define the checkpoint path
checkpoint_path = "model_checkpoints/model_checkpoint.h5"

# Create a ModelCheckpoint callback
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True,
                                      monitor='val_loss', save_best_only=True)

# Train the model with the checkpoint callback
model.fit([train_features, train_bert_outputs], [train_data["openness"], train_data["conscientiousness"],
                                                 train_data["extraversion"], train_data["agreeableness"],
                                                 train_data["neuroticism"]],
          epochs=10, batch_size=32, validation_split=0.2, callbacks=[checkpoint_callback])

# Evaluate the model
predictions = model.predict([test_features, test_bert_outputs])
mse = mean_squared_error([test_data["openness"], test_data["conscientiousness"],
                          test_data["extraversion"], test_data["agreeableness"],
                          test_data["neuroticism"]], predictions)
print("MSE:", mse)