In [8]:
import pandas as pd

# Load the dataset (replace with your dataset file)
data = pd.read_csv("mypersonality_final.csv", encoding="latin1")

# Remove the first set of personality columns
data = data.drop(columns=["sEXT", "sNEU", "sAGR", "sCON", "sOPN"])

# Remove the additional columns
columns_to_remove = ["#AUTHID","NETWORKSIZE", "BETWEENNESS", "NBETWEENNESS", "DENSITY", "BROKERAGE", "NBROKERAGE", "TRANSITIVITY"]
data.drop(columns=columns_to_remove, inplace=True)

# Replace 'n' with 0 and 'y' with 1 for the remaining columns
columns_to_replace = ["cEXT", "cNEU", "cAGR", "cCON", "cOPN"]
data[columns_to_replace] = data[columns_to_replace].replace({'n': 0, 'y': 1})

# Remove the "c" from the columns "cEXT", "cNEU", "cAGR", "cCON", "cOPN"
data.columns = data.columns.str.replace('c', '')

# Remove the "DATE" column
data = data.drop(columns=["DATE"])

# Print the modified dataset
print(data.head())

data.head(15)



                                              STATUS  EXT  NEU  AGR  CON  OPN
0                        likes the sound of thunder.    0    1    0    0    1
1  is so sleepy it's not even funny that's she ca...    0    1    0    0    1
2  is sore and wants the knot of muscles at the b...    0    1    0    0    1
3         likes how the day sounds in this new song.    0    1    0    0    1
4                                        is home. <3    0    1    0    0    1


Unnamed: 0,STATUS,EXT,NEU,AGR,CON,OPN
0,likes the sound of thunder.,0,1,0,0,1
1,is so sleepy it's not even funny that's she ca...,0,1,0,0,1
2,is sore and wants the knot of muscles at the b...,0,1,0,0,1
3,likes how the day sounds in this new song.,0,1,0,0,1
4,is home. <3,0,1,0,0,1
5,www.thejokerblogs.com,0,1,0,0,1
6,"saw a nun zombie, and liked it. Also, *PROPNAM...",0,1,0,0,1
7,is in Kentucky. 421 miles into her 1100 mile j...,0,1,0,0,1
8,was about to finish a digital painting before ...,0,1,0,0,1
9,is celebrating her new haircut by listening to...,0,1,0,0,1


In [23]:
data.shape

(9917, 6)

In [10]:
#I have imported the necessary libraries and packages, including Pandas for data handling, 
#Transformers for BERT integration, TensorFlow and Keras for deep learning, and other utilities.
import pandas as pd
from transformers import TFBertModel, BertTokenizer
import os
import re
import random
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt

In [11]:
# Set a seed value for reproducibility
seed_value = 29
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [12]:
# Check for GPU availability
#device_name = tf.test.gpu_device_name()
#if device_name != '/device:GPU:0':
#    raise SystemError('GPU device not found')
#print('Found GPU at: {}'.format(device_name))

SystemError: GPU device not found

In [13]:
# The number of personality axes and the max sequance lenght was set for the BERT tokenization. 
N_AXIS = 5  # You have 5 axes: EXT, NEU, AGR, CON, and OPN
MAX_SEQ_LEN = 128
BERT_NAME = 'bert-base-uncased'#base is a smaller and hence a computationally efficient version of BERT 
# compared to 'large' variants; 'uncased'means that the model is trained on text where all the words are in lowercase. 

In [18]:
axes = ["EXT", "NEU", "AGR", "CON", "OPN"]

In [19]:
# Define the text preprocessing function for eliminating square brackets, links, numbers, and emojis etc
def text_preprocessing(text):
    text = text.lower()#converts all characters in the input text to lowercase
    text = re.sub('\[.*?\]', '', text)#remove text enclosed in square brackets
    text = re.sub('https?://\S+|www\.\S+', '', text)#removes websites 
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text.encode('ascii', 'ignore').decode('ascii')
    if text.startswith("'"):
        text = text[1:-1]
    return text

In [20]:
data = data.sample(frac=1)  # Shuffle the data;frac=1 means that you want to sample (shuffle) the entire DataFrame, 
#which is equivalent to shuffling all the row
#By shuffling the data, you randomize the order in which the samples are presented during training, which can help 
#the model generalize better and avoid learning patterns based on the input order

In [21]:
# Extract the "STATUS" column and personality traits
sentences = data["STATUS"]
personality_traits = ["EXT", "NEU", "AGR", "CON", "OPN"]
labels = data[personality_traits]

In [24]:
# Define the ratios for train, validation, and test splits (adjust these values as needed)
train_ratio = 0.70  # 70% for training
val_ratio = 0.15   # 15% for validation
test_ratio = 0.15  # 15% for testing

In [25]:
# calculate the number of samples that will be allocated for training, validation, and testing when 
# splitting your dataset into these three subsets
total_samples = len(data)
train_samples = int(train_ratio * total_samples)
val_samples = int(val_ratio * total_samples)
test_samples = int(test_ratio * total_samples)

In [26]:
# Split your data into train, validation, and test sets
train_sentences = sentences[:train_samples]
y_train = labels[:train_samples]
val_sentences = sentences[train_samples:train_samples + val_samples]
y_val = labels[train_samples:train_samples + val_samples]
test_sentences = sentences[train_samples + val_samples:train_samples + val_samples + test_samples]
y_test = labels[train_samples + val_samples:train_samples + val_samples + test_samples]

In [27]:
# Define a function to prepare BERT input
def prepare_bert_input(sentences, seq_len, bert_name):
    tokenizer = BertTokenizer.from_pretrained(bert_name)#The tokenizer is responsible for converting text data into a format that BERT can understand.
    encodings = tokenizer(sentences.tolist(), truncation=True, padding='max_length',
                          max_length=seq_len)
    input = [np.array(encodings["input_ids"]), np.array(encodings["token_type_ids"]),
             np.array(encodings["attention_mask"])]
    return input

In [28]:
#preparing the input data for your BERT-based neural network model
X_train = prepare_bert_input(train_sentences, MAX_SEQ_LEN, BERT_NAME)
X_val = prepare_bert_input(val_sentences, MAX_SEQ_LEN, BERT_NAME)
X_test = prepare_bert_input(test_sentences, MAX_SEQ_LEN, BERT_NAME)

In [29]:
# Define the model architecture
input_ids = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='input_ids')
input_type = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='token_type_ids')
input_mask = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='attention_mask')
inputs = [input_ids, input_type, input_mask]
bert = TFBertModel.from_pretrained(BERT_NAME)
bert_outputs = bert(inputs)
last_hidden_states = bert_outputs.last_hidden_state
avg = layers.GlobalAveragePooling1D()(last_hidden_states)
output = layers.Dense(N_AXIS, activation="sigmoid")(avg)
model = keras.Model(inputs=inputs, outputs=output)
model.summary()

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceCl

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'token_type_ids[0][0]',     

In [31]:
# Load the BERT model and set its layers as non-trainable
bert = TFBertModel.from_pretrained(BERT_NAME)
bert.trainable = False  # Set BERT layers as non-trainable

# Define the rest of the model architecture
bert_outputs = bert(inputs)
last_hidden_states = bert_outputs.last_hidden_state
avg = layers.GlobalAveragePooling1D()(last_hidden_states)
output = layers.Dense(N_AXIS, activation="sigmoid")(avg)
model = keras.Model(inputs=inputs, outputs=output)
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'token_type_ids[0][0]',   

In [None]:
#Layers: provides a list of all the layers in the model, including input layers(3nos)
#BERT layers:responsible for processing the input text data and extracting contextual embeddings. It has a large number of parameters (109,482,240), 
#which are the weights learned during pre-training
#Glabal average output layer:The output shape is (None, 768), which means it reduces the sequence length dimension to 768 while retaining all samples in the batch
#Dense output layer: It takes the 768-dimensional output from the global average pooling layer and produces a 5-dimensional output
#non trainable belong to the pre-trained BERT model, and their values are fixed

In [33]:
# End-to-end fine-tuning
# Define optimizer, loss, and compile the model
max_epochs = 7
batch_size = 32 #Smaller batch sizes, like 32, can be more computationally efficient because they require less memory (16,32, 64, etc are other batch)
opt = tf.keras.optimizers.Adam(learning_rate=3e-5)#Adam optimizer is commonly used for training deep learning models
loss = tf.keras.losses.BinaryCrossentropy()#loss function used to compute the model's error during training
best_weights_file = "weights.h5"
auc = tf.keras.metrics.AUC(multi_label=True, curve="ROC")
m_ckpt = ModelCheckpoint(best_weights_file, monitor='val_'+auc.name, mode='max', verbose=2,
                          save_weights_only=True, save_best_only=True)
model.compile(loss=loss, optimizer=opt, metrics=[auc, tf.keras.metrics.BinaryAccuracy()])

# Training code remains the same
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=max_epochs,
    batch_size=batch_size,
    callbacks=[m_ckpt],
    verbose=2
)

Epoch 1/7

Epoch 1: val_auc_2 improved from -inf to 0.48536, saving model to weights.h5
217/217 - 1365s - loss: 0.6891 - auc_2: 0.4957 - binary_accuracy: 0.5490 - val_loss: 0.6754 - val_auc_2: 0.4854 - val_binary_accuracy: 0.5847 - 1365s/epoch - 6s/step
Epoch 2/7

Epoch 2: val_auc_2 improved from 0.48536 to 0.48865, saving model to weights.h5
217/217 - 1379s - loss: 0.6626 - auc_2: 0.4979 - binary_accuracy: 0.6007 - val_loss: 0.6672 - val_auc_2: 0.4886 - val_binary_accuracy: 0.5841 - 1379s/epoch - 6s/step
Epoch 3/7

Epoch 3: val_auc_2 improved from 0.48865 to 0.48908, saving model to weights.h5
217/217 - 1395s - loss: 0.6609 - auc_2: 0.4980 - binary_accuracy: 0.6011 - val_loss: 0.6673 - val_auc_2: 0.4891 - val_binary_accuracy: 0.5861 - 1395s/epoch - 6s/step
Epoch 4/7

Epoch 4: val_auc_2 improved from 0.48908 to 0.49004, saving model to weights.h5
217/217 - 1319s - loss: 0.6606 - auc_2: 0.4998 - binary_accuracy: 0.6014 - val_loss: 0.6663 - val_auc_2: 0.4900 - val_binary_accuracy: 0.5861

<keras.callbacks.History at 0x20c29b65a30>

In [38]:
# Evaluate the model on the test set
model.load_weights(best_weights_file)
model.compile(loss=loss, optimizer=opt, metrics=[tf.keras.metrics.AUC(multi_label=True, curve="ROC"),
                                                  tf.keras.metrics.BinaryAccuracy()])
predictions = model.predict(X_test)
model.evaluate(X_test, y_test, batch_size=32)



[0.659368634223938, 0.49841880798339844, 0.5978479981422424]

In [37]:
# As a final step, you can use the model to predict personality traits for new sentences
# Replace the example sentence with your own input if needed
s1 = "The food was not only very tasty and filling, but the portions were impressively large. Everything about our dining experience was exceptional, and the pictures speak for themselves. We have every intention of returning, especially given the reasonable price of €43 for three main courses, two beers, and a delightful dessert on the house."

sentences = np.asarray([s1])
enc_sentences = prepare_bert_input(sentences, MAX_SEQ_LEN, BERT_NAME)
predictions = model.predict(enc_sentences)

# Decode the predictions (0 or 1) into personality traits
pred_traits = [axes[i] for i in range(len(axes)) if predictions[0][i] >= 0.5]
print("Predicted Personality Traits:", pred_traits)

Predicted Personality Traits: ['AGR', 'OPN']
