In [2]:
import pandas as pd
import string
import re
import nltk
import torch
from transformers import BertModel, BertTokenizer
import sklearn
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('data/personality.csv')

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Join the tokens back into a single string
    processed_text = ' '.join(tokens)

    return processed_text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dimit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dimit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
data['processed_post'] = data['post_extrovert'].apply(preprocess_text)
data

Unnamed: 0,auhtor_ID,post_extrovert,extrovert,post_feeling,feeling,post_judging,judging,post_sensing,sensing,processed_post
0,t2_12bhu7,I wear a Lorna shore shirt out alot in public ...,1.0,I wear a Lorna shore shirt out alot in public ...,1.0,I wear a Lorna shore shirt out alot in public ...,0.0,I wear a Lorna shore shirt out alot in public ...,0.0,wear lorna shore shirt alot public lewd long s...
1,t2_12jbpd,I'd say this is a very accurate characterizati...,1.0,I'd say this is a very accurate characterizati...,0.0,I'd say this is a very accurate characterizati...,0.0,I'd say this is a very accurate characterizati...,0.0,id say accurate characterization ni users read...
2,t2_12uwr5,Ya know like most people with home decorations...,0.0,Ya know like most people with home decorations...,0.0,Ya know like most people with home decorations...,1.0,Ya know like most people with home decorations...,0.0,ya know like people home decorations could sav...
3,t2_12zm15,It's true tho. They're kinda more interesting ...,0.0,It's true tho. They're kinda more interesting ...,1.0,It's true tho. They're kinda more interesting ...,0.0,It's true tho. They're kinda more interesting ...,0.0,true tho theyre kinda interesting buuuut issue...
4,t2_13cjjl,"Yeah, but that's one of the things that make m...",0.0,"Yeah, but that's one of the things that make m...",0.0,"Yeah, but that's one of the things that make m...",0.0,"Yeah, but that's one of the things that make m...",1.0,yeah thats one things make better objectively ...
...,...,...,...,...,...,...,...,...,...,...
150,t2_vfp8y,so change profession then. this would be inadm...,0.0,so change profession then. this would be inadm...,0.0,so change profession then. this would be inadm...,1.0,so change profession then. this would be inadm...,0.0,change profession would inadmissible country p...
151,t2_w0842,The technological singularity. And the possibi...,0.0,The technological singularity. And the possibi...,0.0,The technological singularity. And the possibi...,1.0,The technological singularity. And the possibi...,0.0,technological singularity possibility contribu...
152,t2_w6rgl,Dear God man. Chill. I'm not Einstein or Hawki...,0.0,Dear God man. Chill. I'm not Einstein or Hawki...,0.0,Dear God man. Chill. I'm not Einstein or Hawki...,1.0,Dear God man. Chill. I'm not Einstein or Hawki...,0.0,dear god man chill im einstein hawking serious...
153,t2_wilcwvo,That's what a fake lib would say [Human blood ...,1.0,That's what a fake lib would say [Human blood ...,0.0,That's what a fake lib would say [Human blood ...,0.0,That's what a fake lib would say [Human blood ...,0.0,thats fake lib would say human blood water url...


In [6]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to obtain BERT embeddings for a text
def get_bert_embeddings(text):
    # Tokenize input text and convert to tensor
    tokens = tokenizer.encode(text, add_special_tokens=True, return_tensors='pt', max_length=512, truncation=True)

    # Get BERT embeddings
    with torch.no_grad():
        outputs = bert_model(tokens)
        embeddings = outputs.last_hidden_state

    # Average the embeddings across tokens (you can modify this based on your needs)
    avg_embedding = torch.mean(embeddings, dim=1).squeeze().numpy()

    return avg_embedding

# # Example DataFrame with a 'processed_post' column
# data = {'processed_post': ["enjoy hiking spending time nature",
#                             "text data preprocessing crucial nlp tasks",
#                             "stop words removal improves text analysis"]}
# df = pd.DataFrame(data)

# # Apply BERT embeddings to the 'processed_post' column
# df['bert_embeddings'] = df['processed_post'].apply(get_bert_embeddings)

# # Display the DataFrame with processed text and BERT embeddings
# print(df)

data['bert_embeddings'] = data['post_extrovert'].apply(get_bert_embeddings)
print(data)


      auhtor_ID                                     post_extrovert  extrovert  \
0     t2_12bhu7  I wear a Lorna shore shirt out alot in public ...        1.0   
1     t2_12jbpd  I'd say this is a very accurate characterizati...        1.0   
2     t2_12uwr5  Ya know like most people with home decorations...        0.0   
3     t2_12zm15  It's true tho. They're kinda more interesting ...        0.0   
4     t2_13cjjl  Yeah, but that's one of the things that make m...        0.0   
..          ...                                                ...        ...   
150    t2_vfp8y  so change profession then. this would be inadm...        0.0   
151    t2_w0842  The technological singularity. And the possibi...        0.0   
152    t2_w6rgl  Dear God man. Chill. I'm not Einstein or Hawki...        0.0   
153  t2_wilcwvo  That's what a fake lib would say [Human blood ...        1.0   
154   t2_zq7gkv  My biggest problem is asking for it. I don’t n...        1.0   

                           

In [7]:
data

Unnamed: 0,auhtor_ID,post_extrovert,extrovert,post_feeling,feeling,post_judging,judging,post_sensing,sensing,processed_post,bert_embeddings
0,t2_12bhu7,I wear a Lorna shore shirt out alot in public ...,1.0,I wear a Lorna shore shirt out alot in public ...,1.0,I wear a Lorna shore shirt out alot in public ...,0.0,I wear a Lorna shore shirt out alot in public ...,0.0,wear lorna shore shirt alot public lewd long s...,"[0.037400726, 0.03744322, 0.40402812, -0.15458..."
1,t2_12jbpd,I'd say this is a very accurate characterizati...,1.0,I'd say this is a very accurate characterizati...,0.0,I'd say this is a very accurate characterizati...,0.0,I'd say this is a very accurate characterizati...,0.0,id say accurate characterization ni users read...,"[-0.12263443, 0.06978707, 0.23516819, -0.16746..."
2,t2_12uwr5,Ya know like most people with home decorations...,0.0,Ya know like most people with home decorations...,0.0,Ya know like most people with home decorations...,1.0,Ya know like most people with home decorations...,0.0,ya know like people home decorations could sav...,"[0.1035808, -0.07981775, 0.48626775, 0.0068327..."
3,t2_12zm15,It's true tho. They're kinda more interesting ...,0.0,It's true tho. They're kinda more interesting ...,1.0,It's true tho. They're kinda more interesting ...,0.0,It's true tho. They're kinda more interesting ...,0.0,true tho theyre kinda interesting buuuut issue...,"[-0.11131683, 0.070212886, 0.51686287, 0.01720..."
4,t2_13cjjl,"Yeah, but that's one of the things that make m...",0.0,"Yeah, but that's one of the things that make m...",0.0,"Yeah, but that's one of the things that make m...",0.0,"Yeah, but that's one of the things that make m...",1.0,yeah thats one things make better objectively ...,"[0.21926501, 0.11031015, 0.2861948, 0.10730274..."
...,...,...,...,...,...,...,...,...,...,...,...
150,t2_vfp8y,so change profession then. this would be inadm...,0.0,so change profession then. this would be inadm...,0.0,so change profession then. this would be inadm...,1.0,so change profession then. this would be inadm...,0.0,change profession would inadmissible country p...,"[-0.17524561, 0.18416233, 0.44777521, -0.11897..."
151,t2_w0842,The technological singularity. And the possibi...,0.0,The technological singularity. And the possibi...,0.0,The technological singularity. And the possibi...,1.0,The technological singularity. And the possibi...,0.0,technological singularity possibility contribu...,"[-0.025105802, -0.08084059, 0.3463778, -0.0189..."
152,t2_w6rgl,Dear God man. Chill. I'm not Einstein or Hawki...,0.0,Dear God man. Chill. I'm not Einstein or Hawki...,0.0,Dear God man. Chill. I'm not Einstein or Hawki...,1.0,Dear God man. Chill. I'm not Einstein or Hawki...,0.0,dear god man chill im einstein hawking serious...,"[0.088415354, 0.22571889, 0.38455784, -0.03194..."
153,t2_wilcwvo,That's what a fake lib would say [Human blood ...,1.0,That's what a fake lib would say [Human blood ...,0.0,That's what a fake lib would say [Human blood ...,0.0,That's what a fake lib would say [Human blood ...,0.0,thats fake lib would say human blood water url...,"[0.059574068, 0.1017659, 0.38396204, -0.074290..."


In [8]:
X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['extrovert'],test_size=0.2, random_state=42, stratify=data['extrovert'])

In [9]:
X_train_flattened = np.array([embedding.flatten() for embedding in X_train])


In [10]:
X_test_flattened = np.array([embedding.flatten() for embedding in X_test])

In [11]:
lr = LogisticRegression(n_jobs=-1)
lr.fit(X_train_flattened, y_train)


In [12]:
from sklearn.metrics import accuracy_score
print(accuracy_score(lr.predict(X_train_flattened), y_train),
accuracy_score(lr.predict(X_test_flattened), y_test))
# get classification report
from sklearn.metrics import classification_report
print(classification_report(lr.predict(X_test_flattened), y_test))

0.9032258064516129 0.7096774193548387
              precision    recall  f1-score   support

         0.0       0.91      0.75      0.82        28
         1.0       0.12      0.33      0.18         3

    accuracy                           0.71        31
   macro avg       0.52      0.54      0.50        31
weighted avg       0.84      0.71      0.76        31



In [13]:
X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['feeling'],test_size=0.2, random_state=42, stratify=data['feeling'])
X_train_flattened = np.array([embedding.flatten() for embedding in X_train])
X_test_flattened = np.array([embedding.flatten() for embedding in X_test])

lr = LogisticRegression(n_jobs=-1)
lr.fit(X_train_flattened, y_train)

In [14]:
from sklearn.metrics import accuracy_score
print(accuracy_score(lr.predict(X_train_flattened), y_train),
accuracy_score(lr.predict(X_test_flattened), y_test))
# get classification report
from sklearn.metrics import classification_report
print(classification_report(lr.predict(X_test_flattened), y_test))

0.9516129032258065 0.7419354838709677
              precision    recall  f1-score   support

         0.0       0.91      0.77      0.83        26
         1.0       0.33      0.60      0.43         5

    accuracy                           0.74        31
   macro avg       0.62      0.68      0.63        31
weighted avg       0.82      0.74      0.77        31



In [15]:
X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['judging'],test_size=0.2, random_state=42, stratify=data['judging'])
X_train_flattened = np.array([embedding.flatten() for embedding in X_train])
X_test_flattened = np.array([embedding.flatten() for embedding in X_test])

lr = LogisticRegression(n_jobs=-1)
lr.fit(X_train_flattened, y_train)

In [16]:
from sklearn.metrics import accuracy_score
print(accuracy_score(lr.predict(X_train_flattened), y_train),
accuracy_score(lr.predict(X_test_flattened), y_test))
# get classification report
from sklearn.metrics import classification_report
print(classification_report(lr.predict(X_test_flattened), y_test))

0.9758064516129032 0.5483870967741935
              precision    recall  f1-score   support

         0.0       0.58      0.44      0.50        16
         1.0       0.53      0.67      0.59        15

    accuracy                           0.55        31
   macro avg       0.55      0.55      0.54        31
weighted avg       0.56      0.55      0.54        31



In [17]:
X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['sensing'],test_size=0.2, random_state=42, stratify=data['sensing'])
X_train_flattened = np.array([embedding.flatten() for embedding in X_train])
X_test_flattened = np.array([embedding.flatten() for embedding in X_test])

lr = LogisticRegression(n_jobs=-1)
lr.fit(X_train_flattened, y_train)

In [18]:
from sklearn.metrics import accuracy_score
print(accuracy_score(lr.predict(X_train_flattened), y_train),
accuracy_score(lr.predict(X_test_flattened), y_test))
# get classification report
from sklearn.metrics import classification_report
print(classification_report(lr.predict(X_test_flattened), y_test))

0.9112903225806451 0.8387096774193549
              precision    recall  f1-score   support

         0.0       0.96      0.87      0.91        30
         1.0       0.00      0.00      0.00         1

    accuracy                           0.84        31
   macro avg       0.48      0.43      0.46        31
weighted avg       0.93      0.84      0.88        31



In [84]:
# use grid search to find optimal parameters for logistic regression model for all 4 personality traits

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['extrovert'],test_size=0.2, random_state=42)
X_train_flattened = np.array([embedding.flatten() for embedding in X_train])
X_test_flattened = np.array([embedding.flatten() for embedding in X_test])


# define model
model = LogisticRegression()

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l2', 'elasticnet']
space['C'] = [100, 10, 1.0, 0.1, 0.01]

# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)

# execute search
result = search.fit(X_train_flattened, y_train)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# calculate classification report on test set
from sklearn.metrics import classification_report
print(classification_report(y_test, result.predict(X_test_flattened)))

# save the predictions of the test set in dataframe with column name 'extrovert_pred'
y_pred = result.predict(X_test_flattened)
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.columns = ['extrovert_pred']


Best Score: 0.7420000000000001
Best Hyperparameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
              precision    recall  f1-score   support

           0       0.81      1.00      0.89        25
           1       0.00      0.00      0.00         6

    accuracy                           0.81        31
   macro avg       0.40      0.50      0.45        31
weighted avg       0.65      0.81      0.72        31



225 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^

In [85]:
y_pred_df

Unnamed: 0,extrovert_pred
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [86]:
# use grid search to find optimal parameters for logistic regression model for all 4 personality traits

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['feeling'],test_size=0.2, random_state=42)
X_train_flattened = np.array([embedding.flatten() for embedding in X_train])
X_test_flattened = np.array([embedding.flatten() for embedding in X_test])


# define model
model = LogisticRegression()

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l2', 'elasticnet']
space['C'] = [100, 10, 1.0, 0.1, 0.01]

# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)

# execute search
result = search.fit(X_train_flattened, y_train)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# calculate classification report on test set
from sklearn.metrics import classification_report
print(classification_report(y_test, result.predict(X_test_flattened)))

# add predictions from the test to y_pred_df
y_pred_df['feeling_pred'] = result.predict(X_test_flattened)


225 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^

Best Score: 0.7289999999999999
Best Hyperparameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        23
           1       1.00      0.12      0.22         8

    accuracy                           0.77        31
   macro avg       0.88      0.56      0.55        31
weighted avg       0.83      0.77      0.70        31



In [87]:
y_pred_df

Unnamed: 0,extrovert_pred,feeling_pred
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [88]:
# use grid search to find optimal parameters for logistic regression model for all 4 personality traits

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['judging'],test_size=0.2, random_state=42)
X_train_flattened = np.array([embedding.flatten() for embedding in X_train])
X_test_flattened = np.array([embedding.flatten() for embedding in X_test])

print(X_train)

# define model
model = LogisticRegression()

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l2', 'elasticnet']
space['C'] = [100, 10, 1.0, 0.1, 0.01]

# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)

# execute search
result = search.fit(X_train_flattened, y_train)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# calculate classification report on test set
from sklearn.metrics import classification_report
print(classification_report(y_test, result.predict(X_test_flattened)))

# add predictions from the test to y_pred_df
y_pred_df['judging_pred'] = result.predict(X_test_flattened)

96     [-0.10599921, -0.11457217, 0.28883377, -0.0273...
122    [-0.13501509, 0.14990366, 0.32966736, -0.04463...
82     [-0.102551624, 0.121994734, 0.27854675, 0.0558...
109    [-0.014521752, 0.12842986, 0.17662153, -0.0206...
65     [0.24698451, 0.18807735, 0.52520406, -0.126965...
                             ...                        
71     [-0.0008798577, 0.05306895, 0.48110557, 0.0522...
106    [-0.01522696, 0.08446585, 0.34010518, 0.011732...
14     [0.13721104, 0.15530828, 0.31018516, -0.110464...
92     [-0.12656039, 0.13525708, 0.33966416, -0.01012...
102    [0.099014856, 0.15666161, 0.33694896, 0.048100...
Name: bert_embeddings, Length: 124, dtype: object


Best Score: 0.637
Best Hyperparameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.48      1.00      0.65        15

    accuracy                           0.48        31
   macro avg       0.24      0.50      0.33        31
weighted avg       0.23      0.48      0.32        31



225 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^

In [89]:
# use grid search to find optimal parameters for logistic regression model for all 4 personality traits

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['sensing'],test_size=0.2, random_state=42)
X_train_flattened = np.array([embedding.flatten() for embedding in X_train])
X_test_flattened = np.array([embedding.flatten() for embedding in X_test])

print(X_train)

# define model
model = LogisticRegression()

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l2', 'elasticnet']
space['C'] = [100, 10, 1.0, 0.1, 0.01]

# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)

# execute search
result = search.fit(X_train_flattened, y_train)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# calculate classification report on test set
from sklearn.metrics import classification_report
print(classification_report(y_test, result.predict(X_test_flattened)))

# add predictions from the test to y_pred_df
y_pred_df['sensing_pred'] = result.predict(X_test_flattened)

96     [-0.10599921, -0.11457217, 0.28883377, -0.0273...
122    [-0.13501509, 0.14990366, 0.32966736, -0.04463...
82     [-0.102551624, 0.121994734, 0.27854675, 0.0558...
109    [-0.014521752, 0.12842986, 0.17662153, -0.0206...
65     [0.24698451, 0.18807735, 0.52520406, -0.126965...
                             ...                        
71     [-0.0008798577, 0.05306895, 0.48110557, 0.0522...
106    [-0.01522696, 0.08446585, 0.34010518, 0.011732...
14     [0.13721104, 0.15530828, 0.31018516, -0.110464...
92     [-0.12656039, 0.13525708, 0.33966416, -0.01012...
102    [0.099014856, 0.15666161, 0.33694896, 0.048100...
Name: bert_embeddings, Length: 124, dtype: object


Best Score: 0.8873333333333335
Best Hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        27
           1       0.00      0.00      0.00         4

    accuracy                           0.87        31
   macro avg       0.44      0.50      0.47        31
weighted avg       0.76      0.87      0.81        31



225 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^

In [90]:
y_pred_df

Unnamed: 0,extrovert_pred,feeling_pred,judging_pred,sensing_pred
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
5,0,0,1,0
6,0,0,1,0
7,0,0,1,0
8,0,0,1,0
9,0,0,1,0


In [91]:
# extrovert, feeling, judging, sensing transform to int
y_pred_df['extrovert_pred'] = y_pred_df['extrovert_pred'].astype(int)
y_pred_df['feeling_pred'] = y_pred_df['feeling_pred'].astype(int)
y_pred_df['judging_pred'] = y_pred_df['judging_pred'].astype(int)
y_pred_df['sensing_pred'] = y_pred_df['sensing_pred'].astype(int)

In [93]:
y_pred_df['personality'] = y_pred_df['extrovert_pred'].astype(str) + y_pred_df['feeling_pred'].astype(str) + y_pred_df['judging_pred'].astype(str) + y_pred_df['sensing_pred'].astype(str)

In [94]:
# extrovert, feeling, judging, sensing transform to int
data['extrovert'] = data['extrovert'].astype(int)
data['feeling'] = data['feeling'].astype(int)
data['judging'] = data['judging'].astype(int)
data['sensing'] = data['sensing'].astype(int)

In [95]:
data['personalty'] = data['extrovert'].astype(str) + data['feeling'].astype(str) + data['judging'].astype(str) + data['sensing'].astype(str)

In [99]:
y_test

81     0
142    1
31     0
29     0
118    0
60     0
93     0
147    0
153    0
68     0
42     0
138    0
78     0
75     0
15     1
19     0
30     0
90     0
117    0
137    0
18     0
12     0
9      0
24     0
69     0
131    1
95     0
45     0
86     0
84     1
126    0
Name: sensing, dtype: int32

In [101]:
X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['personalty'],test_size=0.2, random_state=42)
y_test

81     1000
142    0001
31     0110
29     0000
118    0110
60     0010
93     0010
147    0010
153    1000
68     0010
42     0010
138    0110
78     1000
75     0000
15     0001
19     0010
30     1000
90     1000
117    0110
137    0010
18     0100
12     1000
9      0010
24     0100
69     0010
131    0001
95     0000
45     0110
86     0010
84     0001
126    0100
Name: personalty, dtype: object

In [102]:
# calculate classification report on personality prediction
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_df['personality']))

              precision    recall  f1-score   support

        0000       0.00      0.00      0.00         3
        0001       0.00      0.00      0.00         4
        0010       0.33      1.00      0.50        10
        0100       0.00      0.00      0.00         3
        0110       0.00      0.00      0.00         5
        1000       0.00      0.00      0.00         6

    accuracy                           0.32        31
   macro avg       0.06      0.17      0.08        31
weighted avg       0.11      0.32      0.16        31



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [103]:
# use grid search to find optimal parameters for logistic regression model for all 4 personality traits

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(data['bert_embeddings'], data['personalty'],test_size=0.2, random_state=42)
X_train_flattened = np.array([embedding.flatten() for embedding in X_train])
X_test_flattened = np.array([embedding.flatten() for embedding in X_test])

print(X_train)

# define model
model = LogisticRegression()

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['l2', 'elasticnet']
space['C'] = [100, 10, 1.0, 0.1, 0.01]

# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)

# execute search
result = search.fit(X_train_flattened, y_train)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# calculate classification report on test set
from sklearn.metrics import classification_report
print(classification_report(y_test, result.predict(X_test_flattened)))


96     [-0.10599921, -0.11457217, 0.28883377, -0.0273...
122    [-0.13501509, 0.14990366, 0.32966736, -0.04463...
82     [-0.102551624, 0.121994734, 0.27854675, 0.0558...
109    [-0.014521752, 0.12842986, 0.17662153, -0.0206...
65     [0.24698451, 0.18807735, 0.52520406, -0.126965...
                             ...                        
71     [-0.0008798577, 0.05306895, 0.48110557, 0.0522...
106    [-0.01522696, 0.08446585, 0.34010518, 0.011732...
14     [0.13721104, 0.15530828, 0.31018516, -0.110464...
92     [-0.12656039, 0.13525708, 0.33966416, -0.01012...
102    [0.099014856, 0.15666161, 0.33694896, 0.048100...
Name: bert_embeddings, Length: 124, dtype: object


225 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dimit\anaconda3\envs\DC3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^

Best Score: 0.37888888888888894
Best Hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
              precision    recall  f1-score   support

        0000       0.00      0.00      0.00         3
        0001       0.00      0.00      0.00         4
        0010       0.31      0.80      0.44        10
        0100       0.00      0.00      0.00         3
        0110       0.00      0.00      0.00         5
        1000       0.00      0.00      0.00         6

    accuracy                           0.26        31
   macro avg       0.05      0.13      0.07        31
weighted avg       0.10      0.26      0.14        31



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
