# Big 5 personality assessment

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, precision_score, multilabel_confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from keras import models, layers, optimizers, losses, callbacks
nltk.download('stopwords')
import gensim
from keras.models import load_model
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Replace path with current path of embeddings
gm = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/GoogleNews-vectors-negative300.bin', binary=True)

gm.save_word2vec_format('w2vg.txt', binary=False)

In [5]:
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/AyishaR0/essays.csv', encoding='cp1252', error_bad_lines=False, names = ['#AUTHID','TEXT','cEXT','cNEU','cAGR','cCON','cOPN'], skiprows=[0])

In [6]:
targets = ['cEXT','cNEU','cAGR','cCON','cOPN']
for c in targets:
    df[c] = (df[c] == 'y').astype('int64')

df = df.drop(columns = ['#AUTHID'])

df

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,I can't believe it! It's really happening! M...,1,0,1,1,0
4,"Well, here I go with the good old stream of co...",1,0,1,0,1
...,...,...,...,...,...,...
2462,I'm home. wanted to go to bed but remembe...,0,1,0,1,0
2463,Stream of consiousnesssskdj. How do you s...,1,1,0,0,1
2464,"It is Wednesday, December 8th and a lot has be...",0,0,1,0,0
2465,"Man this week has been hellish. Anyways, now i...",0,1,0,0,1


In [None]:
for c in targets:
    print(c, '-', list(df[c]).count(1))
    print()

In [8]:
# Remove html tags
def removeHTML(sentence):
    regex = re.compile('<.*?>')
    return re.sub(regex, ' ', sentence)

# Remove URLs
def removeURL(sentence):
    regex = re.compile('http[s]?://\S+')
    return re.sub(regex, ' ', sentence)

# remove numbers, punctuation and any special characters (keep only alphabets)
def onlyAlphabets(sentence):
    regex = re.compile('[^a-zA-Z]')
    return re.sub(regex, ' ', sentence)

stop = nltk.corpus.stopwords.words('english')

In [9]:
sno = nltk.stem.SnowballStemmer('english')    # Initializing stemmer
all_sentences = []    # All cleaned sentences


for x in range(len(df['TEXT'].values)):
    question = df['TEXT'].values[x]
    classname = df[targets].values[x]

    cleaned_sentence = []
    sentence = removeURL(question) 
    sentence = removeHTML(sentence)
    sentence = onlyAlphabets(sentence)
    sentence = sentence.lower()   

    for word in sentence.split():
        if word not in stop:
            stemmed = sno.stem(word)
            cleaned_sentence.append(stemmed)
    all_sentences.append(' '.join(cleaned_sentence))

# add as column in dataframe
df['Cleaned'] = all_sentences

In [None]:
df.isna().sum()

In [11]:
X = df['Cleaned']
y = df[targets]

split = int(0.9*len(df))
 
Xtrain, Xtest = X[:split], X[split:]
ytrain, ytest = y[:split], y[split:]
 
print("Train set - ", Xtrain.shape[0])
print("Test set - ", Xtest.shape[0])

Train set -  2220
Test set -  247


In [12]:
# Tokenization
vocab = 50000
 
tokenizer = Tokenizer(num_words = vocab, oov_token = '<UNK>')
tokenizer.fit_on_texts(Xtrain)
word_index = tokenizer.word_index
 
# Padding
mlen = 1000
padding_type = 'post'
trunc_type = 'post'
 
Xtrain = tokenizer.texts_to_sequences(Xtrain)
Xtrain = pad_sequences(Xtrain, maxlen=mlen, padding=padding_type, truncating=trunc_type)

Xtest = tokenizer.texts_to_sequences(Xtest)
Xtest = pad_sequences(Xtest, maxlen=mlen, padding=padding_type, truncating=trunc_type)

In [13]:
# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
EMB = '/content/w2vg.txt'
EMBEDDING_DIM = 300

embeddings_index = {}
f = open(EMB, 'r')
for line in f:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:], dtype='float32')
  embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' %len(embeddings_index))

Found 3000000 word vectors.


In [15]:
embedding_matrix = np.zeros((vocab, EMBEDDING_DIM))
for word, i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [16]:
# Build and train neural network
 
model = models.Sequential([
    layers.Embedding(vocab, EMBEDDING_DIM, weights=[embedding_matrix], input_length = mlen, trainable = False),
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation = 'relu'),
    layers.Dense(64, activation = 'relu'),
    layers.Dense(32, activation = 'relu'),
    layers.Dense(len(targets), activation = 'sigmoid')
])

#cb = [callbacks.EarlyStopping(patience = 5, restore_best_weights = True)]
cb = [callbacks.ModelCheckpoint('big5g.h5', save_best_only=True, monitor='val_accuracy')]

In [None]:
model.compile(optimizer = optimizers.Adam(0.001), loss = losses.BinaryCrossentropy(), metrics = ['accuracy'])
history = model.fit(Xtrain, ytrain, validation_data = (Xtest, ytest), epochs = 256, callbacks=cb)

In [18]:
model = load_model('big5g.h5')

In [None]:
ypred = model.predict(Xtest)
ypred = (ypred>0.5).astype('int')

cm = multilabel_confusion_matrix(ytest, ypred)

plt.figure(figsize=(40,40))

for k in range(cm.shape[0]):
    cmi = cm[k].astype('float') / cm[k].sum(axis=1)[:, np.newaxis]

    ax = plt.subplot(len(targets), 1, k+1)

    for i in range(cmi.shape[1]):
        for j in range(cmi.shape[0]):
            plt.text(j, i, format(cmi[i, j], '.2f'), horizontalalignment="center", color="black")

    plt.title(targets[k])
    plt.imshow(cmi, cmap=plt.cm.Blues)


In [20]:
scores = # predict using model

big5_targets = [['outgoing/energetic', 'solitary/reserved', 'Extraversion'],
                ['sensitive/nervous', 'resilient/confident', 'Neuroticism'],
                ['friendly/compassionate', 'challenging/callous', 'Agreeableness'],
                ['efficient/organized', 'extravagant/careless', 'Conscientiousness'],
                ['inventive/curious', 'consistent/cautious', 'Openness to experience']]

for i in range(len(scores)):
    print(big5_targets[i][2], ':', big5_targets[i][int(scores[i] > 0.5)])

SyntaxError: ignored