# Getting Started

In [1]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import tensorflow as tf

In [2]:
data = pd.read_csv('mbti_1.csv')

In [3]:
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


# Preprocessing

In [5]:
data['type'].unique()

array(['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
       'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'],
      dtype=object)

## Classes 
> ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP','ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']

In [6]:
def preprocess_inputs(df):
    
    texts = df['posts'].copy()
    labels = df['type'].copy()
    
    # Process text data
    stop_words = stopwords.words('english')
    
    texts = [text.lower() for text in texts]
    texts = [text.split() for text in texts]
    texts = [[word.strip() for word in text] for text in texts]
    texts = [[word for word in text if word not in stop_words] for text in texts]
    
    vocab_length = 10000
    
    tokenizer = Tokenizer(num_words=vocab_length)
    tokenizer.fit_on_texts(texts)
    
    texts = tokenizer.texts_to_sequences(texts)
    
    max_seq_length = np.max([len(text) for text in texts])
    
    texts = pad_sequences(texts, maxlen=max_seq_length, padding='post')
    
    # Process label data
    label_values = [
        'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
       'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'
    ]
    
    label_mapping = {label: np.int(label[0] == 'E') for label in label_values}
    
    labels = labels.replace(label_mapping)
    labels = np.array(labels)
    
    return texts, labels, max_seq_length, vocab_length, label_mapping

In [7]:
texts, labels, max_seq_length, vocab_length, label_mapping = preprocess_inputs(data)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  label_mapping = {label: np.int(label[0] == 'E') for label in label_values}


In [19]:
print(label_mapping)

{'INFJ': 0, 'ENTP': 1, 'INTP': 0, 'INTJ': 0, 'ENTJ': 1, 'ENFJ': 1, 'INFP': 0, 'ENFP': 1, 'ISFP': 0, 'ISTP': 0, 'ISFJ': 0, 'ISTJ': 0, 'ESTP': 1, 'ESFP': 1, 'ESTJ': 1, 'ESFJ': 1}


In [8]:
print("Text sequences:\n", texts.shape)
print("\nLabels:\n", labels.shape)
print("\nMax sequence length:\n", max_seq_length)
print("\nVocab length:\n", vocab_length)
print("\nLabel mapping:\n", label_mapping)

Text sequences:
 (8675, 859)

Labels:
 (8675,)

Max sequence length:
 859

Vocab length:
 10000

Label mapping:
 {'INFJ': 0, 'ENTP': 1, 'INTP': 0, 'INTJ': 0, 'ENTJ': 1, 'ENFJ': 1, 'INFP': 0, 'ENFP': 1, 'ISFP': 0, 'ISTP': 0, 'ISFJ': 0, 'ISTJ': 0, 'ESTP': 1, 'ESFP': 1, 'ESTJ': 1, 'ESFJ': 1}


In [9]:
texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, train_size=0.7, random_state=123)

In [10]:
texts

array([[  91, 1537,  587, ...,    0,    0,    0],
       [2394,  583,  429, ...,    0,    0,    0],
       [   9, 1085,   18, ...,    0,    0,    0],
       ...,
       [7055,   46,  415, ...,    0,    0,    0],
       [ 518, 9956,   54, ...,    0,    0,    0],
       [6735,   82,   80, ...,    0,    0,    0]], dtype=int32)

# Training

In [None]:
embedding_dim = 512

inputs = tf.keras.Input(shape=(max_seq_length,))

embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim,
    input_length=max_seq_length
)(inputs)

gru = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(
        units=256,
        return_sequences=True
    )
)(embedding)

flatten = tf.keras.layers.Flatten()(gru)

outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)


model = tf.keras.Model(inputs, outputs)


model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)


history = model.fit(
    texts_train,
    labels_train,
    validation_split=0.2,
    batch_size=32,
    epochs=5,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint('./model_personality.h5', save_best_only=True, save_weights_only=True)
    ]
)

# Results

In [17]:
model.load_weights('./model_personality.h5')

In [18]:
model.evaluate(texts_test, labels_test)



[0.4262876808643341, 0.8202074766159058, 0.8088368773460388]

In [62]:
def predict_personality(text):
    import random
    test_text = [text]
    tokenizer = Tokenizer(num_words=vocab_length)
    tokenizer.fit_on_texts(test_text)
    test_seq = tokenizer.texts_to_sequences(test_text)
    padded_test = pad_sequences(test_seq,maxlen=max_seq_length, padding='post')

    ar_test = np.array(padded_test)

    preds = model.predict(ar_test)

    
    THRESHOLD = 0.5
    val = -1
    if preds.item() < THRESHOLD:
        val = 0
    else:
        val = 1

    all_keys = list(label_mapping.keys())
    outputs = []
    for i in all_keys:
        if label_mapping[i] == val:
            outputs.append(i)

    output_string = str(outputs[random.randint(0,len(outputs)-1)])

    final_outs = {"I":"Introversion","N":"Intuition","T":"Thinking","J":"Jugding","E":"Extroversion","S":"Sensing","F":"Feeling","P":"Perceiving"}

    results = []
    for letter in output_string:
        print(final_outs[letter])
        results.append(final_outs[letter])
    return results,preds.item(),val


> Just write your text in that function

In [63]:
predict_personality("Hello I just want to die")

Introversion
Sensing
Feeling
Perceiving


(['Introversion', 'Sensing', 'Feeling', 'Perceiving'], 0.010547921061515808, 0)

```
Introversion (I) – Extroversion (E)
Intuition (N) – Sensing (S)
Thinking (T) – Feeling (F)
Judging (J) – Perceiving (P)
```

```
{'INFJ': 0, 'ENTP': 1, 'INTP': 0, 'INTJ': 0, 'ENTJ': 1, 'ENFJ': 1, 'INFP': 0, 'ENFP': 1, 'ISFP': 0, 'ISTP': 0, 'ISFJ': 0, 'ISTJ': 0, 'ESTP': 1, 'ESFP': 1, 'ESTJ': 1, 'ESFJ': 1}
```