In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import random
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import os
from keras.models import load_model
import pickle

In [None]:
# Load the raw MBTI dataset from CSV into a pandas DataFrame
csv_path = 'mbti_1.csv'
original_data = pd.read_csv(csv_path)
original_data.head()

In [None]:
# Give the two columns the fixed names used by every later cell
column_names = ['type', 'posts']
original_data.columns = column_names
original_data.head()

In [None]:
# Split the four-letter type into one column per dichotomy so that the
# correlation between the individual axes can be analysed.
split_data = original_data[['type']].copy()

# The second positional argument of `str.extract` is the regex `flags` value,
# not a group index, so the stray `1` has been dropped. `expand=False` returns
# a Series for the single capture group. The character classes no longer
# contain a literal comma ("[N,S]" also matched ",").
split_data['E-I'] = original_data['type'].str.extract(r'(.)[NS]', expand=False)
split_data['N-S'] = original_data['type'].str.extract(r'[EI](.)[FT]', expand=False)
split_data['T-F'] = original_data['type'].str.extract(r'[NS](.)[JP]', expand=False)
split_data['J-P'] = original_data['type'].str.extract(r'[FT](.)', expand=False)

In [None]:
# Turn each dichotomy letter into an integer code; the target column names
# record the intended mapping (e.g. 'E0-I1').
le = LabelEncoder()

encoded_data = split_data[['type']].copy()

# encoded column name -> letter column it is derived from
axis_columns = {
    'E0-I1': 'E-I',
    'N0-S1': 'N-S',
    'F0-T1': 'T-F',
    'J0-P1': 'J-P',
}
for encoded_name, letter_column in axis_columns.items():
    # The encoder is re-fitted for every axis, so it only remembers the last one.
    encoded_data[encoded_name] = le.fit_transform(split_data[letter_column])

# Only the four numeric axis columns take part in the correlation matrix
corr_data = encoded_data.drop(columns='type')

correlation = corr_data.corr()
correlation.style.background_gradient()

In [None]:
# Number of rows available for each of the personality types
rows_by_type = original_data.groupby('type')
count_person_types = rows_by_type.agg({'type': 'count'})
count_person_types

In [None]:
# Bar chart of the number of samples per personality type
count_person_chart = count_person_types.plot(kind='bar')
count_person_chart.set_xlabel("Personality Types")
count_person_chart.set_ylabel("Number of Samples")

# tight_layout() has to run before show(): once the figure has been shown,
# a later call no longer affects the rendered chart.
plt.tight_layout()
plt.show()

In [None]:
# Count the samples on each side of every dichotomy.
# Each chart counts its own axis column so that all four legends are labelled
# consistently (the T-F and J-P charts previously counted 'type').
for axis_name in ['E-I', 'N-S', 'T-F', 'J-P']:
    split_data.groupby(axis_name).agg({axis_name: 'count'}).plot(kind='bar')

In [None]:
# Count links per post. The division by 50 presumably reflects 50 posts per
# row -- TODO confirm against the dataset.
original_data_copy = original_data.copy()
original_data_copy['http_per_post'] = original_data['posts'].apply(lambda x: x.count('http') / 50)
original_data_copy.head()

# Create a new field without the http strings.
# Raw string avoids the invalid "\s" escape. The match now stops at whitespace,
# at the "|||" separator, or at the end of the text: the previous pattern
# "(http.*?\s)" ran through "|||" into the following post and never matched a
# URL that ended the text.
p = r"http.*?(?=\s|\|\|\||$)"
original_data['no_url'] = original_data['posts'].replace(p, " ", regex=True)
original_data.head()

In [None]:
# Average number of question marks per post, measured on the URL-free text
question_marks = original_data['no_url'].map(lambda post_text: post_text.count('?') / 50)
original_data_copy['?_per_post'] = question_marks
original_data_copy.head()

In [None]:
# Average number of exclamation marks per post, measured on the URL-free text
exclamation_marks = original_data['no_url'].map(lambda post_text: post_text.count('!') / 50)
original_data_copy['!_per_post'] = exclamation_marks
original_data_copy.head()

In [None]:
# Create a column without the "|||" separator string.
# Raw string: "\|" is a regex escape, not a valid Python string escape.
p = r"(\|\|\|)"
original_data['text'] = original_data['no_url'].replace(p, " ", regex=True)
original_data.head()

In [None]:
# Average number of characters per post
post_lengths = original_data['text'].map(lambda post_text: len(post_text) / 50)
original_data_copy['length_per_post'] = post_lengths
original_data_copy.head()

In [None]:
# Average number of digit characters per post
digit_counts = original_data['text'].map(lambda post_text: sum(map(str.isdigit, post_text)) / 50)
original_data_copy['digits_per_post'] = digit_counts
original_data_copy.head()

In [None]:
# Average every per-post metric within each personality type.
# numeric_only=True skips the string 'posts' column; without it, pandas >= 2.0
# raises a TypeError when asked for the mean of text.
analysis_group = original_data_copy.groupby('type').mean(numeric_only=True)
analysis_group.head()

In [None]:
# One bar-chart panel per metric, stacked vertically with a shared x axis
plot_options = dict(
    kind='bar',
    subplots=True,
    title="Writing analysis (per post)",
    layout=(5, 1),
    sharex=True,
    sharey=False,
    legend=False,
    figsize=(8, 12),
    rot=0,
)
analysis_group.plot(**plot_options)

In [None]:
# Remove all punctuation: anything that is neither a word character nor
# whitespace becomes a space.
# Raw string: "\w" and "\s" are regex escapes, not valid Python string escapes.
p = r"[^\w\s]"
original_data['text'] = original_data['text'].replace(p, " ", regex=True)
original_data.head()

In [None]:
# Remove underscores. "_" needs no escaping in a regex, and "\_" is an
# invalid Python string escape.
p = "_"
original_data['text'] = original_data['text'].replace(p, " ", regex=True)
original_data.head()

In [None]:
# Remove all numbers: every run of digits becomes a single space.
# Raw string: "\d" is a regex escape, not a valid Python string escape.
p = r"\d+"
original_data['text'] = original_data['text'].replace(p, " ", regex=True)
original_data.head()

In [None]:
# Remove one-letter words (together with any non-word characters before them).
# The pattern MUST be a raw string: in a normal string literal "\b" is the
# backspace character, so the regex looked for literal backspaces instead of
# word boundaries and no one-letter word was ever removed.
p = r"\W*\b\w\b"
original_data['text'] = original_data['text'].replace(p, " ", regex=True)
original_data.head()

In [None]:
# Normalise the cleaned text to lowercase
lowercase_text = original_data['text'].str.lower()
original_data['text'] = lowercase_text
original_data.head()

In [None]:
# Keep only the label and the fully cleaned text
kept_columns = ['type', 'text']
cleaned_data = original_data[kept_columns]
cleaned_data.head()

In [None]:
# Keep the rows in which any column is exactly 'ESTJ'.
# A vectorized comparison replaces the row-by-row Python `apply`; it selects
# the same rows without calling a lambda once per row.
filtered_data = cleaned_data[cleaned_data.eq('ESTJ').any(axis=1)]

In [None]:
# Fetch the tokenizer data used by word_tokenize
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# One list of tokens per filtered row
word_lists = list(map(word_tokenize, filtered_data['text']))

# Collapse the per-row token lists into one flat list of words
words = []
for word_list in word_lists:
    words.extend(word_list)

# One row per token
word_data = pd.DataFrame({'word': words})

print(word_data)


In [None]:
unique_words = word_data['word'].unique()

print('Number of unique words: ', len(unique_words))

# Dedicated, seeded generator so the synthetic sentences are identical on every
# Restart & Run All (the module-level `random` functions were unseeded).
rng = random.Random(42)

# Draw a pool of 7000 words *with replacement* from the unique words, so the
# pool may be larger than the vocabulary and contains some repeated words.
random_words = rng.choices(unique_words, k=7000)

# Build 4000 synthetic sentences of 448 words each
num_sentences = 4000
words_per_sentence = 448

# Every sentence is drawn without replacement from the 7000-word pool
sentences = [
    ' '.join(rng.sample(random_words, words_per_sentence))
    for _ in range(num_sentences)
]

# Create a new dataframe with the generated sentences
new_data = {'sentence': sentences}
new_df = pd.DataFrame(new_data)

print(new_df)

In [None]:
# Add the 'type' column with the 'ESTJ' value at the beginning.
# Guarded so that re-running this cell does not fail with
# "cannot insert type, already exists".
if 'type' not in new_df.columns:
    new_df.insert(0, 'type', 'ESTJ')

# Rename the 'sentence' column to 'text' (a no-op if it was already renamed)
new_df = new_df.rename(columns={'sentence': 'text'})

print(new_df)

In [None]:
# Stack the cleaned rows and the synthetic rows, renumbering the index
frames_to_merge = [cleaned_data, new_df]
merged_df = pd.concat(frames_to_merge, ignore_index=True)

print(merged_df)

In [None]:
# Count personality type combinations in the merged (original + synthetic) data

split_data1 = merged_df[['type']].copy()

# The second positional argument of `str.extract` is the regex `flags` value,
# not a group index, so the stray `1` has been dropped. `expand=False` returns
# a Series for the single capture group. The character classes no longer
# contain a literal comma.
split_data1['E-I'] = merged_df['type'].str.extract(r'(.)[NS]', expand=False)
split_data1['N-S'] = merged_df['type'].str.extract(r'[EI](.)[FT]', expand=False)
split_data1['T-F'] = merged_df['type'].str.extract(r'[NS](.)[JP]', expand=False)
split_data1['J-P'] = merged_df['type'].str.extract(r'[FT](.)', expand=False)

le1 = LabelEncoder()

# Integer-coded copy of the four axes; the column names record the intended mapping
encoded_data1 = split_data1[['type']].copy()
encoded_data1['E0-I1'] = le1.fit_transform(split_data1['E-I'])
encoded_data1['N0-S1'] = le1.fit_transform(split_data1['N-S'])
encoded_data1['F0-T1'] = le1.fit_transform(split_data1['T-F'])
encoded_data1['J0-P1'] = le1.fit_transform(split_data1['J-P'])

# Each chart counts its own axis column so that all four legends are labelled
# consistently (the T-F and J-P charts previously counted 'type').
for axis_name in ['E-I', 'N-S', 'T-F', 'J-P']:
    split_data1.groupby(axis_name).agg({axis_name: 'count'}).plot(kind='bar')

In [None]:
# Generate word clouds for each personality type
def generate_word_cloud(posts, personality_type):
    """Render and show a word cloud for one personality type.

    Args:
        posts: A single string containing all the text to visualise.
        personality_type: Label inserted into the figure title.
    """
    cloud = WordCloud(width=800, height=400, background_color='white').generate(posts)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')  # the image has no meaningful axes
    ax.set_title(f'Word Cloud for {personality_type} Personality Type')
    plt.show()

# One group of rows per personality type
grouped_data = merged_df.groupby('type')

# Draw one word cloud per type from all of that type's text joined together
for personality_type, group in grouped_data:
    posts_combined = ' '.join(group['text'].tolist())
    generate_word_cloud(posts_combined, personality_type)

In [None]:
# Split the type column into four single-letter columns, one per dichotomy
split_data = merged_df[['type','text']].copy()

# The second positional argument of `str.extract` is the regex `flags` value,
# not a group index, so the stray `1` has been dropped. `expand=False` returns
# a Series for the single capture group. The character classes no longer
# contain a literal comma.
split_data['E-I'] = split_data['type'].str.extract(r'(.)[NS]', expand=False)
split_data['N-S'] = split_data['type'].str.extract(r'[EI](.)[FT]', expand=False)
split_data['T-F'] = split_data['type'].str.extract(r'[NS](.)[JP]', expand=False)
split_data['J-P'] = split_data['type'].str.extract(r'[FT](.)', expand=False)
split_data.head()

In [None]:
# Turn each dichotomy letter into an integer code; the target column names
# record the intended mapping (e.g. 'E0-I1').

le = LabelEncoder()

encoded_data = merged_df[['type','text']].copy()

# encoded column name -> letter column it is derived from
label_sources = {
    'E0-I1': 'E-I',
    'N0-S1': 'N-S',
    'F0-T1': 'T-F',
    'J0-P1': 'J-P',
}
for encoded_name, letter_column in label_sources.items():
    # The encoder is re-fitted for every axis, so it only remembers the last one.
    encoded_data[encoded_name] = le.fit_transform(split_data[letter_column])

encoded_data.head()

In [None]:
# Show the column names of the encoded frame (label, text and the four axis codes)
encoded_data.columns

In [None]:
from sklearn.model_selection import train_test_split

# Features: the cleaned text. Targets: every column except the label and the text.
X = encoded_data["text"].values
non_target_columns = ['type', 'text']
y_all = encoded_data.drop(columns=non_target_columns)

# Hold out a test set; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, y_all, random_state=42)
X_train, X_test, y_all_train, y_all_test = split_parts

In [None]:
# Define the TF-IDF vectorizer
tfidf_options = {
    "max_features": 17000,       # cap on the vocabulary size
    "min_df": 7,                 # drop terms found in fewer than 7 documents
    "max_df": 0.8,               # drop terms found in more than 80% of documents
    "stop_words": "english",
    "ngram_range": (1, 3),       # unigrams, bigrams and trigrams
}
vectorizer = TfidfVectorizer(**tfidf_options)

In [None]:
# Fit the vocabulary on the training text only, then vectorize both splits.
# X_train / X_test are rebound from raw text to TF-IDF matrices, so re-run the
# split cell before re-running this one.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
X_train, X_test = train_vectors, test_vectors

In [None]:
# Inputs from the previous cells: X_train / X_test are TF-IDF matrices,
# y_all_train / y_all_test hold one binary column per MBTI axis.

# Define the combinations
combinations = ['E0-I1', 'N0-S1', 'F0-T1', 'J0-P1']

# Densify once instead of calling `.toarray()` again for every model.
# Converting to float32 before densifying halves the size of the dense arrays.
X_train_dense = X_train.astype('float32').toarray()
X_test_dense = X_test.astype('float32').toarray()

# Create a dictionary to hold models
models = {}

# Train one binary classifier per axis
for combination in combinations:
    y_combination_train = y_all_train[combination]

    # TF-IDF rows are real-valued n-gram weights, not sequences of integer
    # token ids. The previous Embedding + LSTM stack treats every input value
    # as an index into the embedding table, so fractional weights cannot carry
    # information through it (and the "sequence" was one step per vocabulary
    # entry). A feed-forward network consumes the TF-IDF features directly and
    # accepts exactly the same input at prediction time.
    model = Sequential()
    model.add(Dense(128, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model, keeping 20% of the training rows for validation
    model.fit(X_train_dense, y_combination_train, epochs=3, batch_size=32, validation_split=0.2)

    # Store the model
    models[combination] = model

# Evaluate every model on the held-out test set
for combination, model in models.items():
    loss, accuracy = model.evaluate(X_test_dense, y_all_test[combination])
    print(f"Combination: {combination} - Test Loss: {loss} - Test Accuracy: {accuracy}")

In [None]:
# Create the "model" folder if it doesn't exist.
# exist_ok=True replaces the separate exists() check.
os.makedirs('model', exist_ok=True)

# Save each model in the "model" folder, one file per axis
for combination, model in models.items():
    model.save(f'model/{combination}.h5')

# Save the fitted vectorizer next to the models so the same vocabulary can be
# reused at prediction time
with open('model/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)