In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_score
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Read the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Done_mbti.csv')

In [3]:
# Model input is the `posts` column; the prediction target is the `type` column.
X, y = df['posts'], df['type']

In [4]:
# Encode the string class labels as integer ids for the sparse cross-entropy loss.
# The encoder is fitted on the raw `df['type']` column rather than on `y`, so the
# cell is idempotent: re-running it never refits the encoder on already-encoded
# integers (which would silently replace `le.classes_` with ints and break the
# later `le.inverse_transform` back to the original labels).
le = LabelEncoder()
y = le.fit_transform(df['type'])

In [5]:
# Vocabulary cap handed to the tokenizer via `num_words`.
max_words = 1000

# Build the word index from every post, then map each post to a list of word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=X)
sequences = tokenizer.texts_to_sequences(texts=X)

In [6]:
# Length of the longest tokenised post; used as the common padded length.
maxlen = max(map(len, sequences))

In [7]:
# Pad every sequence to `maxlen` so the posts form a single rectangular array.
padded_sequences = pad_sequences(
    sequences,
    maxlen=maxlen,
)

In [8]:
# One stratified shuffle split holding out 20% of the rows as the test set;
# the fixed random_state makes the partition repeatable.
splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42,
)
train_indices, test_indices = next(splitter.split(padded_sequences, y))

# Slice features and labels with the same index arrays so they stay aligned.
X_train = padded_sequences[train_indices]
X_test = padded_sequences[test_indices]
y_train = y[train_indices]
y_test = y[test_indices]

In [9]:
# Embedding -> LSTM -> softmax classifier over the encoded labels.
model = Sequential()
# The embedding table size is tied to the tokenizer's `max_words` cap instead of a
# duplicated literal, so raising the vocabulary cannot produce out-of-range ids.
model.add(Embedding(max_words, 128, input_length=maxlen))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# One output unit per class known to the label encoder, rather than a hard-coded 16,
# so the head always matches the label set actually present in the data.
model.add(Dense(len(le.classes_), activation='softmax'))

In [10]:
# Integer class ids are used as targets, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [38]:
# Train for 10 epochs. Validation metrics come from a 10% slice carved out of the
# training data (`validation_split`) instead of from (X_test, y_test): the test set
# must stay unseen during training/monitoring so the final accuracy and F1 computed
# on it remain an honest held-out estimate.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2648dfe9bd0>

In [11]:
# Per-class scores for each test row, then the index of the highest score per row.
y_pred_prob = model.predict(X_test)
y_pred = y_pred_prob.argmax(axis=1)



In [12]:
# Map integer class ids back to the original `type` labels for reporting.
# The predicted ids are recomputed from `y_pred_prob` rather than read from `y_pred`,
# because this cell overwrites `y_pred` with string labels: decoding `y_pred` itself
# would fail on a second run of the cell. Reading from `y_pred_prob` keeps it idempotent.
y_true = le.inverse_transform(y_test)
y_pred = le.inverse_transform(np.argmax(y_pred_prob, axis=1))

In [None]:
# Score the decoded predictions against the decoded ground truth.
# The F1 score uses 'weighted' averaging across classes.
test_accuracy = accuracy_score(y_true, y_pred)
test_f1_weighted = f1_score(y_true, y_pred, average='weighted')

print("Accuracy score: ", test_accuracy)
print("F1 score: ", test_f1_weighted)