In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import spacy
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Load the MBTI dataset from CSV; later cells read its `type` and `posts` columns.
df = pd.read_csv('data/MBTI500.csv')

In [3]:
# Class balance: number of rows for each MBTI type.
print(df['type'].value_counts())

# Number of space-separated tokens in every post (separators + 1).
lengths = df['posts'].map(lambda post: post.count(' ') + 1)
lengths.describe()

type
INTP    24961
INTJ    22427
INFJ    14963
INFP    12134
ENTP    11725
ENFP     6167
ISTP     3424
ENTJ     2955
ESTP     1986
ENFJ     1534
ISTJ     1243
ISFP      875
ISFJ      650
ESTJ      482
ESFP      360
ESFJ      181
Name: count, dtype: int64


count    106067.000000
mean        500.019818
std           1.209997
min         479.000000
25%         500.000000
50%         500.000000
75%         500.000000
max         527.000000
Name: posts, dtype: float64

In [4]:
from sklearn.utils import resample

# Order in which the per-type samples are concatenated into `uniform_df`.
type_order = ['INTP', 'INTJ', 'INFJ', 'INFP', 'ENTP', 'ENFP', 'ISTP', 'ENTJ',
              'ESTP', 'ENFJ', 'ISTJ', 'ISFP', 'ISFJ', 'ESTJ', 'ESFP', 'ESFJ']

# Undersample every type down to the size of the rarest one instead of
# hard-coding that size.
samples_per_type = int(df.type.value_counts().min())

balanced_parts = []
for mbti_type in type_order:
    rows = df[df.type == mbti_type]
    # A type that is already at the target size is kept untouched (no shuffle);
    # larger types are sampled without replacement with a fixed seed.
    if len(rows) > samples_per_type:
        rows = resample(rows, replace=False, n_samples=samples_per_type, random_state=4)
    balanced_parts.append(rows)

uniform_df = pd.concat(balanced_parts)

In [5]:
# Sanity check: print the number of rows per MBTI type in the balanced frame.
print(uniform_df.type.value_counts())

type
INTP    181
INTJ    181
INFJ    181
INFP    181
ENTP    181
ENFP    181
ISTP    181
ENTJ    181
ESTP    181
ENFJ    181
ISTJ    181
ISFP    181
ISFJ    181
ESTJ    181
ESFP    181
ESFJ    181
Name: count, dtype: int64


In [6]:
# Persist the balanced dataset to CSV, without writing the DataFrame index.
uniform_df.to_csv('data/UniformMBTI.csv', index=False)

In [7]:
# Convert the balanced frame to a NumPy array; the functions below read
# column 0 as the post text and column 1 as the MBTI label.
ds = np.array(uniform_df)

def tokenize_features(dataset, num_features, token_to_index=None, unknown_index=0):
    """Turn the texts in column 0 of ``dataset`` into a fixed-width index matrix.

    Each text is split on single spaces. When ``token_to_index`` is given,
    every token is looked up in it and the resulting indices are written
    left-aligned into the row; texts longer than ``num_features`` tokens are
    truncated and shorter ones keep trailing zeros as padding.

    Args:
        dataset: 2-D array whose first column holds the texts (strings).
        num_features: Number of columns (tokens kept per text) in the result.
        token_to_index: Optional mapping with a ``.get(token, default)``
            method (e.g. a ``dict``) from token string to integer index.
            When omitted, no lookup is possible and the matrix is returned
            all zeros, which is what the function always returned before
            this parameter existed.
        unknown_index: Index stored for tokens missing from
            ``token_to_index``.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(dataset), num_features)``.
    """
    tokens = [text.split(' ') for text in dataset[:, 0]]
    features = np.zeros((len(tokens), num_features), dtype=int)

    if token_to_index is None:
        # Without a vocabulary there is nothing to encode.
        return features

    for row, words in enumerate(tokens):
        kept = words[:num_features]  # truncate over-long texts
        features[row, :len(kept)] = [token_to_index.get(word, unknown_index) for word in kept]

    return features

# 527 is the longest post length (in space-separated tokens) shown by the
# `lengths.describe()` output earlier in this notebook.
# NOTE(review): called with only these two arguments, tokenize_features
# returns an all-zero matrix, so X carries no information about the posts.
X = tokenize_features(ds, 527)
X.shape

(2896, 527)

In [8]:
def encode_classes(dataset):
    """One-hot encode the MBTI labels stored in column 1 of ``dataset``.

    Returns an array with one row per sample and one column per MBTI type,
    in the order of the ``types`` list below. A label that matches none of
    the 16 types yields an all-zero row.
    """
    types = ['INTP', 'INTJ', 'INFJ', 'INFP', 'ENTP', 'ENFP', 'ISTP', 'ENTJ',
             'ESTP', 'ENFJ', 'ISTJ', 'ISFP', 'ISFJ', 'ESTJ', 'ESFP', 'ESFJ']

    one_hot_rows = []
    for label in dataset[:, 1]:
        # 1 in the column of the matching type, 0 everywhere else.
        one_hot_rows.append([int(candidate == label) for candidate in types])

    return np.array(one_hot_rows)

# One-hot label matrix: one row per sample, one column per MBTI type.
Y = encode_classes(ds)
Y.shape

(2896, 16)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples; that held-out part is then cut in half,
# first half for testing and second half for validation.
X_train, X_test_val, Y_train, Y_test_val = train_test_split(
    X, Y, test_size=0.3, random_state=4
)

half_x = int(len(X_test_val) * 0.5)
half_y = int(len(Y_test_val) * 0.5)

X_test = X_test_val[:half_x]
X_val = X_test_val[half_x:]
Y_test = Y_test_val[:half_y]
Y_val = Y_test_val[half_y:]

for title, split in (("Train set:", X_train),
                     ("Validation set:", X_val),
                     ("Test set:", X_test)):
    print(title, split.shape)


Train set: (2027, 527)
Validation set: (435, 527)
Test set: (434, 527)


In [10]:
batch_size = 50

# Wrap each NumPy split as a (features, labels) tensor dataset.
train_data = TensorDataset(*(torch.from_numpy(arr) for arr in (X_train, Y_train)))
val_data = TensorDataset(*(torch.from_numpy(arr) for arr in (X_val, Y_val)))
test_data = TensorDataset(*(torch.from_numpy(arr) for arr in (X_test, Y_test)))

# Every loader reshuffles its samples on each pass and yields batches of `batch_size`.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

In [11]:
# Load the spaCy pipeline 'en_core_web_md'; its vocabulary (`nlp.vocab`) is
# later handed to MBTIClassifier as the source of the embedding vectors.
nlp = spacy.load('en_core_web_md')

In [12]:
class MBTIClassifier(nn.Module):
    """Convolutional text classifier over pretrained word vectors (16 classes).

    Token indices are embedded, passed through one convolution per kernel
    size (each spanning the full embedding width), max-pooled over the
    sequence axis, concatenated, regularised with dropout and projected to
    16 outputs. ``forward`` returns log-probabilities (``LogSoftmax``).

    Args:
        embed_model: Object exposing ``vectors.shape`` (``(vocab, dim)``) and
            ``vectors.data`` (a NumPy array of that shape) used to initialise
            the embedding table.
        num_filters: Number of output channels of every convolution.
        kernel_sizes: Heights (in tokens) of the convolution kernels. The
            default list is only read, never mutated.
        drop_prob: Dropout probability applied before the linear layer.
    """

    def __init__(self, embed_model, num_filters, kernel_sizes=[3, 4, 5],  drop_prob=0.5):
        super().__init__()

        # Stored under its own name: assigning to `named_buffers` would shadow
        # nn.Module.named_buffers() and break e.g. `model.buffers()`.
        self.num_filters = num_filters

        embed_dim = embed_model.vectors.shape[1]
        # torch.from_numpy shares memory with the source array; clone so that
        # training on CPU does not overwrite the caller's vector table.
        pretrained = torch.from_numpy(embed_model.vectors.data).clone()
        # freeze=False keeps the embedding weights trainable.
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=False)

        self.convs_1d = nn.ModuleList([
            nn.Conv2d(1, num_filters, (i, embed_dim), padding=(i-2, 0))
            for i in kernel_sizes])
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, 16)
        self.dropout = nn.Dropout(drop_prob)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        """Return log-probabilities of shape ``(batch, 16)``.

        ``X`` must be a 2-D integer tensor ``(batch, seq_len)`` of embedding
        row indices.
        """
        embeds = self.embedding(X)       # (batch, seq_len, embed_dim)
        embeds = embeds.unsqueeze(1)     # add the single input channel for Conv2d

        convs = []

        for conv in self.convs_1d:
            # The kernel covers the whole embedding width, so the last axis
            # collapses to size 1 and can be squeezed away.
            x = F.relu(conv(embeds)).squeeze(3)
            # Max over the whole sequence axis: one value per filter.
            x_max = F.max_pool1d(x, x.size(2)).squeeze(2)
            convs.append(x_max)

        x = torch.cat(convs, 1)
        x = self.dropout(x)
        x = self.fc(x)

        return self.logsoftmax(x)

In [13]:
import time
import math

def timeSince(since):
    '''
    Helper used to monitor how long training has been running.

    Returns the time elapsed since the timestamp `since` (as given by
    time.time()) formatted as '<minutes>m <seconds>s'.
    '''
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [46]:
def train(net, train_loader, criterion, optimizer, validation_loader=None):
    """Run one training epoch followed by one validation pass.

    Args:
        net: Model to train; moved to the GPU when CUDA is available.
        train_loader: Iterable of ``(x, y)`` training batches.
        criterion: Loss called as ``criterion(net(x), y.float())``. The
            reported averages assume it returns the mean over the batch.
        optimizer: Optimizer holding the parameters of ``net``.
        validation_loader: Iterable of ``(x, y)`` validation batches. When
            omitted, the notebook-level ``val_loader`` is used.

    Returns:
        ``(train_loss, val_loss)``: per-sample average loss over the training
        epoch and over the whole validation set, as Python floats (``nan``
        for an empty loader).
    """
    if validation_loader is None:
        # Fall back to the loader defined at notebook level.
        validation_loader = val_loader

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net.to(device)

    net.train()
    train_loss_sum = 0.0
    train_samples = 0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)

        net.zero_grad()
        output = net(x)
        batch_loss = criterion(output, y.float())
        # Gradients must exist before the optimizer step, so backpropagate
        # every batch rather than once after the loop.
        batch_loss.backward()
        optimizer.step()

        # .item() detaches the value, so the graph of each batch is freed.
        train_loss_sum += batch_loss.item() * x.size(0)
        train_samples += x.size(0)

    net.eval()
    val_loss_sum = 0.0
    val_samples = 0
    with torch.no_grad():  # no gradients needed for evaluation
        for x, y in validation_loader:
            x = x.to(device)
            y = y.to(device)

            output = net(x)
            val_loss_sum += criterion(output, y.float()).item() * x.size(0)
            val_samples += x.size(0)

    train_loss = train_loss_sum / train_samples if train_samples else float('nan')
    val_loss = val_loss_sum / val_samples if val_samples else float('nan')
    return train_loss, val_loss

In [49]:
# Build the classifier from the spaCy vocabulary with 20 filters per kernel
# size, then print its layer structure.
clasificador = MBTIClassifier(nlp.vocab, 20)
print(clasificador)

MBTIClassifier(
  (embedding): Embedding(20000, 300)
  (convs_1d): ModuleList(
    (0): Conv2d(1, 20, kernel_size=(3, 300), stride=(1, 1), padding=(1, 0))
    (1): Conv2d(1, 20, kernel_size=(4, 300), stride=(1, 1), padding=(2, 0))
    (2): Conv2d(1, 20, kernel_size=(5, 300), stride=(1, 1), padding=(3, 0))
  )
  (fc): Linear(in_features=60, out_features=16, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (logsoftmax): LogSoftmax(dim=1)
)


In [None]:
n_iters = 50       # number of passes over the training set
print_every = 5    # print progress every this many iterations
plot_every = 5     # average the losses over this many iterations per plotted point
all_losses = []
all_val_losses = []
total_loss = 0
total_val_loss = 0

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(clasificador.parameters(), lr=0.01)

start = time.time()

# `epoch` rather than `iter`, so the builtin iter() is not shadowed.
for epoch in range(1, n_iters + 1):
    loss, val_loss = train(clasificador, train_loader, criterion, optimizer)
    total_loss += loss
    total_val_loss += val_loss

    if epoch % print_every == 0:
        print('%s (%d %d%%)' % (timeSince(start), epoch, epoch / n_iters * 100),
              'Loss:{:.4f}'.format(loss), 'Val_loss:{:.4f}'.format(val_loss))

    if epoch % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        all_val_losses.append(total_val_loss / plot_every)
        total_loss = 0
        total_val_loss = 0

# One point per `plot_every` iterations; plot against the iteration at which
# each average was recorded so the x-axis matches its label.
plotted_epochs = range(plot_every, n_iters + 1, plot_every)

plt.plot(plotted_epochs, all_losses, label='Train')
plt.plot(plotted_epochs, all_val_losses, label='Validation')
plt.title('Curva de Error')
plt.xlabel('Iteraciones')
plt.ylabel('Error')
plt.legend()  # without this call the `label=` arguments are never displayed
plt.show()