In [1]:
import torch as torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import pandas as pd
import re
import random

In [2]:
data = pd.read_csv('mbti_preprocessed.csv')

all_characters = "abcdefghijklmnopqrstuvwxyz0123456789 ,.!?:()"
all_personality = ['INTJ', 'INTP', 'INFJ', 'INFP',
                   'ISTJ', 'ISTP', 'ISFJ', 'ISFP',
                   'ENTJ', 'ENTP', 'ENFJ', 'ENFP',
                   'ESTJ', 'ESTP', 'ESFJ', 'ESFP']

def clean_text(text):
    return re.sub("[^a-z0-9 ,\\.!?:()]", "", text.lower()) 
data['post'] = data['post'].apply(clean_text)

def filter_row(row):
    if row['post'].strip() == '':
        return False
    if all(x.isdigit() for x in row['post'].strip().split()):
        return False
    return True

data = data[data.apply(filter_row, axis=1)]
df = pd.DataFrame(data)
df

Unnamed: 0.1,Unnamed: 0,post,type
0,0,enfp and intj moments sportscenter not top ...,INFJ
1,1,what has been the most lifechanging experience...,INFJ
2,2,on repeat for most of today,INFJ
3,3,may the perc experience immerse you,INFJ
4,4,the last thing my infj friend posted on his fa...,INFJ
5,5,hello enfj7 sorry to hear of your distress its...,INFJ
7,7,welcome and stuff,INFJ
8,8,game set match,INFJ
9,9,prozac wellbrutin at least thirty minutes of m...,INFJ
10,10,basically come up with three items youve deter...,INFJ


In [3]:

def character_to_index(letter):
    ret = all_characters.find(letter)
    if ret == -1:
        raise Exception("Character not found {}".format(letter))
    return ret

def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, len(all_characters)).cuda()
    for idx, character in enumerate(line):
        tensor[idx][0][character_to_index(character)] = 1
    return tensor

def personality_to_index(personality):
    return all_personality.index(personality)

def personality_to_tensor(personality):
    return torch.LongTensor([personality_to_index(personality)]).cuda()


In [67]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
#         self.hidden_size = hidden_size
        
#         self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
#         self.i2o = nn.Linear(input_size + hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)
    
        self.l1 = nn.RNN(len(all_characters), 128, 1, nonlinearity='relu')
        self.l2 = nn.Linear(128, len(all_personality))
        self.softmax = nn.LogSoftmax(dim=2)
        
    def forward(self, input, hidden):
#         combined = torch.cat([input, hidden], 1)
#         hidden = self.i2h(combined)
#         output = self.i2o(combined)
#         output = self.softmax(output)
#         return output, hidden
        output, hidden = self.l1(input, hidden)
        output = self.l2(output)
        output = self.softmax(output)
        return output.view(1,-1), hidden
    
    def init_hidden(self):
        #return Variable(torch.zeros(1, self.hidden_size).cuda())
        return Variable(torch.zeros(1, 1, 128).cuda()) #layers, batch, hidden_size
    


In [None]:
rnn = RNN(len(all_characters), 512, len(all_personality)).cuda()
criterion = nn.NLLLoss()
learning_rate = 0.005

def train(X, Y):
    hidden = rnn.init_hidden()
    rnn.zero_grad()
    
    for i in range(X.size()[0]):
        y_hat, hidden = rnn(X[i].unsqueeze(0), hidden)
    
    loss = criterion(y_hat, Y)
    loss.backward()
    
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)
    
    return y_hat, loss.data[0]

iter = 0
n_iters = 100000
all_loss = 0
print_every = 1000
correct_count = 0

for iter in range(n_iters):
    sample = data.sample(n=1)
    X = Variable(line_to_tensor(sample['post'].iloc[0])).cuda()
    Y = Variable(personality_to_tensor(sample['type'].iloc[0])).cuda()
    
    predicted, loss = train(X, Y)
    all_loss += loss
    maxval, argmax = predicted.max(1)
    
    is_correct = all_personality.index(sample['type'].iloc[0]) == argmax.data[0]
    if is_correct:
        correct_count += 1
    
    if iter % print_every == 0:
        print("{}, {} avg: {} ({} / {}) ({} of {} correct)".format(iter, loss, all_loss / print_every, all_personality[argmax.data[0]], sample['type'].iloc[0], correct_count, print_every))
        all_loss = 0
        correct_count = 0
    
    

0, 2.749635934829712 avg: 0.0027496359348297117 (ESTJ / INTJ) (0 of 1000 correct)
1000, 2.6844446659088135 avg: 2.4632148123979567 (INTP / ENTP) (174 of 1000 correct)
2000, 1.87796950340271 avg: 2.3077777811288835 (INFP / INFJ) (175 of 1000 correct)
3000, 1.5894948244094849 avg: 2.3308941099643707 (INFP / INFP) (208 of 1000 correct)
4000, 2.0219173431396484 avg: 2.276189523935318 (INFP / INTP) (209 of 1000 correct)
5000, 1.8985515832901 avg: 2.3148885898590086 (INFP / INFJ) (187 of 1000 correct)
6000, 2.5320615768432617 avg: 2.291142550230026 (INFP / ENTP) (178 of 1000 correct)
7000, 2.4020838737487793 avg: 2.2913177757263186 (INFP / ENFP) (174 of 1000 correct)
8000, 1.547767162322998 avg: 2.285346109986305 (INFP / INFP) (216 of 1000 correct)
9000, 1.7563728094100952 avg: 2.295362334728241 (INFP / INFJ) (195 of 1000 correct)
10000, 1.6454198360443115 avg: 2.3180390019416808 (INFJ / INFJ) (205 of 1000 correct)
11000, 1.7058475017547607 avg: 2.3048809374570847 (INFP / INFJ) (198 of 1000 