In [1]:
import torch
from torch import nn

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
import pandas as pd

train_data = pd.read_csv("./train_eng.csv", header=0, engine='python' ,encoding = "latin-1", usecols=["Name","Gender"])
test_data = pd.read_csv("./test_eng.csv", header=0, engine='python' ,encoding = "latin-1", usecols=["Name","Gender"])

test_data['Gender'] = test_data['Gender'].apply(lambda x: 0 if x=='M' else 1)
train_data['Gender'] = train_data['Gender'].apply(lambda x: 0 if x=='M' else 1)


#print(train_data.shape)
#train_data = train_data[:1280]
#test_data = test_data[:1280]
#print(train_data.shape)

train_data = train_data.sort_values(by="Name", key=lambda x: x.str.len())
test_data = test_data.sort_values(by="Name", key=lambda x: x.str.len())

max_length_test = len(test_data.iloc[-1]['Name'])
max_length_train = len(train_data.iloc[-1]['Name'])
max_length = max(max_length_test, max_length_train)

unique = list(set("".join(train_data.iloc[:,0])))
unique.sort()
vocab = dict(zip(unique, range(1,len(unique)+1)))

tokenizer = get_tokenizer('basic_english')

vocab_new = Vocab(vocab,specials=())

def data_process(raw_text_iter,max_len=128):
  batch = []
  for item in raw_text_iter:
    res = []
    for i in range(max_len):
      if (len(item)>i):
        res.extend([vocab_new[token] for token in tokenizer(item[i])])
      else:
        res.extend([0])
    batch.append(res)
  pad_data = torch.FloatTensor(batch)
  return pad_data


In [3]:
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
scaler = MinMaxScaler(feature_range=(-1, 1))

max_len = 64
embedding_size = max(max_length_train, max_length_test)
n_classes = len(np.unique(train_data.Gender.values))

train_tensor = data_process(train_data.Name.values, embedding_size)
train_data_normalized = torch.FloatTensor(scaler.fit_transform(train_tensor))
tgts_tensor = torch.nn.functional.one_hot(torch.from_numpy(train_data.Gender.values), n_classes) #torch.from_numpy(train_data.Target.values)

dataset = TensorDataset(train_data_normalized, tgts_tensor)

loader = DataLoader(dataset, batch_size=128, shuffle=True, pin_memory=True)

tensor([[ 0.9200,  0.5200, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [ 0.9200,  0.6800, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [-0.1200,  0.8400, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        ...,
        [ 0.8400,  0.4400, -0.3600,  ...,  0.3600,  0.8400,  0.4400],
        [ 0.2000,  0.6800, -0.6800,  ..., -0.2000,  0.4400,  0.6800],
        [ 0.2800, -0.1200, -0.0400,  ...,  1.0000,  0.6800,  0.1200]])
tensor([[ 0.9200,  0.5200, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [ 0.9200,  0.6800, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [-0.1200,  0.8400, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        ...,
        [ 0.8400,  0.4400, -0.3600,  ...,  0.3600,  0.8400,  0.4400],
        [ 0.2000,  0.6800, -0.6800,  ..., -0.2000,  0.4400,  0.6800],
        [ 0.2800, -0.1200, -0.0400,  ...,  1.0000,  0.6800,  0.1200]])

tensor([[0, 1],
        [1, 0],
        [1, 0],
        ...,
        [1, 0],
        [1, 0],
        [1, 0]])

tensor([[ 0.9200,  0



In [4]:
class LSTM(torch.nn.Module):
  def __init__(self, input_size=max_length, hidden_layer_size=100, output_size=2):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size)
        self.linear = nn.Linear(hidden_layer_size, output_size)
        self.hidden_cell = (torch.zeros(1,1,self.hidden_layer_size),
                            torch.zeros(1,1,self.hidden_layer_size))

  def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        return predictions

model = LSTM()
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [5]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [9]:
epochs = 50
import time 
import torch.nn.functional as F
start_time = time.time()

for i in range(epochs):
    correct = 0
    total = 0
    start_epoch = time.time()
    for item in loader:
        seq = item[0]
        label = item[1]
        optimizer.zero_grad()
        model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size),
                        torch.zeros(1, 1, model.hidden_layer_size))

        y_pred = model(seq)
        for j in range(y_pred.shape[0]):
          predicted = np.argmax(F.softmax(y_pred[j].data))
          true = np.argmax(label[j].data)
          if (predicted == true):
            correct+=1
          total+=1
        single_loss = loss_function(y_pred, label.float())
        single_loss.backward()
        optimizer.step()

    time_per_epoch = time.time() - start_epoch
    f_measure = f1_loss(label, y_pred)
    writer.add_scalar("train_loss", single_loss.item(), i)
    writer.add_scalar("train_acc", (100 * correct / total), i)
    writer.add_scalar("train_measure", f_measure, i)
    writer.add_scalar("train_time", time_per_epoch, i)
    for tag, parm in model.named_parameters():
      writer.add_histogram(tag, parm.grad.data.cpu().numpy(), i)
    print(f'epoch: {i:3} loss: {single_loss.item():10.8f}, accuracy: {(100 * correct / total)}, f-measure: {f_measure}, time = {time_per_epoch}')

print("MODEL TIME EXECUTION--- %s seconds ---" % (time.time() - start_time))
print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')



epoch:   0 loss: 0.22744991, accuracy: 63.240802996830276, f-measure: 0.5273582339286804, time = 38.57597589492798
epoch:   1 loss: 0.22572528, accuracy: 65.60608971280377, f-measure: 0.5306821465492249, time = 28.163079977035522
epoch:   2 loss: 0.17508495, accuracy: 68.01459994236865, f-measure: 0.5591893792152405, time = 27.99694514274597
epoch:   3 loss: 0.18287085, accuracy: 69.07837863797906, f-measure: 0.5567010641098022, time = 27.788639545440674
epoch:   4 loss: 0.19746009, accuracy: 69.56584381903755, f-measure: 0.5436967611312866, time = 28.022472143173218
epoch:   5 loss: 0.18945123, accuracy: 69.74714244549034, f-measure: 0.5626565217971802, time = 28.18270182609558
epoch:   6 loss: 0.20498942, accuracy: 69.93564499087503, f-measure: 0.5506161451339722, time = 28.06424880027771
epoch:   7 loss: 0.17943515, accuracy: 70.20459129766593, f-measure: 0.5645135045051575, time = 28.614763736724854
epoch:   8 loss: 0.19083315, accuracy: 70.36427816732302, f-measure: 0.551654219627

In [8]:
def f1_loss(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:
    all = y_pred.shape[0]
    sum=0
    for i in range(all):
      sum+=f1_loss_one(y_true[i], y_pred[i])
    return sum/all

def f1_loss_one(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:
    y_pred_soft = (F.softmax(((y_pred)), dim=0)).detach()
    if y_pred_soft.ndim == 2:
        y_pred_soft = y_pred_soft.argmax(dim=1)
    if y_true.ndim == 2:
        y_true = y_true.argmax(dim=1)
    
    tp = (y_true * y_pred_soft).sum().to(torch.float32)
    tn = ((1 - y_true) * (1 - y_pred_soft)).sum().to(torch.float32)
    fp = ((1 - y_true) * y_pred_soft).sum().to(torch.float32)
    fn = (y_true * (1 - y_pred_soft)).sum().to(torch.float32)
    
    epsilon = 1e-7
    
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    
    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    #f1.requires_grad = is_training
    return f1

In [10]:
print(test_data)
scaler = MinMaxScaler(feature_range=(-1, 1))

max_len = 64
embedding_size = max(max_length_train, max_length_test)
n_classes = len(np.unique(test_data.Gender.values))

test_tensor = data_process(test_data.Name.values, embedding_size)
test_data_normalized = torch.FloatTensor(scaler.fit_transform(test_tensor))
test_tgts_tensor = torch.nn.functional.one_hot(torch.from_numpy(test_data.Gender.values), n_classes) #torch.from_numpy(train_data.Target.values)

test_dataset = TensorDataset(test_data_normalized, test_tgts_tensor)

loader = DataLoader(dataset, batch_size=128, shuffle=True, pin_memory=True)

                  Name  Gender
20199               Ar       0
2786                Lc       0
3902                Si       0
8554                Ji       0
19970               Za       0
...                ...     ...
14096  Christiananthon       0
18070  Christianalexan       0
16548  Matthewalexande       0
10920  Ashleyelizabeth       1
11332  Christopherryan       0

[20822 rows x 2 columns]
tensor([[ 1.0000, -0.3600, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [ 0.1200,  0.8400, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [-0.4400,  0.3600, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        ...,
        [ 0.0400,  1.0000, -0.5200,  ..., -0.0400,  0.7600,  1.0000],
        [ 1.0000, -0.4400,  0.4400,  ...,  0.6800, -0.5200,  0.7143],
        [ 0.8400,  0.4400, -0.3600,  ..., -0.9200,  1.0000,  0.1429]])
tensor([[ 1.0000, -0.3600, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [ 0.1200,  0.8400, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [-0.4400,  0.3600, -

In [11]:
torch.save(model.state_dict(), 'model.pt')

In [12]:
def evaluate_model(model, data_batches, loss_function):
  eval_loss = 0
  eval_acc = 0
  
  model.eval()
  
  with torch.no_grad():
    for batch in data_batches:
      correct = 0
      total = 0
      predictions = model(batch[0]).squeeze(1)
      for j in range(predictions.shape[0]):
          predicted = np.argmax(F.softmax(predictions[j].data))
          true = np.argmax(batch[1][j].data)
          if (predicted == true):
            correct+=1
          total+=1
      loss = loss_function(predictions, batch[1])
      eval_loss += loss.item()
      eval_acc += (correct / total)
  
  return eval_loss / len(data_batches), eval_acc / len(data_batches)

In [13]:
model.load_state_dict(torch.load('model.pt'))
test_loss, test_acc = evaluate_model(model, loader, loss_function)
print(f'Accuracy on test data : {test_acc*100:.2f}%')

  del sys.path[0]


Accuracy on test data : 73.29%
