In [None]:
import os

# Print the full path of every file found under the Colab notebooks folder
# on the mounted Drive, recursing into sub-directories.
for dirname, _, filenames in os.walk('drive/My Drive/Colab Notebooks'):
    for file_path in (os.path.join(dirname, entry) for entry in filenames):
        print(file_path)

In [None]:
import sys
import numpy as np
import random as rn
import pandas as pd
import torch
from torch import nn
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

! pip install transformers
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AdamW

In [4]:
# Seed every random number generator used in this notebook with the same value (1):
# Python's `random`, NumPy, and PyTorch (CPU and CUDA), in that order.
for seed_fn in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(1)

In [5]:
# Folder on the mounted Drive that holds the CSV splits.
path = 'drive/My Drive/HT/Data/'

# Read the train and test splits into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(path + csv_name) for csv_name in ('train_data.csv', 'test_data.csv')
)

In [6]:
# To debug on a subset, slice the frames before converting them,
# e.g. train_data[:2000] and test_data[:500].

# Turn each DataFrame into a list with one dict per row (column name -> value).
# NOTE: the names are rebound from DataFrame to list, so this cell cannot be
# re-run without reloading the CSV files first.
train_data, test_data = (
    frame.to_dict(orient='records') for frame in (train_data, test_data)
)
type(train_data)

list

In [7]:
# Emotion score columns, in the order that defines the class index (0..5).
EMOTION_COLUMNS = ('Happy', 'Sad', 'Anger', 'Surprise', 'Disgust', 'Fear')


def split_texts_and_labels(records, emotion_columns=EMOTION_COLUMNS):
    """Split row dicts into parallel tuples of sentences and class labels.

    Args:
        records: iterable of dicts, each holding a 'Sentence' entry plus one
            numeric score per name in `emotion_columns`.
        emotion_columns: score columns; the position of the highest score in
            this sequence becomes the row's label.

    Returns:
        (texts, labels): `texts` is a tuple of the 'Sentence' values and
        `labels` is a tuple of 0-d torch tensors holding the argmax index.
        Both tuples are empty when `records` is empty (the previous
        `zip(*...)` unpacking raised on empty input).
    """
    records = list(records)
    texts = tuple(record['Sentence'] for record in records)
    labels = tuple(
        torch.argmax(torch.from_numpy(np.array([record[column] for column in emotion_columns])))
        for record in records
    )
    return texts, labels


train_texts, train_labels = split_texts_and_labels(train_data)
test_texts, test_labels = split_texts_and_labels(test_data)

# Show the first training label as a quick sanity check.
train_labels[0]

tensor(0)

In [8]:
# Checkpoint whose vocabulary is used to tokenize the sentences.
tokenizer_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [9]:
def _wrap_with_special_tokens(text):
    """Tokenize `text`, keep at most 510 tokens, and wrap them in [CLS] ... [SEP].

    With the two special tokens the result never exceeds 512 tokens.
    """
    return ['[CLS]'] + tokenizer.tokenize(text)[:510] + ['[SEP]']


train_tokens = [_wrap_with_special_tokens(text) for text in train_texts]
test_tokens = [_wrap_with_special_tokens(text) for text in test_texts]

len(train_tokens), len(test_tokens)

(19760, 3488)

In [10]:
def _to_padded_ids(token_lists):
    """Convert token lists to vocabulary ids and pad/truncate each to 512 ids.

    Padding and truncation are both applied at the end of the sequence
    ("post"); the result is an integer array with one row per sequence.
    """
    id_lists = [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
    return pad_sequences(id_lists, maxlen=512, truncating="post", padding="post", dtype="int")


train_tokens_ids = _to_padded_ids(train_tokens)
test_tokens_ids = _to_padded_ids(test_tokens)

train_tokens_ids.shape, test_tokens_ids.shape

((19760, 512), (3488, 512))

In [11]:
# Stack the per-sentence class indices into NumPy vectors.
train_y, test_y = np.array(train_labels), np.array(test_labels)

# Shapes plus the mean class index of each split, as a quick look at the label distribution.
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

((19760,), (3488,), 1.8427125506072874, 1.8136467889908257)

In [14]:
def _attention_mask(token_ids):
    """Build a 0/1 mask with the same shape as `token_ids`.

    Positions holding the tokenizer's pad id get 0 and every other position
    gets 1. The comparison is vectorised in NumPy instead of looping over
    every id in Python, and the result is an int64 array, which
    `torch.tensor` accepts directly.
    """
    return (np.asarray(token_ids) != tokenizer.pad_token_id).astype(np.int64)


train_masks = _attention_mask(train_tokens_ids)
test_masks = _attention_mask(test_tokens_ids)

In [17]:
class XLNetSequenceClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Despite the legacy class name, the encoder defaults to the
    'bert-base-cased' checkpoint. The name is kept so existing code and
    saved state dicts keep working.

    Args:
        dropout: dropout probability applied to the pooled encoder output.
        num_labels: number of output classes (size of the logit vector).
        model_name: checkpoint passed to `AutoModel.from_pretrained`.
    """

    def __init__(self, dropout=0.0, num_labels=6, model_name='bert-base-cased'):
        super(XLNetSequenceClassifier, self).__init__()

        self.bert = AutoModel.from_pretrained(model_name)

        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder config rather than hard-coding 768,
        # so other checkpoints can be used without editing the class.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, tokens, masks=None):
        """Return unnormalised class logits for a batch of token ids.

        Args:
            tokens: integer tensor of token ids, one row per sequence.
            masks: optional attention mask (1 = real token, 0 = padding).

        Returns:
            Tensor of raw logits with `num_labels` columns. No softmax is
            applied because `nn.CrossEntropyLoss` expects raw logits.
        """
        outputs = self.bert(tokens, attention_mask=masks)
        # Index instead of tuple-unpacking: recent transformers versions return
        # a ModelOutput whose iteration yields key names, not tensors.
        # Position 1 is the pooled output for both return styles.
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        return self.linear(dropout_output)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Instantiate the classifier with its default settings and move it to the chosen device.
xl_clf = XLNetSequenceClassifier().to(device)

# Memory currently allocated by tensors on the device, in millions of bytes.
'{}M'.format(torch.cuda.memory_allocated(device) / 1000000)

In [20]:
# Training hyper-parameters: samples per mini-batch and number of passes over the training set.
BATCH_SIZE, EPOCHS = 10, 8

In [None]:
def _label_column(labels):
    """Return `labels` as a float tensor with a single column (shape (N, 1))."""
    return torch.tensor(labels.reshape(-1, 1)).float()


# Token ids and labels, training split first and then test split.
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = _label_column(train_y)

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = _label_column(test_y)

# Attention masks for both splits.
train_masks_tensor, test_masks_tensor = torch.tensor(train_masks), torch.tensor(test_masks)

# Memory currently allocated by tensors on the device, in millions of bytes.
'{}M'.format(torch.cuda.memory_allocated(device) / 1000000)

In [22]:
# Each dataset item is a (token ids, attention mask, label) triple.
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)

# Training batches are drawn in random order; test batches keep the dataset
# order so predictions line up with `test_y`.
train_sampler = RandomSampler(train_dataset)
test_sampler = SequentialSampler(test_dataset)

train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=BATCH_SIZE,
)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=BATCH_SIZE,
)

In [23]:
# (name, parameter) pairs of the linear classification head only.
param_optimizer = [*xl_clf.linear.named_parameters()]

# A single optimizer parameter group holding just the head's parameters.
# NOTE(review): in the visible notebook the optimizer is built from
# xl_clf.parameters() and does not consume this group — confirm it is still needed.
head_params = [param for _name, param in param_optimizer]
optimizer_grouped_parameters = [{"params": head_params}]

In [24]:
# Adam over every parameter of the classifier (encoder and head) with a learning rate of 1e-5.
learning_rate = 1e-5
optimizer = Adam(xl_clf.parameters(), lr=learning_rate)

# Release cached GPU memory that PyTorch is holding but not using.
torch.cuda.empty_cache()

In [1]:
# Fine-tune the classifier for EPOCHS passes over the training loader,
# showing the running mean loss after every step.

# The loss module is stateless, so build it once instead of on every step.
loss_func = nn.CrossEntropyLoss()
num_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):
    xl_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = xl_clf(token_ids, masks=masks)

        # Labels arrive as a float column; CrossEntropyLoss needs a 1-D long vector.
        # view(-1) keeps the batch dimension even for a single-sample batch, where
        # a bare squeeze() would yield a 0-d tensor and make the loss call fail.
        targets = labels.long().view(-1)
        batch_loss = loss_func(logits, targets)
        train_loss += batch_loss.item()

        xl_clf.zero_grad()
        batch_loss.backward()

        # Clip the global gradient norm to 1.0 before the parameter update.
        clip_grad_norm_(parameters=xl_clf.parameters(), max_norm=1.0)
        optimizer.step()

        # Replace the previous progress lines instead of appending new ones.
        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num + 1, num_batches, train_loss / (step_num + 1)))


NameError: ignored

In [20]:
# Run the classifier over the test loader and collect its predictions.
xl_clf.eval()
xl_predicted = []  # predicted class index per test sentence, in loader order
all_logits = []    # raw logit of class 0 per test sentence (see note below)

# The loss module is stateless, so build it once instead of on every batch.
loss_func = nn.CrossEntropyLoss()

with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = xl_clf(token_ids, masks)

        # view(-1) keeps a 1-D target even when the final batch has one sample;
        # a bare squeeze() would produce a 0-d tensor and make the loss call fail.
        loss = loss_func(logits, labels.long().view(-1))

        numpy_logits = logits.cpu().detach().numpy()

        # Store plain Python ints so the list prints cleanly regardless of NumPy version.
        xl_predicted.extend(torch.argmax(logits, dim=1).cpu().tolist())

        # NOTE(review): only column 0 (the first class's logit) is kept here;
        # confirm whether the full logit rows were intended.
        all_logits += list(numpy_logits[:, 0])

In [21]:
from sklearn.metrics import classification_report

# Gold labels and predictions, both printed as arrays so NumPy abbreviates
# them instead of dumping every one of the test predictions into the output.
print(test_y)
print(np.array(xl_predicted))

# Per-class precision / recall / F1 on the test split.
print(classification_report(test_y, xl_predicted))

[1 1 3 ... 0 0 5]
[5, 0, 0, 0, 0, 4, 4, 1, 2, 0, 0, 0, 4, 4, 0, 0, 2, 0, 1, 1, 0, 1, 1, 2, 2, 1, 5, 5, 0, 0, 0, 0, 0, 1, 0, 4, 1, 1, 0, 0, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 5, 5, 5, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 5, 5, 0, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 4, 4, 0, 0, 0, 0, 4, 4, 4, 4, 4, 5, 5, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 0, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 1, 0, 5, 5, 5, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 2, 2, 2, 4, 4, 4, 4, 4, 1, 5, 5, 5, 0, 0, 4, 4, 4, 1, 1, 1, 1, 0, 0, 4, 4, 5, 5, 5, 4, 4, 4, 4, 0, 0, 4, 4, 4, 4, 2, 1, 1, 1, 1, 4, 1, 1, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 5, 5, 2, 0, 0, 0, 5, 5, 5, 0, 0, 0, 5, 0, 2, 2, 0, 0, 0, 

In [22]:
# Destinations on the mounted Drive for the trained weights and the tokenizer files.
weights_path = "drive/My Drive/HT/BERT_CMU_MOSEI.pth"
tokenizer_dir = "drive/My Drive/HT/BERT_CMU_MOSEI_TKN"

# Save only the state dict (parameters), not the pickled module object.
torch.save(xl_clf.state_dict(), weights_path)

# Write the tokenizer files; the returned file paths are shown as the cell output.
tokenizer.save_pretrained(tokenizer_dir)

('drive/My Drive/HT/BERT_CMU_MOSEI_TKN/tokenizer_config.json',
 'drive/My Drive/HT/BERT_CMU_MOSEI_TKN/special_tokens_map.json',
 'drive/My Drive/HT/BERT_CMU_MOSEI_TKN/vocab.txt',
 'drive/My Drive/HT/BERT_CMU_MOSEI_TKN/added_tokens.json')