In [1]:
# Libraries needed for data preparation
import pandas as pd
import numpy as np

# Download the dataset and put it in subfolder called data
datapath = "train_only_dialogue_window_1.csv"
df = pd.read_csv(datapath)
df = df[["text", "label"]]

# Show the data
df.head()

Unnamed: 0,text,label
0,\nThe following is a fragment of a conversatio...,probing
1,\nThe following is a fragment of a conversatio...,generic
2,\nThe following is a fragment of a conversatio...,probing
3,\nThe following is a fragment of a conversatio...,probing
4,\nThe following is a fragment of a conversatio...,telling


In [2]:
print('Total number of news: {}'.format(len(df)))
print(40*'-')
print('Split by category:')
print(df["label"].value_counts())
print(40*'-')
nr_categories = len(df["label"].unique())
print("Number of categories: {n}".format(n=nr_categories))

Total number of news: 12646
----------------------------------------
Split by category:
label
focus      5334
probing    3005
telling    2428
generic    1879
Name: count, dtype: int64
----------------------------------------
Number of categories: 4


In [3]:
X = df['text']
y=np.unique(df['label'], return_inverse=True)[1]
print(y)

[2 1 2 ... 2 3 3]


In [4]:
import transformers
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
import torch

X_list=X.to_list()
X_pt = tokenizer(X_list, padding='max_length', max_length = 512, truncation=True, return_tensors='pt')["input_ids"]

y_list=y.tolist()
y_pt = torch.Tensor(y_list).long()

In [6]:
datapath_test = "test_only_dialogue_window_1.csv"
df_test = pd.read_csv(datapath_test)
df_test = df_test[["text", "label"]]

X_test = df_test['text']
y_test=np.unique(df_test['label'], return_inverse=True)[1]

X_list_test=X_test.to_list()
X_pt_test = tokenizer(X_list_test, padding='max_length', max_length = 512, truncation=True, return_tensors='pt')["input_ids"]

y_list_test=y_test.tolist()
y_pt_test = torch.Tensor(y_list_test).long()

In [7]:
# Convert data to torch dataset

X_pt_train = X_pt
y_pt_train = y_pt
from torch.utils.data import Dataset, DataLoader
class BBCNewsDataset(Dataset):
    """Custom-built BBC News dataset"""

    def __init__(self, X, y):
        """
        Args:
            X, y as Torch tensors
        """
        self.X_train = X
        self.y_train = y
        

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, idx):
        return self.X_train[idx], self.y_train[idx]# Get train and test data in form of Dataset class
train_data_pt = BBCNewsDataset(X=X_pt_train, y=y_pt_train)
test_data_pt = BBCNewsDataset(X=X_pt_test, y=y_pt_test)

In [8]:
# Get train and test data in form of Dataloader class
train_loader_pt = DataLoader(train_data_pt, batch_size=50, shuffle=True)
test_loader_pt = DataLoader(test_data_pt, batch_size=50, shuffle=True)

In [9]:
config = transformers.DistilBertConfig(dropout=0.2, attention_dropout=0.2)
dbert_pt = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

In [10]:
# Let's create a sample of size 5 from the training data
sample = X_pt_train[0:5]
print('Object type: ', type(dbert_pt(sample)))
print('Output format (shape): ',dbert_pt(sample)[0].shape)
print('Output used as input for the classifier (shape): ', dbert_pt(sample)[0][:,0,:].shape)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Object type:  <class 'transformers.modeling_outputs.BaseModelOutput'>
Output format (shape):  torch.Size([5, 512, 768])
Output used as input for the classifier (shape):  torch.Size([5, 768])


In [11]:
from torch import nn
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

class DistilBertClassification(nn.Module):
    def __init__(self):
        super(DistilBertClassification, self).__init__()
        self.dbert = dbert_pt
        self.dropout = nn.Dropout(p=0.2)
        self.linear1 = nn.Linear(768,64)
        self.ReLu = nn.ReLU()
        self.linear2 = nn.Linear(64,5)

    def forward(self, x):
        x = self.dbert(input_ids=x)
        x = x["last_hidden_state"][:,0,:]
        x = self.dropout(x)
        x = self.linear1(x)
        x = self.ReLu(x)
        logits = self.linear2(x)
        # No need for a softmax, because it is already included in the CrossEntropyLoss
        return logits

model_pt = DistilBertClassification().to(device)

Using cuda device


In [12]:
for param in model_pt.dbert.parameters():
    param.requires_grad = False

In [13]:
total_params = sum(p.numel() for p in model_pt.parameters())
total_params_trainable = sum(p.numel() for p in model_pt.parameters() if p.requires_grad)
print("Number of parameters: ", total_params)
print("Number of trainable parameters: ", total_params_trainable)

Number of parameters:  66412421
Number of trainable parameters:  49541


In [14]:
epochs = 5
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_pt.parameters())

from tqdm import tqdm
# Define the dictionary "history" that will collect key performance indicators during training
history = {}
history["epoch"]=[]
history["train_loss"]=[]
history["valid_loss"]=[]
history["train_accuracy"]=[]
history["valid_accuracy"]=[]

from datetime import datetime
# Measure time for training
start_time = datetime.now()

# Loop on epochs
for e in range(epochs):
    
    # Set mode in train mode
    model_pt.train()
    
    train_loss = 0.0
    train_accuracy = []
    
    # Loop on batches
    for X, y in tqdm(train_loader_pt):
        # Get prediction & loss
        prediction = model_pt(X.to(device))
        loss = criterion(prediction, y.to(device))
        
        # Adjust the parameters of the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        
        prediction_index = prediction.argmax(axis=1)
        accuracy = (prediction_index==y.to(device))
        train_accuracy += accuracy
    
    train_accuracy = (sum(train_accuracy) / len(train_accuracy)).item()
    
    # Calculate the loss on the test data after each epoch
    # Set mode to evaluation (by opposition to training)
    model_pt.eval()
    valid_loss = 0.0
    valid_accuracy = []
    for X, y in tqdm(test_loader_pt):
        
        prediction = model_pt(X.to(device))
        loss = criterion(prediction, y.to(device))

        valid_loss += loss.item()
        
        prediction_index = prediction.argmax(axis=1)
        accuracy = (prediction_index==y.to(device))
        valid_accuracy += accuracy
    valid_accuracy = (sum(valid_accuracy) / len(valid_accuracy)).item()
    
    # Populate history
    history["epoch"].append(e+1)
    history["train_loss"].append(train_loss / len(train_loader_pt))
    history["valid_loss"].append(valid_loss / len(test_loader_pt))
    history["train_accuracy"].append(train_accuracy)
    history["valid_accuracy"].append(valid_accuracy)    
        
    print(f'Epoch {e+1} \t\t Training Loss: {train_loss / len(train_loader_pt) :10.3f} \t\t Validation Loss: {valid_loss / len(test_loader_pt) :10.3f}')
    print(f'\t\t Training Accuracy: {train_accuracy :10.3%} \t\t Validation Accuracy: {valid_accuracy :10.3%}')
    
# Measure time for training
end_time = datetime.now()
training_time_pt = (end_time - start_time).total_seconds()

  0%|          | 0/253 [00:04<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 76.00 MiB. GPU 0 has a total capacty of 23.62 GiB of which 86.19 MiB is free. Process 2627555 has 22.36 GiB memory in use. Including non-PyTorch memory, this process has 774.00 MiB memory in use. Of the allocated memory 490.85 MiB is allocated by PyTorch, and 85.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
from matplotlib import pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax[0].set(title='Loss')
ax[0].plot(history['train_loss'], label='Training')
ax[0].plot(history['valid_loss'], label='Validation')
ax[0].legend(loc="upper right")

ax[1].set(title='Accuracy')
ax[1].plot(history['train_accuracy'], label='Training')
ax[1].plot(history['valid_accuracy'], label='Validation')
ax[1].legend(loc="lower right")

In [None]:
accuracy_pt = history['valid_accuracy'][-1]
print('Accuracy Training data: {:.1%}'.format(history['train_accuracy'][-1]))
print('Accuracy Test data: {:.1%}'.format(history['valid_accuracy'][-1]))
print('Training time: {:.1f}s (or {:.1f} minutes)'.format(training_time_pt, training_time_pt/60))

In [None]:
# Save only the parameters of the model but not the model itself, and get it back
torch.save(model_pt.state_dict(), 'PyModel_window_1.sd')
model_reloaded = DistilBertClassification()
model_reloaded.load_state_dict(torch.load('PyModel_window_1.sd'))
model_reloaded.eval()

In [None]:
# Save the entire model, and get it back
torch.save(model_pt, 'PyModelComplete_window_1.pt')
model_reloaded2 = torch.load('PyModelComplete_window_1.pt')
model_reloaded2.eval()

In [None]:
from sklearn.metrics import classification_report

prediction = model_pt(X_pt_test[:5].to(device)).argmax(axis=1)
report = classification_report(y_pt_test[:5], prediction.detach().cpu().numpy())

In [None]:
print(report)