# Overview

# Module Import

In [1]:
import pandas as pd
import os
from transformers import pipeline
import ast
from collections import Counter
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import numpy as np

import torch

# Data Load and Preprocessing

In [70]:
starwars_df = pd.read_csv('./min_sample_train.csv',index_col='idx', converters={'labels': ast.literal_eval})
starwars_df['text'] = starwars_df['term'] + ', ' + starwars_df['definition']
starwars_df.drop(columns=['page','block','term','definition'],inplace=True)

test_df = pd.read_csv('./sample_train.csv',index_col='idx')
test_df['text'] = test_df['term'] + ', ' + test_df['definition']
test_df.drop(columns=['page','block','term','definition'],inplace=True)

starwars_df.head()


Unnamed: 0_level_0,labels,text
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
455,[Republic],"All Stars Burn as One, The official anthem of ..."
1534,[Creature],"bocatt, A tusked, leather-skinned predator fou..."
3142,[Clan],"Daughters of Allya, The name adopted by the Da..."
322,[Character],"Aidus, A Rattataki guard who served Asajj Vent..."
8710,"[Imperial, Faction]","Insurrection, This branch of the Pentastar Ali..."


# Label Mapping

In [3]:
all_labels = sorted(list(set([label for labels in starwars_df['labels'] for label in labels])))
id2label = {idx:label for idx,label in enumerate(all_labels)}
label2id = {label:idx for idx,label in enumerate(all_labels)}

In [4]:
#We apply one-hot encoding for the labels
def encode_labels(example):
    #Sets the dimension for the example
    encoded_labels = [0] * len(all_labels)
    
    for label in example['labels']:
        encoded_labels[label2id[label]] = 1
    
    return {'labels': torch.tensor(encoded_labels, dtype=torch.float32)}
    #return {'labels': encoded_labels}

In [5]:
starwars_dataset = Dataset.from_pandas(starwars_df)
starwars_dataset = starwars_dataset.map(encode_labels,batched=False)

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

In [6]:
train_test_split = starwars_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(train_dataset[0])


{'labels': [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'text': 'Amee, One of Anakin Skywalker’s childhood friends in the slave quarters of Mos Espa, Amee worked as a house slave for a wealthy Toong couple. Her mother, Hala, was kidnapped in a slave raid by the pirate Krayn. She was three years younger than Anakin and attended the wedding of Shmi and Cliegg Lars.', 'idx': 521}


In [7]:
starwars_df.head()

Unnamed: 0_level_0,labels,text
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
455,[Republic],"All Stars Burn as One, The official anthem of ..."
1534,[Creature],"bocatt, A tusked, leather-skinned predator fou..."
3142,[Clan],"Daughters of Allya, The name adopted by the Da..."
322,[Character],"Aidus, A Rattataki guard who served Asajj Vent..."
8710,"[Imperial, Faction]","Insurrection, This branch of the Pentastar Ali..."


In [8]:
model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
def tokenize_function(examples):
    return tokenizer(
        examples['text'], 
        truncation=True, 
        padding="max_length",
        max_length=128
    )

In [10]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/79 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [11]:
tokenized_train_dataset

Dataset({
    features: ['labels', 'text', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 79
})

In [None]:
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['text','idx'])
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(['text','idx'])

In [None]:
tokenized_train_dataset.set_format('torch',columns=['input_ids','attention_mask','token_type_ids','labels'])
tokenized_eval_dataset.set_format('torch',columns=['input_ids','attention_mask','token_type_ids','labels'])



In [14]:
import evaluate
from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss #multi-label metrics
metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    sigmoid = np.vectorize(lambda x: 1/(1 + np.exp(-x)))
    predictions = (sigmoid(logits) >0.5).astype(int)
    
    f1_micro = f1_score(labels, predictions, average='macro', zero_division=0)
    precision_micro = precision_score(labels, predictions, average='micro')
    recall_micro = recall_score(labels,predictions,average='micro')
    
    f1_macro = f1_score(labels, predictions,average='macro', zero_division=0)
    precision_macro = precision_score(labels, predictions, average='macro', zero_division=0)
    recall_macro = recall_score(labels, predictions, average='macro', zero_division=0)
    
    h_loss = hamming_loss(labels, predictions)
    
    return {
        'f1_micro': f1_micro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_macro': f1_macro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'hamming_loss': h_loss
    }
    
    

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding


model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(all_labels),
    problem_type='multi_label_classification',
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1_micro',
    greater_is_better=True,
    push_to_hub=False,
    report_to='tensorboard'
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_eval_dataset,
    #tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

In [29]:
trainer.train()
results = trainer.evaluate()

Epoch,Training Loss,Validation Loss,F1 Micro,Precision Micro,Recall Micro,F1 Macro,Precision Macro,Recall Macro,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,No log,0.655893,0.052777,0.042857,0.2,0.052777,0.038708,0.15,0.316,0.2221,90.046,9.005
2,No log,0.649696,0.022222,0.030534,0.133333,0.022222,0.016381,0.105,0.306,0.2037,98.172,9.817
3,No log,0.640263,0.014286,0.026087,0.1,0.014286,0.014,0.085,0.278,0.2028,98.632,9.863
4,No log,0.628842,0.007619,0.018692,0.066667,0.007619,0.004,0.08,0.266,0.2059,97.123,9.712
5,No log,0.616644,0.007619,0.019231,0.066667,0.007619,0.004,0.08,0.26,0.2016,99.197,9.92
6,No log,0.60518,0.016508,0.042105,0.133333,0.016508,0.009,0.12,0.234,0.2301,86.912,8.691


In [30]:
results = trainer.evaluate()
print('Evaluation results:', results)

Evaluation results: {'eval_loss': 0.6558932065963745, 'eval_f1_micro': 0.05277684407096172, 'eval_precision_micro': 0.04285714285714286, 'eval_recall_micro': 0.2, 'eval_f1_macro': 0.05277684407096172, 'eval_precision_macro': 0.03870843776106934, 'eval_recall_macro': 0.15, 'eval_hamming_loss': 0.316, 'eval_runtime': 0.4385, 'eval_samples_per_second': 45.606, 'eval_steps_per_second': 4.561, 'epoch': 6.0}


In [19]:
model_save_path = './fine_tuned_star_wars_classifier'
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)


('./fine_tuned_star_wars_classifier/tokenizer_config.json',
 './fine_tuned_star_wars_classifier/special_tokens_map.json',
 './fine_tuned_star_wars_classifier/vocab.txt',
 './fine_tuned_star_wars_classifier/added_tokens.json',
 './fine_tuned_star_wars_classifier/tokenizer.json')

In [20]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

model_path = './fine_tuned_star_wars_classifier'
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForSequenceClassification.from_pretrained(model_path, problem_type='multi_label_classification')

classifier = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    top_k=None,
    
    max_length=512,
    truncation=True,
    padding = True
)



Device set to use cuda:0


In [35]:

text = 'bocatt, A tusked, leather-skinned predator found on Tatooine.'
prediction_results = classifier(text)

list_of_label_dicts = prediction_results[0]

predicted_scores_pipeline = []
predicted_labels_pipeline = []
for label_info in list_of_label_dicts:
    if label_info['score'] > 0.65:
        print(label_info['label'])
        print(label_info['score'])
    
    #predicted_labels_pipeline.append(label_info['label'])
    #predicted_scores_pipeline.append(label_info['score'])


Character
0.6502765417098999


In [59]:
def prediction(text):
    prediction_results = classifier(text)
    
    list_of_label_dicts = prediction_results[0]
    
    predicted_scores_pipeline = []
    predicted_labels_pipeline = []
    for label_info in list_of_label_dicts:
        if label_info['score'] > 0.65:
            predicted_labels_pipeline.append(label_info['label'])
            predicted_scores_pipeline.append(label_info['score'])
    
    return predicted_labels_pipeline, predicted_scores_pipeline

In [None]:
temp = test_df['text'].apply(prediction)

 

In [None]:
test_df[['predicted_labels','predicted_scores']] = pd.DataFrame(temp.tolist(),index=test_df.index).values

In [72]:
test_df

Unnamed: 0_level_0,text,predicted_labels,predicted_scores
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
455,"All Stars Burn as One, The official anthem of ...","[Rebel, Ship]","[0.7018588185310364, 0.6653783917427063]"
1534,"bocatt, A tusked, leather-skinned predator fou...",[Character],[0.6502765417098999]
3142,"Daughters of Allya, The name adopted by the Da...",[],[]
322,"Aidus, A Rattataki guard who served Asajj Vent...",[],[]
8710,"Insurrection, This branch of the Pentastar Ali...",[Rebel],[0.6791582703590393]
...,...,...,...
671,"Aora, Aruk Besadii, A corpulent Hutt on Nal Hu...",[Rebel],[0.674552321434021]
1852,"B’wuf, A senior technical analyst aboard the S...",[Rebel],[0.6703217625617981]
11510,"manadept, One of the many types of domesticate...",[],[]
52,"AAP blaster box, A series of strap-on laser we...",[Rebel],[0.6612579226493835]


In [74]:
test_df.to_csv('./predictions.csv')

In [None]:
from tqdm.auto import tqdm

# hook tqdm into pandas
tqdm.pandas()
starwars_df['classification'] = starwars_df['text'].progress_apply(prediction)