In [2]:
import pandas as pd

import random
import numpy as np

import torch

from sklearn.preprocessing import MultiLabelBinarizer

import ast

from trainer import Trainer, config

In [3]:
# Read the train and validation splits with identical options; the first CSV
# column becomes the row index of each frame.
train_df, valid_df = (
    pd.read_csv(csv_path, index_col=0, encoding='utf8')
    for csv_path in ('./data/CTP10/train.csv', './data/CTP10/validation.csv')
)
train_df.head()

Unnamed: 0_level_0,time_limit,memory_limit,description,tags,language
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1415/B,1.0,256.0,a street $$$ n $$$ house a line number $$$ 1 $...,"['bruteforce', 'bruteforce', 'greedy']",en
103/A,2.0,256.0,average miner vaganych take refresher course s...,"['greedy', 'implementation', 'math']",en
93/A,2.0,256.0,throughout igor k. 's life many situations wor...,['implementation'],en
31/A,2.0,256.0,professor vasechkin study evolution worm recen...,['implementation'],en
913/A,1.0,256.0,follow problem well - known : give integers n ...,"['implementation', 'math']",en


In [4]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed unchanged to ``random.seed``, ``np.random.seed``
            and ``torch.manual_seed``; also to ``torch.cuda.manual_seed_all``
            when CUDA is reported as available.

    Returns:
        None.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The CUDA generators are only touched when a CUDA device is usable.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# Apply the seed stored under config['seed'] to every RNG that set_seed handles.
set_seed(config['seed'])

In [5]:
# Keep only the training rows that carry at least one tag listed in
# config['tags'], and record for each kept row just those tags (original
# order preserved, repeated tags kept as they appear).
allowed_tags = config['tags']

new_train_idx = []         # index labels of the rows that are kept
selected_train_tags = []   # filtered tag list per kept row, aligned with new_train_idx

# Walk the 'tags' column once instead of looking up train_df.loc[index]['tags']
# for every row: the per-row lookup materialises a whole row each time and
# returns a Series (which ast.literal_eval rejects) when an index label repeats.
for index, raw_tags in train_df['tags'].items():
    # Each cell holds the tag list as a Python-literal string; parse it first.
    kept_tags = [tag for tag in ast.literal_eval(raw_tags) if tag in allowed_tags]

    # Rows without any allowed tag are dropped from the training set.
    if kept_tags:
        selected_train_tags.append(kept_tags)
        new_train_idx.append(index)

print(len(new_train_idx))

974


In [6]:
# Keep only the validation rows that carry at least one tag listed in
# config['tags'], and record for each kept row just those tags (original
# order preserved, repeated tags kept as they appear).
allowed_tags = config['tags']

new_valid_idx = []         # index labels of the rows that are kept
selected_valid_tags = []   # filtered tag list per kept row, aligned with new_valid_idx

# Walk the 'tags' column once instead of looking up valid_df.loc[index]['tags']
# for every row: the per-row lookup materialises a whole row each time and
# returns a Series (which ast.literal_eval rejects) when an index label repeats.
for index, raw_tags in valid_df['tags'].items():
    # Each cell holds the tag list as a Python-literal string; parse it first.
    kept_tags = [tag for tag in ast.literal_eval(raw_tags) if tag in allowed_tags]

    # Rows without any allowed tag are dropped from the validation set.
    if kept_tags:
        selected_valid_tags.append(kept_tags)
        new_valid_idx.append(index)

print(len(new_valid_idx))

325


In [7]:
# Narrow each split to the selected index labels and overwrite its 'tags'
# column with the filtered tag lists collected for those same rows.
train_df = train_df.loc[new_train_idx].assign(tags=selected_train_tags)
valid_df = valid_df.loc[new_valid_idx].assign(tags=selected_valid_tags)

In [8]:
# Model inputs are the problem descriptions; targets are the per-row tag lists.
X_train, X_valid = train_df['description'], valid_df['description']
y_tags_train, y_tags_valid = train_df['tags'], valid_df['tags']

In [9]:
# Turn the per-row tag lists into a binary indicator matrix.
#
# The encoder reads from the DataFrame columns rather than from y_tags_train /
# y_tags_valid because this cell overwrites those two names with its own
# output: fed from them, a second run of the cell would fit the encoder on the
# already-binarized matrix instead of on the tag lists.
tag_label_encoder = MultiLabelBinarizer()

# The label set is learned from the training split only; the validation split
# is encoded with that same fitted encoder.
y_tags_train = tag_label_encoder.fit_transform(train_df['tags'])
y_tags_valid = tag_label_encoder.transform(valid_df['tags'])

In [10]:
def tokenizing(tokenizer, data, max_length):
    """Run ``tokenizer`` over every entry of ``data`` in a single call.

    Args:
        tokenizer: Callable taking a list of texts plus the keyword options
            ``padding``, ``truncation``, ``return_tensors`` and ``max_length``
            (presumably a Hugging Face tokenizer -- confirm against the
            trainer config).
        data: Object exposing ``.values`` (the notebook passes a pandas
            Series of description strings).
        max_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    texts = list(data.values)
    encode_options = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        'max_length': max_length,
    }
    return tokenizer(texts, **encode_options)

def convert_to_tensor(data, dtype):
    """Build a ``torch.Tensor`` of the requested dtype from ``data``.

    Args:
        data: Anything ``torch.tensor`` accepts (the notebook passes the
            binarized label arrays).
        dtype: Target ``torch.dtype`` of the resulting tensor.

    Returns:
        The tensor produced by ``torch.tensor(data, dtype=dtype)``.
    """
    return torch.tensor(data, dtype=dtype)

In [11]:
# Tokenizer and model are both read from the `config` mapping imported from `trainer`.
tokenizer, model = config['tokenizer'], config['model']

In [12]:
# Tokenize each split's descriptions; the two splits use separate
# maximum-length settings from the config.
tokenized_inputs_train = tokenizing(
    tokenizer=tokenizer,
    data=X_train,
    max_length=config['trainMaxLength'],
)
tokenized_inputs_valid = tokenizing(
    tokenizer=tokenizer,
    data=X_valid,
    max_length=config['validMaxLength'],
)

In [13]:
# Convert both binarized label matrices to float tensors.
tags_labels_train, tags_labels_valid = (
    convert_to_tensor(label_matrix, dtype=torch.float)
    for label_matrix in (y_tags_train, y_tags_valid)
)

In [14]:
# Build the trainer. Arguments are passed positionally, in this order: model,
# fitted tag label encoder, tokenized train inputs, tokenized validation
# inputs, train label tensor, validation label tensor.
# NOTE(review): Trainer is imported from the `trainer` module, so its parameter
# names and how it uses each argument are not visible here -- confirm the order
# against its definition.
trainer = Trainer(model,
                tag_label_encoder,
                tokenized_inputs_train,
                tokenized_inputs_valid,
                tags_labels_train,
                tags_labels_valid,
                )

In [15]:
# Start training. The recorded output below shows per-epoch progress bars,
# train/validation loss, and validation tag metrics (accuracy, F1, ROC AUC);
# the logged epochs each took well over an hour.
trainer.train()

100%|██████████| 243/243 [1:26:51<00:00, 21.45s/it]
100%|██████████| 82/82 [01:49<00:00,  1.34s/it]


Epoch 1/100, Train Loss: 0.0294, Valid Loss: 0.1117
tag acc Max Score in this epoch: 0.12615384615384614
tag valid Max F1 Score(macro) per class in this epoch: 0.3012989902488488
tag valid Max F1 Score(micro) per class in this epoch: 0.4228110599078341
tag valid Max F1 Score(weighted) per class in this epoch: 0.3973457965486768
tag valid Max F1 Score(samples) per class in this epoch: 0.4023785103785104

tag valid Max roc_auc_score avg in this epoch: 0.5526118637415185
bruteforce : 0.5344926291684556
constructivealgorithms : 0.6231997905210787
datastructures : 0.6247412008281573
dfsandsimilar : 0.4333596214511041
dp : 0.4617146128524402
geometry : 0.4810801513587891
greedy : 0.6238899147102788
implementation : 0.5867390133148619
math : 0.6176953540036705
strings : 0.5392063492063492

tag acc Max Score: 0 at 0epochs
tag valid Max F1 Score(macro) per class: 0 at 0epochs
tag valid Max F1 Score(micro) per class: 0 at 0epochs
tag valid Max F1 Score(weighted) per class: 0 at 0epochs
tag valid

100%|██████████| 243/243 [1:33:21<00:00, 23.05s/it]
100%|██████████| 82/82 [03:25<00:00,  2.50s/it]
 74%|███████▍  | 180/243 [1:13:40<21:25, 20.40s/it]