In [1]:
import torch
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import OneHotEncoder
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
#from transformers import AutoTokenizer, TFBertModel
from transformers import AutoTokenizer, BertModel,BertForSequenceClassification
from transformers import DistilBertForSequenceClassification,Trainer, TrainingArguments

2023-06-28 17:20:17.313458: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def data(filename="Doceree-HCP_Train.csv"):
    data =  pd.read_csv(filename,encoding='latin-1')
    if "TAXONOMY" in data.columns:
        data = data[~(data["TAXONOMY"].isna())].copy()
    data["text_info"] = (data["KEYWORDS"].astype(str)+" "+
                         data["USERCITY"].astype(str)+" "+
                         data["DEVICETYPE"].astype(str)+" "+
                         data["CHANNELTYPE"].astype(str)+" "+
                         data["USERAGENT"].astype(str)+" "+
                         data["URL"].astype(str)+" "+
                         data["PLATFORMTYPE"].astype(str)
                        )

    #data["TAXONOMY"].astype(str)
    return data

In [4]:
def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 512,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

In [6]:
train_data = data("Doceree-HCP_Train.csv")
test_data  = data("Doceree-HCP_Test.csv")
train_data['encoded_label'] = train_data['TAXONOMY'].astype('category').cat.codes

print(f"train size: {train_data.shape}")
print(f"test size: {test_data.shape}")

train_texts  = list(train_data["text_info"].values)
train_labels = [int(i) for i in train_data["encoded_label"].values]

train_labels = OneHotEncoder().fit_transform(np.array(train_labels).reshape(-1,1)).toarray()

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train size: (32313, 16)
test size: (28493, 13)


In [7]:
train_data['encoded_label'].unique().shape[0]

207

In [8]:
#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#model = BertModel.from_pretrained("bert-base-uncased")
train_token_id = []
train_attention_masks = []
for sample in train_texts:
    train_encoding_dict = preprocessing(sample, tokenizer)
    train_token_id.append(train_encoding_dict['input_ids']) 
    train_attention_masks.append(train_encoding_dict['attention_mask'])
train_token_id = torch.cat(train_token_id, dim = 0)
train_attention_masks = torch.cat(train_attention_masks, dim = 0)
train_labels = torch.tensor(train_labels)


val_token_id = []
val_attention_masks = []
for sample in val_texts:
    val_encoding_dict = preprocessing(sample, tokenizer)
    val_token_id.append(val_encoding_dict['input_ids']) 
    val_attention_masks.append(val_encoding_dict['attention_mask'])
val_token_id = torch.cat(val_token_id, dim = 0)
val_attention_masks = torch.cat(val_attention_masks, dim = 0)
val_labels = torch.tensor(val_labels)
# train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_)
# val_encodings   = tokenizer(val_texts, truncation=True, padding=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
#train_labels[0:10]

In [10]:
class HCPDataset(torch.utils.data.Dataset):
    def __init__(self, token_id, attention_masks, labels):
        self.token_id = token_id
        self.attention_masks = attention_masks
        self.labels = labels

    def __getitem__(self, idx):
        item={}
        item["input_ids"] = self.token_id[idx]
        item["attention_mask"] = self.attention_masks[idx]
        item['labels'] = self.labels[idx].to(torch.float32)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = HCPDataset(train_token_id, train_attention_masks, train_labels)
val_dataset = HCPDataset(val_token_id, val_attention_masks, val_labels)

In [11]:
#train_dataset.encodings.data["input_ids"][0]

In [12]:
#train_dataset.labels[1]

In [13]:
num_labels = train_data['encoded_label'].unique().shape[0]

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels, output_attentions= False, output_hidden_states = False,)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.

In [14]:
#model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
training_args = TrainingArguments(
    output_dir='./results_taxonomy',          
    num_train_epochs=1,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,
    evaluation_strategy="steps",
    # strength of weight decay
    #logging_dir='./logs1',           
    #logging_steps=1000,
    save_total_limit = 2,
    load_best_model_at_end=False,
    save_strategy = "no"
)

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset             
)

trainer.train()



Step,Training Loss,Validation Loss
500,0.2134,0.019071
1000,0.0191,0.017985
1500,0.0182,0.017375


TrainOutput(global_step=1616, training_loss=0.07886931226395144, metrics={'train_runtime': 839.363, 'train_samples_per_second': 30.797, 'train_steps_per_second': 1.925, 'total_flos': 3436801033881600.0, 'train_loss': 0.07886931226395144, 'epoch': 1.0})

In [15]:
trainer.evaluate()

{'eval_loss': 0.01728697493672371,
 'eval_runtime': 51.1213,
 'eval_samples_per_second': 126.425,
 'eval_steps_per_second': 1.976,
 'epoch': 1.0}

In [20]:
save_directory = "./Taxonomy_model"

In [21]:
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./Taxonomy_model/tokenizer_config.json',
 './Taxonomy_model/special_tokens_map.json',
 './Taxonomy_model/vocab.txt',
 './Taxonomy_model/added_tokens.json',
 './Taxonomy_model/tokenizer.json')

In [None]:
#model= DistilBertForSequenceClassification.from_pretrained(save_directory)
#tokenizer = DistilBertTokenizerFast.from_pretrained(save_directory)

In [22]:
#model.config

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#model.to(device)
test_texts =list(test_data["text_info"].values)
batch_size = 64


# We need Token IDs and Attention Mask for inference on the new sentence
test_output = []
for i in range(0,len(test_texts),batch_size):
    test_ids = []
    test_attention_mask = []
    for sample in test_texts[i:i+batch_size]:
        encoding = preprocessing(sample, tokenizer)
        test_ids.append(encoding['input_ids']) 
        test_attention_mask.append(encoding['attention_mask'])
    
   
    test_ids = torch.cat(test_ids, dim = 0)
    test_attention_mask = torch.cat(test_attention_mask, dim = 0)

    # Forward pass, calculate logit predictions
    with torch.no_grad():
        output = model(test_ids.to(device),attention_mask = test_attention_mask.to(device))
        test_output.append(output.logits.cpu().numpy())

In [24]:
test_predict = np.concatenate(test_output)

In [25]:
test_predict_f = np.argmax(test_predict, axis=1)

In [26]:
test_predict_f.shape

(28493,)

In [27]:
id2lbl = dict(zip(train_data['encoded_label'], train_data['TAXONOMY']))

In [30]:
test_predict_taxonomy = [id2lbl[i] for i in test_predict_f]

In [36]:
hpc_submission = pd.read_csv("Doceree-HCP_Submission.csv")

In [37]:
hpc_submission.head()

Unnamed: 0,ID,IS_HCP
0,115501,0
1,115502,1
2,115503,0
3,115504,0
4,115505,1


In [42]:
hpc_submission["TAXONOMY"] = test_predict_taxonomy

In [44]:
hpc_submission.to_csv("Doceree-TAXONOMY_Submission.csv", index=False)

In [46]:
hpc_submission.head()

Unnamed: 0,ID,IS_HCP,TAXONOMY
0,115501,0,2084P0800X
1,115502,1,2084P0800X
2,115503,0,2084P0800X
3,115504,0,2084N0400X
4,115505,1,2084P0800X
