In [None]:
import os
import urllib.request
import pandas as pd
import json
from zipfile import ZipFile


#Download dataset

In [None]:
#https://github.com/DARGMINTS/op-articles-arg-pt
dataset_url='https://raw.githubusercontent.com/DARGMINTS/op-articles-arg-pt/main/op_articles_arg_pt_corpus.json'
dataset_folder = os.path.join(os.getcwd(), 'dataset')
print("Current work directory: " + str(dataset_folder))

if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)

dataset_path = os.path.join(dataset_folder, 'op_articles_arg_pt_corpus.json')
if not os.path.exists(dataset_path):
    urllib.request.urlretrieve(dataset_url, dataset_path)

Current work directory: /content/dataset


In [None]:
df = pd.read_json(dataset_path)
print(df['argument_annotations'][0])
for i in df['argument_annotations'][0]:
  json_str = json.dumps(i, indent=4)
  #print(json_str)

[{'nodes': [{'id': 0, 'x': 66, 'y': 143, 'width': 202, 'height': 64, 'type': 'I', 'color': 'blue', 'scheme': 0, 'visible': True, 'ranges': [[2516, 2556]], 'label': 'Valor'}, {'id': 1, 'x': 100, 'y': 384, 'width': 191, 'height': 184, 'type': 'I', 'color': 'blue', 'scheme': 0, 'visible': True, 'ranges': [[2568, 2806]], 'label': 'Valor'}, {'id': 2, 'x': 151, 'y': 324, 'width': 56, 'height': 40, 'type': 'RA', 'color': 'green', 'scheme': 72, 'visible': True, 'ranges': [], 'label': 'nullADU'}, {'id': 3, 'x': 539, 'y': 96, 'width': 122, 'height': 64, 'type': 'I', 'color': 'blue', 'scheme': 0, 'visible': True, 'ranges': [[3169, 3190]], 'label': 'Valor'}, {'id': 4, 'x': 523, 'y': 295, 'width': 193, 'height': 104, 'type': 'I', 'color': 'blue', 'scheme': 0, 'visible': True, 'ranges': [[3198, 3285]], 'label': 'Valor'}, {'id': 5, 'x': 565, 'y': 219, 'width': 56, 'height': 40, 'type': 'RA', 'color': 'green', 'scheme': 72, 'visible': True, 'ranges': [], 'label': 'nullADU'}, {'id': 6, 'x': 622, 'y': 5

In [None]:
print(df)

                                      _id  \
0    {'$oid': '5d04a31b896a7fea069ef06f'}   
1    {'$oid': '5d04a3fc896a7fea069f0717'}   
2    {'$oid': '5d04a455896a7fea069f07ab'}   
3    {'$oid': '5d04a52f896a7fea069f0921'}   
4    {'$oid': '5d04a8d7896a7fea069f6997'}   
..                                    ...   
368  {'$oid': '5cee2df3896a7fea06c54a35'}   
369  {'$oid': '5ceee4c4896a7fea06cc3895'}   
370  {'$oid': '5cef7f74896a7fea06d223f7'}   
371  {'$oid': '5cefd3d4896a7fea06d57241'}   
372  {'$oid': '5cf4b764896a7fea06032673'}   

                                    authors  \
0                    [José Vítor Malheiros]   
1                         [Rui J. Baptista]   
2                           [Fernando Belo]   
3    [Hamad bin Khalifa bin Ahmad Al Thani]   
4                          [Carlos Nolasco]   
..                                      ...   
368                            [Nuno Sousa]   
369                 [Francisco Bethencourt]   
370                          [Nuno P

#Analyse dataset

In [None]:
ADU = {}
n_documents_1 = len(df['_id'])
print("Documents:",n_documents_1)

for i in range(n_documents_1):
  for j in range(len(df['argument_annotations'][i])):
    c = len(df['argument_annotations'][i][j]['nodes'])
    annId = df['argument_annotations'][i][j]['metadata']['annotatorId']
    ADU[annId] = ADU.get(annId,0) + c
print("ADU:",ADU)

links = {}
for i in range(n_documents_1):
  for j in range(len(df['argument_annotations'][i])):
    c = len(df['argument_annotations'][i][0]['edges'])
    annId = df['argument_annotations'][i][j]['metadata']['annotatorId']
    links[annId] = links.get(annId,0) + c
print("Links:",links)

Documents: 373
ADU: {'A': 5444, 'B': 8289, 'D': 6593, 'C': 6683}
Links: {'A': 4273, 'B': 4949, 'D': 5377, 'C': 5474}


In [None]:
labels = set()
data = df['argument_annotations']

for i in range(len(data)):
  for j in range(len(data[i])):
    data_list = data[i]
    labels.update(set(item['label'] for item in data_list[j]['nodes']))
print("Label types:", labels)
label_counts = {key:0 for key in labels}

num_labels = {key:label_counts.copy() for key in ["A","B","C","D"]}

for i in range(len(data)):
    for j in range(len(data[i])):
        annId = data[i][j]['metadata']['annotatorId']
        data_list = data[i][j]['nodes']
        for label in labels:
            count = sum(1 for item in data_list if item['label'] == label)
            num_labels[annId][label] = num_labels[annId].get(label,0) + count
for i in num_labels:
  print("Labels:",i,num_labels[i])


Label types: {'Facto', 'Valor(+)', 'nullADU', 'Diretiva', 'Valor', 'Valor(-)'}
Labels: A {'Facto': 647, 'Valor(+)': 183, 'nullADU': 2109, 'Diretiva': 56, 'Valor': 2059, 'Valor(-)': 390}
Labels: B {'Facto': 920, 'Valor(+)': 466, 'nullADU': 3063, 'Diretiva': 167, 'Valor': 2790, 'Valor(-)': 883}
Labels: C {'Facto': 386, 'Valor(+)': 481, 'nullADU': 2572, 'Diretiva': 265, 'Valor': 2006, 'Valor(-)': 973}
Labels: D {'Facto': 1710, 'Valor(+)': 281, 'nullADU': 2522, 'Diretiva': 179, 'Valor': 1247, 'Valor(-)': 654}


In [None]:
total_sum =0
for inner_dict in num_labels.values():
    for value in inner_dict.values():
        total_sum += value
print(total_sum)

27009


In [None]:
num_char = {key:[0,0,0,0,0] for key in ["A","B","C","D"]}

for i in range(len(data)):
  for j in range(len(data[i])):
    data_list = data[i][j]['nodes']
    annId = data[i][j]['metadata']['annotatorId']
    rmin = 9999
    rmax = 0
    for k  in range(len(data_list)):
      try:
        ranges = data_list[k]['ranges'][0]
      except:
        continue
      for r in range(len(data_list[k]['ranges'])):
        ranges = data_list[k]['ranges'][r]
        rmin = min(rmin, ranges[0])
        rmax = max(rmax, ranges[1])
        l= ranges[1] - ranges[0]
        num_char[annId][0] += l
        num_char[annId][4] += 1
        if num_char[annId][1] < l:
          num_char[annId][1] = l
        num_char[annId][2] += (rmax - rmin)
        num_char[annId][3] = max(num_char[annId][3], rmax - rmin)
print("\n ADU: \n")
for annId in ["A","B","C","D"]:
  print("Total Average:",annId,round(num_char[annId][0]/ num_char[annId][4],2), "\nTotal Max:",annId,num_char[annId][1])

print("\n Documents: \n")
for annId in ["A","B","C","D"]:
  print("Average:",annId,round(num_char[annId][2]/ num_char[annId][4],2), "\nMax:",annId,num_char[annId][3])



 ADU: 

Total Average: A 78.15 
Total Max: A 514
Total Average: B 90.86 
Total Max: B 478
Total Average: C 89.86 
Total Max: C 546
Total Average: D 91.72 
Total Max: D 478

 Documents: 

Average: A 2357.39 
Max: A 16428
Average: B 2484.84 
Max: B 18414
Average: C 2993.38 
Max: C 18156
Average: D 2227.13 
Max: D 12947


In [None]:
# Dictionary to keep track of edge counts
edge_counts = {}

# Iterate over documents
for i in range(len(data)):
    annotations = data[i]
    # Iterate over annotations
    for annotation in annotations:
        annotator_id = annotation["metadata"]["annotatorId"]
        edges = annotation["edges"]

        # Increment edge count for the corresponding annotator
        if annotator_id not in edge_counts:
            edge_counts[annotator_id] = 0
        edge_counts[annotator_id] += len(edges)

print("Edge Counts:")
print(edge_counts)

Edge Counts:
{'A': 4273, 'B': 6580, 'D': 5247, 'C': 5314}


In [None]:
# Dictionary to keep track of label pairs for each annotator
annotator_label_counts = {}

# Iterate over documents
for i in range(len(data)):
    annotations = data[i]

    # Iterate over annotations
    for annotation in annotations:
        annotator_id = annotation["metadata"]["annotatorId"]
        edges = annotation["edges"]
        nodes = annotation["nodes"]

        # Iterate over edges
        for edge in edges:
            from_id = edge["from"]["id"]
            to_id = edge["to"]["id"]

            # Find nodes in 'nodes' by id
            from_node = next((node for node in nodes if node["id"] == from_id), None)
            to_node = next((node for node in nodes if node["id"] == to_id), None)

            if from_node and to_node:
                from_label = from_node["label"]
                to_label = to_node["label"]

                # Increment the count for the label pair for the corresponding annotator
                if annotator_id not in annotator_label_counts:
                    annotator_label_counts[annotator_id] = {}

                if (from_label, to_label) not in annotator_label_counts[annotator_id]:
                    annotator_label_counts[annotator_id][(from_label, to_label)] = 0

                annotator_label_counts[annotator_id][(from_label, to_label)] += 1

print("Label Counts by Annotator:")
for i in annotator_label_counts:
  print("Links:",i,annotator_label_counts[i])

Label Counts by Annotator:
Links: A {('Valor', 'nullADU'): 1258, ('nullADU', 'Valor'): 1412, ('Facto', 'nullADU'): 554, ('nullADU', 'Facto'): 195, ('Valor(-)', 'nullADU'): 242, ('Valor(+)', 'nullADU'): 103, ('nullADU', 'Valor(+)'): 155, ('nullADU', 'Valor(-)'): 281, ('nullADU', 'Diretiva'): 66, ('Diretiva', 'nullADU'): 7}
Links: B {('Valor(-)', 'nullADU'): 568, ('nullADU', 'Valor(-)'): 619, ('Valor', 'nullADU'): 1830, ('nullADU', 'Valor'): 1660, ('nullADU', 'Valor(+)'): 331, ('Facto', 'nullADU'): 800, ('nullADU', 'Facto'): 285, ('Valor(+)', 'nullADU'): 279, ('nullADU', 'Diretiva'): 168, ('Diretiva', 'nullADU'): 40}
Links: D {('Facto', 'nullADU'): 1732, ('nullADU', 'Valor'): 1165, ('Valor(-)', 'nullADU'): 366, ('nullADU', 'Valor(-)'): 559, ('Valor', 'nullADU'): 414, ('Valor(+)', 'nullADU'): 150, ('nullADU', 'Valor(+)'): 232, ('nullADU', 'Facto'): 401, ('nullADU', 'Diretiva'): 165, ('Diretiva', 'nullADU'): 63}
Links: C {('Valor', 'nullADU'): 1388, ('nullADU', 'Valor'): 1154, ('Facto', 'n

#Download article text

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
from tqdm import tqdm

# Function to extract article text from a webpage given its URL
def extract_article_text(row):
    url = row['url_canonical']
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    article_element = soup.find("div", class_="story__body")
    title_element = soup.find("h1", class_="headline")
    subtitle_element = soup.find("div", class_="story__blurb")
    article_text = article_element.get_text() if article_element else None
    title_text = len(title_element.get_text()) if title_element else None
    subtitle_text = len(subtitle_element.get_text()) if subtitle_element else None

    row['article_text'] = article_text
    row['title_text'] = title_text
    row['subtitle_text'] = subtitle_text

    return row

# Number of threads to use
num_threads = 8

# Split the dataframe into smaller chunks for parallel processing
chunks = np.array_split(df, num_threads)

# Create a ThreadPoolExecutor
executor = ThreadPoolExecutor(max_workers=num_threads)

# List to store the futures
futures = []

# Submit the tasks to the executor
for chunk in chunks:
    future = executor.submit(chunk.apply, extract_article_text, axis=1)
    futures.append(future)

# Wait for all the tasks to complete and display a progress bar
with tqdm(total=len(futures)) as pbar:
    for future in as_completed(futures):
        result = future.result()
        pbar.update(1)

# Concatenate the results back into a single DataFrame
df = pd.concat([future.result() for future in futures])

# Now the DataFrame 'df' will have three additional columns: 'article_text', 'title_text', and 'subtitle_text'


100%|██████████| 8/8 [01:03<00:00,  7.92s/it]


In [None]:
pd.set_option('display.max_colwidth', 1000)
print(df)

                                      _id  \
0    {'$oid': '5d04a31b896a7fea069ef06f'}   
1    {'$oid': '5d04a3fc896a7fea069f0717'}   
2    {'$oid': '5d04a455896a7fea069f07ab'}   
3    {'$oid': '5d04a52f896a7fea069f0921'}   
4    {'$oid': '5d04a8d7896a7fea069f6997'}   
..                                    ...   
368  {'$oid': '5cee2df3896a7fea06c54a35'}   
369  {'$oid': '5ceee4c4896a7fea06cc3895'}   
370  {'$oid': '5cef7f74896a7fea06d223f7'}   
371  {'$oid': '5cefd3d4896a7fea06d57241'}   
372  {'$oid': '5cf4b764896a7fea06032673'}   

                                    authors  \
0                    [José Vítor Malheiros]   
1                         [Rui J. Baptista]   
2                           [Fernando Belo]   
3    [Hamad bin Khalifa bin Ahmad Al Thani]   
4                          [Carlos Nolasco]   
..                                      ...   
368                            [Nuno Sousa]   
369                 [Francisco Bethencourt]   
370                          [Nuno P

In [None]:
import pandas as pd
import numpy as np
# Initialize an empty list to store the output dictionaries
output_list = []

for index, row in df.iterrows():
    annotations = row['argument_annotations']

    # Iterate over each annotation in the row
    for annotation in annotations:
        nodes = annotation['nodes']
        for node in nodes:
            ranges = node['ranges']
            label = node['label']
            annId = annotation['metadata']['annotatorId']
            #step = row['title_text'] + row['subtitle_text']
            #if np.isnan(step):
            #  step=0
            # Iterate over each range in the node
            for start, end in ranges:
                #segment = row['article_text'][int(start-step):int(end-step)]
                segment = row['article_text'][start:end]
                # Create a dictionary with the output values
                output = {
                    'annotatorId': annId,
                    'Label': label,
                    'Segment': segment
                }

                # Append the output dictionary to the list
                output_list.append(output)
# Create the output dataframe using pd.DataFrame
all_sets = pd.DataFrame(output_list)

# Print the output dataframe
print(all_sets)


      annotatorId     Label  \
0               A     Valor   
1               A     Valor   
2               A     Valor   
3               A     Valor   
4               A     Valor   
...           ...       ...   
16893           D     Valor   
16894           D     Facto   
16895           D  Valor(-)   
16896           D     Valor   
16897           D     Facto   

                                                                                                                                                                                                                                              Segment  
0                                                                                                                                                                                                            e jornalísticas. O que é espantoso é que  
1      singularmente objectiva. O grosso do “humor” era apenas uma colagem inteligente de notícias sobre a FIFA. O humor nascia do 

#Create sets

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Identifica i tipi di annotatori unici nel dataframe
annotator_types = all_sets['annotatorId'].unique()

# Filtra i dataframe per i tipi di annotatori specifici
annotator_d_df = all_sets[all_sets['annotatorId'] == 'D']
annotator_b_df = all_sets[all_sets['annotatorId'] == 'B']
annotators_ad_df = all_sets[all_sets['annotatorId'].isin(['A', 'D'])]

# Crea i set di addestramento, validazione e test per l'annotatore D
train_D_df, temp_d_df = train_test_split(annotator_d_df, test_size=0.3, random_state=42)
val_d_df, test_d_df = train_test_split(temp_d_df, test_size=0.5, random_state=42)

train_B_df, temp_b_df = train_test_split(annotator_b_df, test_size=0.3, random_state=42)
val_b_df, test_b_df = train_test_split(temp_b_df, test_size=0.5, random_state=42)
# Crea i set di addestramento, validazione e test per gli annotatori A e D insieme
train_ad_df, temp_ad_df = train_test_split(annotators_ad_df, test_size=0.3, random_state=42)
val_ad_df, test_ad_df = train_test_split(temp_ad_df, test_size=0.5, random_state=42)

# Crea i set di addestramento, validazione e test per tutti gli annotatori insieme
train_all_df, temp_all_df = train_test_split(all_sets, test_size=0.3, random_state=42)
val_all_df, test_all_df = train_test_split(temp_all_df, test_size=0.5, random_state=42)

# Salva i set di addestramento, validazione e test in nuovi dataframe o file
train_D_df.to_csv('train_D.csv', index=False)
val_d_df.to_csv('validation_D.csv', index=False)
test_d_df.to_csv('test_D.csv', index=False)

train_B_df.to_csv('train_B.csv', index=False)
val_b_df.to_csv('validation_B.csv', index=False)
test_b_df.to_csv('test_B.csv', index=False)

train_ad_df.to_csv('train_A_D.csv', index=False)
val_ad_df.to_csv('validation_A_D.csv', index=False)
test_ad_df.to_csv('test_A_D.csv', index=False)

train_all_df.to_csv('train_all.csv', index=False)
val_all_df.to_csv('validation_all.csv', index=False)
test_all_df.to_csv('test_all.csv', index=False)


In [None]:
from google.colab import drive

drive.mount('/content/drive')

directory_path = '/content/drive/MyDrive/pw_nlp/'

# Save the dataframes to the specified directory
train_D_df.to_csv(directory_path + 'train_D.csv', index=False)
val_d_df.to_csv(directory_path + 'validation_D.csv', index=False)
test_d_df.to_csv(directory_path + 'test_D.csv', index=False)

train_B_df.to_csv(directory_path + 'train_B.csv', index=False)
val_b_df.to_csv(directory_path + 'validation_B.csv', index=False)
test_b_df.to_csv(directory_path + 'test_B.csv', index=False)

train_ad_df.to_csv(directory_path + 'train_A_D.csv', index=False)
val_ad_df.to_csv(directory_path + 'validation_A_D.csv', index=False)
test_ad_df.to_csv(directory_path + 'test_A_D.csv', index=False)

train_all_df.to_csv(directory_path + 'train_all.csv', index=False)
val_all_df.to_csv(directory_path + 'validation_all.csv', index=False)
test_all_df.to_csv(directory_path + 'test_all.csv', index=False)


Mounted at /content/drive


#Train model

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m108.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [None]:
import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset


In [None]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')
directory_path = '/content/drive/MyDrive/pw_nlp/'

# Load the dataframes
train_D_df = pd.read_csv(directory_path + 'train_D.csv')
val_d_df = pd.read_csv(directory_path + 'validation_D.csv')
test_d_df = pd.read_csv(directory_path + 'test_D.csv')

train_B_df = pd.read_csv(directory_path + 'train_B.csv')
val_b_df = pd.read_csv(directory_path + 'validation_B.csv')
test_b_df = pd.read_csv(directory_path + 'test_B.csv')

train_ad_df = pd.read_csv(directory_path + 'train_A_D.csv')
val_ad_df = pd.read_csv(directory_path + 'validation_A_D.csv')
test_ad_df = pd.read_csv(directory_path + 'test_A_D.csv')

train_all_df = pd.read_csv(directory_path + 'train_all.csv')
val_all_df = pd.read_csv(directory_path + 'validation_all.csv')
test_all_df = pd.read_csv(directory_path + 'test_all.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
labels = list(labels)
print(labels)

['Facto', 'Valor(+)', 'nullADU', 'Diretiva', 'Valor', 'Valor(-)']


In [None]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = self.data['Segment']
        self.targets = self.data['Label']
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = str(self.text[index])
        target = self.targets[index]

        if target in labels:
            target = labels.index(target)
        else:
            return None  # Skip instances with other labels

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }


In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 16
epochs = 5
learning_rate = 2e-5
print(device)
loss_fn = torch.nn.CrossEntropyLoss().to(device)

cuda


In [None]:
def train(model_name,train_set, val_set):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    train_Af = pd.read_csv(train_set)
    val_df = pd.read_csv(val_set)

    train_Aataset = CustomDataset(train_Af, tokenizer, max_length=512)
    val_dataset = CustomDataset(val_df, tokenizer, max_length=512)

    train_loader = DataLoader(train_Aataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    best_val_loss = float('inf')  # Initialize with a large value
    best_model_path = model_name + '.pth'  # File path to save the best model

    # Training loop
    print('Start training...')
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        average_loss = total_loss / len(train_loader)
        # Training progress...

        # Evaluation
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                targets = batch['targets'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
                loss = outputs.loss
                val_loss += loss.item()

                _, predicted = torch.max(outputs.logits, dim=1)
                val_total += targets.size(0)
                val_correct += (predicted == targets).sum().item()

        val_accuracy = val_correct / val_total
        val_avg_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {average_loss:.4f} - Validation Loss: {val_avg_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")

        # Check if current validation loss is better than the previous best
        if val_avg_loss < best_val_loss:
            best_val_loss = val_avg_loss
            torch.save(model.state_dict(), best_model_path)
            print("Saved the best model!")

    print("Training completed.")

In [None]:
def test(model_name,test_set):
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)
    model.load_state_dict(torch.load(model_name+'.pth'))
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    model.eval()
    test_loss = 0
    test_correct = 0
    test_total = 0

    test_df = pd.read_csv(test_set)
    test_dataset = CustomDataset(test_df, tokenizer, max_length=512)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            test_loss += loss.item()

            _, predicted = torch.max(outputs.logits, dim=1)
            test_total += targets.size(0)
            test_correct += (predicted == targets).sum().item()

    test_accuracy = test_correct / test_total
    test_avg_loss = test_loss / len(test_loader)
    print(f"Test Loss: {test_avg_loss:.4f} - Test Accuracy: {test_accuracy:.4f}")

In [None]:
train('onlyD','train_D.csv','validation_D.csv')

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Start training...
Epoch 1/5 - Training Loss: 1.3912 - Validation Loss: 1.3495 - Validation Accuracy: 0.4198
Saved the best model!
Epoch 2/5 - Training Loss: 1.3385 - Validation Loss: 1.3398 - Validation Accuracy: 0.4506
Saved the best model!
Epoch 3/5 - Training Loss: 1.2728 - Validation Loss: 1.3428 - Validation Accuracy: 0.4554
Epoch 4/5 - Training Loss: 1.1384 - Validation Loss: 1.4364 - Validation Accuracy: 0.4392
Epoch 5/5 - Training Loss: 0.8829 - Validation Loss: 1.6958 - Validation Accuracy: 0.3793
Training completed.


In [None]:
test('onlyD','test_D.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Test Loss: 1.3037 - Test Accuracy: 0.4595


In [None]:
train('onlyB','train_B.csv','validation_B.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Start training...
Epoch 1/5 - Training Loss: 1.2818 - Validation Loss: 1.2871 - Validation Accuracy: 0.5359
Saved the best model!
Epoch 2/5 - Training Loss: 1.2032 - Validation Loss: 1.2470 - Validation Accuracy: 0.5385
Saved the best model!
Epoch 3/5 - Training Loss: 1.1329 - Validation Loss: 1.2431 - Validation Accuracy: 0.5435
Saved the best model!
Epoch 4/5 - Training Loss: 0.9635 - Validation Loss: 1.3230 - Validation Accuracy: 0.5170
Epoch 5/5 - Training Loss: 0.7613 - Validation Loss: 1.4283 - Validation Accuracy: 0.5183
Training completed.


In [None]:
test('onlyB','test_B.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Test Loss: 1.2264 - Test Accuracy: 0.5498


In [None]:
test('onlyD','test_B.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Test Loss: 1.4389 - Test Accuracy: 0.3619


In [None]:
train('onlyAD','train_A_D.csv','validation_A_D.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Start training...
Epoch 1/5 - Training Loss: 1.3156 - Validation Loss: 1.2692 - Validation Accuracy: 0.4728
Saved the best model!
Epoch 2/5 - Training Loss: 1.2434 - Validation Loss: 1.2569 - Validation Accuracy: 0.4692
Saved the best model!
Epoch 3/5 - Training Loss: 1.1573 - Validation Loss: 1.2734 - Validation Accuracy: 0.4880
Epoch 4/5 - Training Loss: 0.9896 - Validation Loss: 1.3588 - Validation Accuracy: 0.4612
Epoch 5/5 - Training Loss: 0.7817 - Validation Loss: 1.5156 - Validation Accuracy: 0.4389
Training completed.


In [None]:
test('onlyAD','test_D.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Test Loss: 1.2764 - Test Accuracy: 0.4676


In [None]:
test('onlyAD','test_B.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Test Loss: 1.2708 - Test Accuracy: 0.5233


In [None]:
test('onlyAD','test_A_D.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Test Loss: 1.2577 - Test Accuracy: 0.5076


In [None]:
train('all','train_all.csv','validation_all.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Start training...
Epoch 1/5 - Training Loss: 1.3192 - Validation Loss: 1.3285 - Validation Accuracy: 0.4769
Saved the best model!
Epoch 2/5 - Training Loss: 1.2557 - Validation Loss: 1.2938 - Validation Accuracy: 0.4734
Saved the best model!
Epoch 3/5 - Training Loss: 1.1483 - Validation Loss: 1.2792 - Validation Accuracy: 0.4848
Saved the best model!
Epoch 4/5 - Training Loss: 0.9734 - Validation Loss: 1.3176 - Validation Accuracy: 0.4856
Epoch 5/5 - Training Loss: 0.8025 - Validation Loss: 1.4195 - Validation Accuracy: 0.4935
Training completed.


In [None]:
test('all','test_D.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Test Loss: 1.1954 - Test Accuracy: 0.4854


In [None]:
test('all','test_all.csv')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.

Test Loss: 1.2853 - Test Accuracy: 0.4899
