In [1]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
from PyPDF2 import PdfReader
import os

# Path to the dataset folder in Kaggle
dataset_folder = '/kaggle/input/kdsh-datasetheadx/KDSH Dataset'

# List all PDF files in the folder
pdf_files = [file for file in os.listdir(dataset_folder) if file.endswith('.pdf')]
ds_unclassified = [file for file in os.listdir(dataset_folder) if file.endswith('.pdf') and file.startswith('P')]
ds_publishnot = [file for file in pdf_files if 'R001' <= file <= 'R006']
ds_publish = [file for file in pdf_files if 'R006' <= file <= 'R016']

In [3]:
print(ds_publishnot)
print(ds_publish)

['R003.pdf', 'R005.pdf', 'R002.pdf', 'R004.pdf', 'R001.pdf']
['R015.pdf', 'R010.pdf', 'R012.pdf', 'R008.pdf', 'R011.pdf', 'R007.pdf', 'R009.pdf', 'R013.pdf', 'R014.pdf', 'R006.pdf']


In [4]:
import pandas as pd
import re
import os
from PyPDF2 import PdfReader

# Initialize a list to store the extracted data
data = []

# Extract required information from each PDF
for pdf_file in pdf_files:
    file_path = os.path.join(dataset_folder, pdf_file)
    reader = PdfReader(file_path)
    
    # Extract text from all pages of the PDF
    full_text = ""
    for page in reader.pages:
        full_text += page.extract_text()
    
    # Extract information
    pdf_name = pdf_file
    title = full_text.split('\n')[0]  # Assume the title is the first line
    abstract_start = full_text.lower().find("abstract")
    intro_start = full_text.lower().find("1 introduction")
    abstract = full_text[abstract_start:intro_start].strip() if abstract_start != -1 and intro_start != -1 else "N/A"
    
    # Define the pattern for Section 2
    pattern2 = r"2\s[A-Z][a-zA-Z\s]+\n"
    match = re.search(pattern2, full_text)
    introend_search = match.start() if match else -1
    introduction_end = introend_search
    introduction = full_text[intro_start:introduction_end].strip() if intro_start != -1 else "N/A"
    
    # Search for Section 2
    section2_start = introduction_end
    pattern3 = r"3\s[A-Z][a-zA-Z\s]+\n"
    match3 = re.search(pattern3, full_text)
    section2end_search = match3.start() if match3 else -1
    section2_end = section2end_search
    section2 = full_text[section2_start:section2_end].strip() if section2_start != -1 else "N/A"

    # Search for Section 3
    section3_start = section2_end
    pattern4 = r"4\s[A-Z][a-zA-Z\s]+\n"
    match4 = re.search(pattern4, full_text)
    section3end_search = match4.start() if match4 else -1
    section3_end = section3end_search
    section3 = full_text[section3_start:section3_end].strip() if section3_start != -1 else "N/A"
    
    # Extract the rest of the paper
    rest_of_paper = full_text[section3_end:].strip() if section3_end != -1 else "N/A"

    # Append the extracted information to the list
    data.append([pdf_name, title, abstract, introduction, section2, section3, rest_of_paper])

# Create a DataFrame
columns = ["PDF Name", "Title of Paper", "Abstract", "Introduction", "Section 2", "Section 3", "Rest of the Paper"]
df = pd.DataFrame(data, columns=columns)
df = df.replace(["N/A", ""], "nill")
# Display the dataset
print(df)


     PDF Name                                     Title of Paper  \
0    P063.pdf  Representation Transferability in Neural Networks   
1    P038.pdf  Utilizing Graph Neural Networks to Analyze Esp...   
2    P119.pdf  Entropy Dynamics in Turbulent Flumplenook Systems   
3    P071.pdf  The Significance of Fillers in Textual Represe...   
4    P020.pdf  Deep Learning for 3D Protein Structure Predict...   
..        ...                                                ...   
145  P050.pdf  Interpreting Recurrent and Attention-Based Neural   
146  P134.pdf     Unraveling the Enigmatic Parallels Between DNA   
147  P101.pdf          A Convolutional LSTM Network Approach for   
148  P073.pdf  Exploring Soil Dynamics through a Multidiscipl...   
149  P096.pdf  Volcanic Eruptions in Relation to Quiche Recip...   

                                              Abstract  \
0    Abstract\nDeep neural networks, which are buil...   
1    Abstract\nGraph Neural Networks (GNNs) for Pre...   
2    Abst

In [5]:
df.tail(20)

Unnamed: 0,PDF Name,Title of Paper,Abstract,Introduction,Section 2,Section 3,Rest of the Paper
130,P032.pdf,Exploring the Transcendental Nexus of Water and,Abstract\nThe aquatic nuances of water travers...,1 Introduction\nIn order to fully grasp the im...,2 Related Work\nThe notion of water as a fluid...,3 Methodology\nThe investigation of water nece...,4 Experiments\nThe initialization of our resea...
131,P088.pdf,Analyzing Groups of Neurons in Neural Networks...,"Abstract\nThe concept of a ""modular"" structure...","1 Introduction\nModularity, a principle where ...",2 Related Work\nThe investigation of modularit...,3 Quantifying modularity by clustering similar...,4 Experiments\n4.1 Setup and initial hypothese...
132,P111.pdf,Leveraging Deep Learning for Enhanced Bayesian...,Abstract\nBayesian optimization (BO) is a wide...,1 Introduction\nBayesian optimization (BO) is ...,2 Related Work\nSeveral methods have been deve...,3 Methodology\n3.1 Bayesian Optimization Prere...,"4 Auxiliary Information\nTypically, we assume ..."
133,P039.pdf,RAG Optimization via Galactic Kitten Dynamics and,Abstract\nInvestigating RAG necessitates scrut...,1 Introduction\nRAG is a phenomenon that has b...,2 Related Work\nThe inherent properties of gal...,3 Methodology\nIn order to facilitate a compre...,4 Experiments\nIn order to facilitate a compre...
134,P102.pdf,A Large-Scale Car Dataset for Fine-Grained,Abstract\nThis paper aims to highlight vision ...,1 Introduction\nCars represent a revolution in...,2 Related Work\nMost previous car model resear...,3 Properties of CompCars\nThe CompCars dataset...,"4 Applications\nIn this section, we study thre..."
135,P007.pdf,Joint Syntacto-Discourse Parsing and the,Abstract\nDiscourse parsing has long been trea...,1 Introduction\nDistinguishing the semantic re...,nill,3 Recurrent Neural Models and Training\nThe sc...,4 Experiments\nWe use the treebank described i...
136,P085.pdf,Privacy Evaluation in Tabular Synthetic Data:,Abstract\nThis paper examines the present meth...,1 Introduction and Relation to Prior Research\...,2 Definitions and Notation\nTo the best of our...,3 Synthetic Data Privacy Risks\nThree signific...,4 Mathematical Privacy Properties\n4.1 Differe...
137,P053.pdf,Microprocessor Architectures and their Interse...,Abstract\nMicroprocessors have been profoundly...,1 Introduction\nThe intersection of microproce...,2 Related Work\nThe advent of microprocessor t...,3 Methodology\nThe elucidation of microprocess...,4 Experiments\nThe experimental design for thi...
138,P036.pdf,Profound Impact on Gravity on the Surface of a,Abstract\nThe study of gravity necessitates a ...,1 Introduction\nThe complexity of gravity and ...,2 Related Work\nThe concept of gravity has bee...,3 Methodology\nTo initiate our inquiry into th...,4 Experiments\nThe notion of gravity was first...
139,P046.pdf,Symbiotic Adversarial Robustness for Graph Neural,Abstract\nDeep learning models are known to be...,1 Introduction\nGraph neural networks (GNNs) a...,2 Preliminaries\nNotation. We denote a graph b...,3 Symbiotic Attacks\nThe Symbiotic Objective. ...,4 Evaluation\n4.1 Setup\nWe compare the symbio...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PDF Name           150 non-null    object
 1   Title of Paper     150 non-null    object
 2   Abstract           150 non-null    object
 3   Introduction       150 non-null    object
 4   Section 2          150 non-null    object
 5   Section 3          150 non-null    object
 6   Rest of the Paper  150 non-null    object
dtypes: object(7)
memory usage: 8.3+ KB


In [7]:
print(df["Introduction"])

0      1 Introduction\nDeep networks, constructed wit...
1      1 Introduction\nThe realm of Graph Neural Netw...
2      1 Introduction\nThe notion of entropy, a conce...
3      1 Introduction\nThis paper addresses the criti...
4      1 Introduction\nThe prediction of 3D protein s...
                             ...                        
145    1 Introduction\nDeep learning has achieved tre...
146    1 Introduction\nThe intersection of quantum me...
147    1 Introduction\nThis paper addresses the criti...
148    1 Introduction\nThe fledgling discipline of so...
149    1 Introduction\nThe ostensibly unrelated field...
Name: Introduction, Length: 150, dtype: object


In [8]:
import pandas as pd

# Function to equally divide text into two parts
def equally_split_rest(text):
    if not isinstance(text, str) or not text.strip():  # Check for valid text
        return "nil", "nil"  # Return "nil" for both parts if text is invalid

    words = text.split()  # Split text into words
    half_length = len(words) // 2  # Find half the length of words
    
    part1 = " ".join(words[:half_length])  # First half
    part2 = " ".join(words[half_length:])  # Second half

    return part1, part2

df[["Rest Part 1", "Rest Part 2"]] = df["Rest of the Paper"].apply(equally_split_rest).apply(pd.Series)

<h1>Making the dataframes for both classified and non-classified</h1>

In [9]:
df['Rest Part 1'] = df['Rest Part 1'].replace('', 'nill')

In [10]:
df_classified = df[df["PDF Name"].str.startswith("R")]
df_nonClassified=df[df["PDF Name"].str.startswith("P")]

In [11]:
df_classified.shape

(15, 9)

In [12]:
df_nonClassified.shape

(135, 9)

<h2>Labeling the non-publishable or publishable</h2>

In [13]:
# Explicitly set the 'Label' column using .loc[]
df_classified.loc[:, 'Label'] = df_classified['PDF Name'].apply(lambda x: 0 if int(x[1:4]) <= 5 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_classified.loc[:, 'Label'] = df_classified['PDF Name'].apply(lambda x: 0 if int(x[1:4]) <= 5 else 1)


In [14]:
df_classified.head(20)

Unnamed: 0,PDF Name,Title of Paper,Abstract,Introduction,Section 2,Section 3,Rest of the Paper,Rest Part 1,Rest Part 2,Label
17,R003.pdf,Deciphering the Enigmatic Properties of Metals,Abstract\nMetamorphosis of galvanic oscillatio...,1 Introduction\nThe dialectical nuances of met...,2 Related Work\nThe notion of metals has been ...,3 Methodology\nThe investigation of metals nec...,4 Experiments\nThe methodologies employed in t...,4 Experiments The methodologies employed in th...,has also explored the realm of metal-based art...,0
19,R015.pdf,Examining the Convergence of Denoising Diffusi...,"Abstract\nDeep generative models, particularly...","1 Introduction\nDiffusion models, alongside ge...","2 Our contributions\nIn this study, strong ass...",3 Our Approach\nThe goal is to upper-bound the...,nill,nill,nill,1
22,R005.pdf,Analyzing Real-Time Group Coordination in,Abstract\nThe convergence of augmented reality...,1 Introduction\nThe realm of coordinated dance...,2 Related Work\nThe intersection of augmented ...,3 Methodology\nTo investigate the relationship...,4 Experiments\nTo conduct a comprehensive eval...,4 Experiments To conduct a comprehensive evalu...,the development of more sophisticated AR syste...,0
43,R010.pdf,Detecting Medication Usage in Parkinson’s Dise...,Abstract\nParkinson’s disease (PD) is a progre...,1 Introduction\nParkinson’s disease (PD) is a ...,2 Related Work\nExtensive research has utilize...,3 Cohort and Dataset\n**Dataset:** This datase...,4 Methodologies and Framework\nWe introduce Mu...,4 Methodologies and Framework We introduce Mul...,"as in 4m-HC and 4m-PD validations, having extr...",1
56,R012.pdf,Safe Predictors for Input-Output Specification,Abstract\nThis paper presents an approach for ...,1 Introduction\nThe increasing adoption of mac...,2 Method\nConsidering two normed vector spaces...,3 Application to Aircraft Collision Avoidance\...,4 Discussion and Future Work\nWe propose an ap...,4 Discussion and Future Work We propose an app...,scores. The data is split using an 80/20 train...,1
60,R002.pdf,Synergistic Convergence of Photosynthetic Path...,Abstract\nThe perpetual oscillations of quantu...,1 Introduction\nThe deployment of novel spectr...,2 Related Work\nThe process of photosynthesis ...,3 Methodology\nThe intricacies of photosynthet...,4 Experiments\nThe controlled environment of t...,4 Experiments The controlled environment of th...,tons of Jell-O from the laboratory cafeteria. ...,0
83,R008.pdf,Advanced techniques for through and contextually,Abstract\nThis study examines the effectivenes...,1 Introduction\nNoun-noun compound interpretat...,2 Related Work\nApproaches to interpreting nou...,3 Task Definition and Dataset\nThe objective o...,nill,nill,nill,1
86,R004.pdf,AI-Driven Personalization in Online Education,Abstract\nAI-driven personalization is revolut...,1 Introduction\nThe advent of online education...,2 Related Work\nAI-driven personalization in o...,3 Methodology\nTo develop an AI-driven persona...,4 Experiments\nTo investigate the efficacy of ...,4 Experiments To investigate the efficacy of e...,This phenomenon was observed when the packagin...,0
91,R011.pdf,Addressing Popularity Bias with Popularity-Con...,Abstract\nCollaborative Filtering (CF) often e...,1 Introduction\nContemporary recommender syste...,2 Methodology\nTo overcome the current difficu...,3 Model Optimization\nTo reduce popularity bia...,4 Debias Ability\nTo further verify the effect...,4 Debias Ability To further verify the effecti...,preliminary results indicate that both extreme...,1
99,R007.pdf,Advancements in 3D Food Modeling: A Review of the,Abstract\nThe growing focus on leveraging comp...,1 Introduction\nThe convergence of computer vi...,2 Related Work\nEstimating food portions is a ...,3 Datasets and Evaluation Pipeline\n3.1 Datase...,nill,nill,nill,1


<h2>Finding length of strings</h2>

In [15]:
print("Abstract: Average =", df['Abstract'].str.split().apply(len).mean(), 
      ", Lowest =", df['Abstract'].str.split().apply(len).min(), 
      ", Highest =", df['Abstract'].str.split().apply(len).max())

print("Introduction: Average =", df['Introduction'].str.split().apply(len).mean(), 
      ", Lowest =", df['Introduction'].str.split().apply(len).min(), 
      ", Highest =", df['Introduction'].str.split().apply(len).max())

print("Section 2: Average =", df['Section 2'].str.split().apply(len).mean(), 
      ", Lowest =", df['Section 2'].str.split().apply(len).min(), 
      ", Highest =", df['Section 2'].str.split().apply(len).max())

print("Section 3: Average =", df['Section 3'].str.split().apply(len).mean(), 
      ", Lowest =", df['Section 3'].str.split().apply(len).min(), 
      ", Highest =", df['Section 3'].str.split().apply(len).max())

print("Rest of the Paper: Average =", df['Rest of the Paper'].str.split().apply(len).mean(), 
      ", Lowest =", df['Rest of the Paper'].str.split().apply(len).min(), 
      ", Highest =", df['Rest of the Paper'].str.split().apply(len).max())


Abstract: Average = 143.20666666666668 , Lowest = 21 , Highest = 401
Introduction: Average = 703.62 , Lowest = 45 , Highest = 2129
Section 2: Average = 768.4333333333333 , Lowest = 1 , Highest = 3646
Section 3: Average = 836.96 , Lowest = 1 , Highest = 4151
Rest of the Paper: Average = 1995.48 , Lowest = 1 , Highest = 8936


In [16]:
print("Abstract:", max(df['Abstract'].str.split().apply(len)))
print("Introduction:", max(df['Introduction'].str.split().apply(len)))
print("Section 2:", max(df['Section 2'].str.split().apply(len)))
print("Section 3:", max(df['Section 3'].str.split().apply(len)))
print("Rest Part 1:", max(df['Rest Part 1'].str.split().apply(len)))
print("Rest Part 2:", max(df['Rest Part 2'].str.split().apply(len)))

Abstract: 401
Introduction: 2129
Section 2: 3646
Section 3: 4151
Rest Part 1: 4468
Rest Part 2: 4468


<h2>First summarize then apply in BERT model</h2>

In [17]:
!pip install transformers torch



In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PDF Name           150 non-null    object
 1   Title of Paper     150 non-null    object
 2   Abstract           150 non-null    object
 3   Introduction       150 non-null    object
 4   Section 2          150 non-null    object
 5   Section 3          150 non-null    object
 6   Rest of the Paper  150 non-null    object
 7   Rest Part 1        150 non-null    object
 8   Rest Part 2        150 non-null    object
dtypes: object(9)
memory usage: 10.7+ KB


<h3>Creating new dataframes for each pair</h3>

In [19]:
# Assuming df_classified already has 'Abstract' and 'Label' columns
df_abstract = pd.DataFrame()
df_abstract['Abstract'] = df_classified['Abstract']
df_abstract['Label'] = df_classified['Label']

In [20]:
df_abstract.head()

Unnamed: 0,Abstract,Label
17,Abstract\nMetamorphosis of galvanic oscillatio...,0
19,"Abstract\nDeep generative models, particularly...",1
22,Abstract\nThe convergence of augmented reality...,0
43,Abstract\nParkinson’s disease (PD) is a progre...,1
56,Abstract\nThis paper presents an approach for ...,1


In [21]:
df_intro = pd.DataFrame()
df_intro['Introduction'] = df_classified['Introduction']
df_intro['Label'] = df_classified['Label']

In [22]:
df_sec2 = pd.DataFrame()
df_sec2['Section 2'] = df_classified['Section 2']
df_sec2['Label'] = df_classified['Label']

In [23]:
df_sec3 = pd.DataFrame()
df_sec3['Section 3'] = df_classified['Section 3']
df_sec3['Label'] = df_classified['Label']

In [24]:
df_restp1 = pd.DataFrame()
df_restp1['Rest Part 1'] = df_classified['Rest Part 1']
df_restp1['Label'] = df_classified['Label']

In [25]:
df_restp2 = pd.DataFrame()
df_restp2['Rest Part 2'] = df_classified['Rest Part 2']
df_restp2['Label'] = df_classified['Label']

<h2>df_abstract</h2>
<h2>df_intro</h2>
<h2>df_sec2</h2>
<h2>df_sec3</h2>
<h2>df_restp1</h2>
<h2>df_restp2</h2>

<h1>Implementing the models</h1>

In [26]:
!pip install -U accelerate
!pip install -U transformers

Collecting accelerate
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.2.1-py3-none-any.whl (336 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.4/336.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
Successfully installed accelerate-1.2.1
Collecting transformers
  Downloading transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.48.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [27]:
import torch
from transformers import AutoTokenizer
from transformers import BigBirdPegasusForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [28]:
# checkpoint = "google/bigbird-pegasus-large-arxiv"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = BigBirdPegasusForSequenceClassification.from_pretrained(
#     checkpoint,
#     num_labels=2,
#     problem_type="binary_classification"
# )

In [29]:
# Assuming df_abstract is your DataFrame with two columns: 'text' and 'label'
texts = df_abstract['Abstract'].tolist()  # Extract the 'text' column
labels = df_abstract['Label'].tolist()  # Extract the 'label' column

In [30]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels,
                                                                    test_size=0.2, random_state=42)

In [31]:
# # Ensure the labels in the dataset are `torch.long`
# class CustomDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_len=4096):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = str(self.texts[idx])
#         label = torch.tensor(self.labels[idx], dtype=torch.long)  # Ensure labels are `torch.long`

#         encoding = self.tokenizer(
#             text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt'
#         )

#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'labels': label
#         }

In [32]:
# train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
# val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

In [33]:
# import numpy as np
# from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
# from transformers import EvalPrediction
# import torch


# def binary_classification_metrics(predictions, labels, threshold=0.5):
#     sigmoid = torch.nn.Sigmoid()
#     probs = sigmoid(torch.Tensor(predictions))  # Apply Sigmoid to get probabilities

#     # Convert probabilities to binary predictions based on the threshold
#     y_pred = (probs >= threshold).float()
#     y_true = labels

#     # Calculate metrics
#     f1 = f1_score(y_true, y_pred, average='binary')  # F1 score for binary classification
#     roc_auc = roc_auc_score(y_true, y_pred)  # ROC AUC score for binary classification
#     hamming = hamming_loss(y_true, y_pred)  # Hamming loss

#     # Return metrics
#     metrics = {
#         "roc_auc": roc_auc,
#         "hamming_loss": hamming,
#         "f1": f1
#     }

#     return metrics


# def compute_metrics(p: EvalPrediction):
#     preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

#     result = binary_classification_metrics(predictions=preds,
#                                            labels=p.label_ids)

#     return result


In [34]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(model.device)

In [35]:
# # Training Arguments
# from transformers import TrainingArguments, Trainer

# # Define TrainingArguments and Trainer as before
# args = TrainingArguments(
#     per_device_train_batch_size=3,
#     per_device_eval_batch_size=3,
#     output_dir='./results',
#     num_train_epochs=2,
#     save_steps=1000,
#     save_total_limit=2,
#     logging_dir='./logs',
#     logging_steps=10
# )

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics
# )


In [36]:
# trainer.train()

In [37]:
# trainer.evaluate()

In [38]:
import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [39]:
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2,
                                                            problem_type="binary_classification")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# Lets build custom dataset
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = torch.tensor(self.labels[idx])

    encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')

    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': label
    }

In [41]:
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

In [42]:
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch


def binary_classification_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))  # Apply Sigmoid to get probabilities

    # Convert probabilities to binary predictions based on the threshold
    y_pred = (probs >= threshold).float()
    y_true = labels

    # Calculate metrics
    f1 = f1_score(y_true, y_pred, average='binary')  # F1 score for binary classification
    roc_auc = roc_auc_score(y_true, y_pred)  # ROC AUC score for binary classification
    hamming = hamming_loss(y_true, y_pred)  # Hamming loss

    # Return metrics
    metrics = {
        "roc_auc": roc_auc,
        "hamming_loss": hamming,
        "f1": f1
    }

    return metrics


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    result = binary_classification_metrics(predictions=preds,
                                           labels=p.label_ids)

    return result


In [43]:
# Training Arguments
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir = './results',
    num_train_epochs=5,
    save_steps=1000,
    save_total_limit=2
)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_dataset,
                  eval_dataset = val_dataset,
                  compute_metrics=compute_metrics)

In [44]:
# trainer.train()

In [45]:
# trainer.evaluate()