In [2]:
from transformers import BertTokenizer
import tensorflow as tf
from sklearn.model_selection import train_test_split


In [3]:
# Tokenizer for the 'bert-base-uncased' checkpoint; tokenize_texts below reads this
# module-level object directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts, max_length=512):
    """Encode ``texts`` with the module-level ``tokenizer`` as fixed-length TF tensors.

    Args:
        texts: The text (or list of texts) passed straight to the tokenizer.
        max_length: Target sequence length. Shorter inputs are padded up to it
            and longer inputs are truncated down to it.

    Returns:
        The tokenizer's encoding object with TensorFlow tensors
        (``return_tensors='tf'``).
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'tf',
    }
    return tokenizer(texts, **encoding_options)

In [4]:
import pandas as pd 

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
csv_file_path = r"C:\Users\Debajyoti\OneDrive\Desktop\project task-1\data\updated_publishable_data.csv"
df = pd.read_csv(csv_file_path)
# The 'PDF' column is what gets tokenized; 'Label' is the training target.
texts = df['PDF'].tolist() 
labels = df['Label'].tolist()

# Convert the plain Python list of labels to a tensor (used as `y` when fitting).
labels_tensor = tf.convert_to_tensor(labels)

In [5]:
# Tokenize all documents in one call using tokenize_texts' default max_length (512);
# anything longer is truncated by the tokenizer.
tokenized_data = tokenize_texts(texts)

In [6]:

# Pull the two tensors the model consumes out of the tokenizer output.
input_ids, attention_masks = tokenized_data['input_ids'], tokenized_data['attention_mask']

In [7]:
# Training inputs are kept as an (input_ids, attention_masks) pair; the labels tensor is the target.
X_train = (input_ids, attention_masks)
Y_train = labels_tensor
# Sanity check: both tensors are expected to have the same shape.
print(input_ids.shape)
print(attention_masks.shape)

(15, 512)
(15, 512)


In [8]:
def split_into_chunks(text, tokenizer, max_len=512, stride=256):
    """Split ``text`` into overlapping windows of token ids, each at most ``max_len`` long.

    The text is encoded without special tokens, cut into windows that start
    every ``stride`` tokens, and each window is then wrapped with the
    tokenizer's special tokens. Room for those special tokens is reserved up
    front so the finished chunk never exceeds ``max_len``.

    Args:
        text: The raw text to encode.
        tokenizer: Object providing ``encode(text, truncation=..., add_special_tokens=...)``
            and ``build_inputs_with_special_tokens(ids)``.
        max_len: Maximum length of each returned chunk, special tokens included.
        stride: Offset in tokens between the starts of consecutive windows. If it
            is larger than the window size, the tokens in between are skipped.

    Returns:
        A list of chunks, each a list of token ids. Empty when ``text`` encodes
        to no tokens.

    Raises:
        ValueError: If ``max_len`` leaves no room for content once the special
            tokens are accounted for.
    """
    # Measure how many ids the tokenizer wraps around a single sequence.
    special_count = len(tokenizer.build_inputs_with_special_tokens([]))
    window = max_len - special_count
    if window <= 0:
        raise ValueError(
            f"max_len={max_len} leaves no room for content after "
            f"{special_count} special tokens"
        )

    tokens = tokenizer.encode(text, truncation=False, add_special_tokens=False)
    chunks = []
    for start in range(0, len(tokens), stride):
        body = tokens[start:start + window]
        chunks.append(tokenizer.build_inputs_with_special_tokens(body))
        # Stop as soon as this window reaches the end of the text; any further
        # window would only repeat tokens that are already covered.
        if start + window >= len(tokens):
            break
    return chunks

In [9]:
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf
from tensorflow.keras import layers, Model

class BertLayer(tf.keras.layers.Layer):
    """Keras layer that wraps a pretrained ``TFBertModel`` and returns its pooled output."""

    def __init__(self, bert_model_name='bert-base-uncased', **kwargs):
        super().__init__(**kwargs)
        # The pretrained model is loaded once, when the layer is constructed.
        self.bert = TFBertModel.from_pretrained(bert_model_name)

    def call(self, inputs):
        """Run BERT on an ``(input_ids, attention_mask)`` pair and return ``pooler_output``."""
        token_ids, mask = inputs
        # The pooled output is what the classification head consumes.
        return self.bert(input_ids=token_ids, attention_mask=mask).pooler_output

# Two int32 inputs of fixed length 512; their names are the keys used when feeding
# dict inputs to fit/predict.
input_ids_layer = tf.keras.Input(shape=(512,), dtype=tf.int32, name='input_ids')
attention_mask_layer = tf.keras.Input(shape=(512,), dtype=tf.int32, name='attention_mask')

# BertLayer returns the pooled BERT output for each input sequence.
bert_output = BertLayer()(inputs=[input_ids_layer, attention_mask_layer])

# One sigmoid unit: the model emits a single score in (0, 1) per document.
output = layers.Dense(1, activation='sigmoid')(bert_output)

model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=output)
# NOTE(review): a learning rate of 1e-3 is high if the BERT weights are being updated
# (fine-tuning commonly uses ~2e-5 to 5e-5) — confirm which weights are trainable in
# the summary printed below.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [10]:
from tensorflow.keras.callbacks import Callback, LearningRateScheduler
import tensorflow as tf

class CustomCallback(Callback):
    """Keras callback that records each epoch's logs and stops training at 0.90 accuracy."""

    def __init__(self, log):
        super().__init__()
        # Caller-owned dict; filled in place as {epoch: logs}.
        self.log = log

    def on_epoch_end(self, epoch, logs=None):
        """Store the epoch's logs and request a stop once "accuracy" is at least 0.90."""
        epoch_logs = logs if logs else {}
        self.log[epoch] = epoch_logs

        accuracy = epoch_logs.get("accuracy")
        # Nothing more to do when accuracy is missing or has not reached the target.
        if accuracy is None or not (accuracy >= 0.90):
            return

        print(f"Accuracy reached {accuracy:.2f} at epoch {epoch + 1}.")
        self.model.stop_training = True


def scheduler(epoch, lr):
    """Return the learning rate for ``epoch``: a tenth of ``lr`` on every 10th epoch.

    Epoch 0 is excluded, so the first reduction happens at epoch index 10.
    On all other epochs ``lr`` is returned unchanged.
    """
    is_decay_epoch = epoch > 0 and epoch % 10 == 0
    return lr * 0.1 if is_decay_epoch else lr

# Wrap the schedule function as a Keras callback; verbose=1 prints the learning rate
# chosen for each epoch.
lr_scheduler = LearningRateScheduler(scheduler, verbose=1)



In [11]:
log = {}  # Filled in place by CustomCallback as {epoch: logs}
custom_callback = CustomCallback(log)
# The dict keys match the names of the model's two Input layers.
# NOTE(review): no validation data is passed, so the reported accuracy (and the
# callback's 0.90 early stop) is measured on the training rows only.
history = model.fit(
    x={'input_ids': X_train[0], 'attention_mask': X_train[1]},
    y=Y_train,
    epochs=20,
    batch_size=4,
    callbacks=[custom_callback, lr_scheduler]
)




Epoch 1: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 713ms/step - accuracy: 0.6083 - loss: 0.7295 - learning_rate: 0.0010

Epoch 2: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 685ms/step - accuracy: 0.7867 - loss: 0.5458 - learning_rate: 0.0010

Epoch 3: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 683ms/step - accuracy: 0.7950 - loss: 0.5212 - learning_rate: 0.0010

Epoch 4: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 690ms/step - accuracy: 0.7700 - loss: 0.4671 - learning_rate: 0.0010

Epoch 5: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 5/20
[1m4/4[0m [32m━━━━

In [12]:
# Score the same tensors the model was trained on, so these are in-sample predictions.
prediction = model.predict({'input_ids': X_train[0], 'attention_mask': X_train[1]})

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step


In [13]:
# The model has a single output unit, so expect one score per row.
print(prediction.shape)

(15, 1)


In [14]:
import pandas as pd
# Re-read the labelled CSV (same path as the earlier load); this rebinds `df` to a
# fresh frame before predictions are attached to it.
csv_file_path = r"C:\Users\Debajyoti\OneDrive\Desktop\project task-1\data\updated_publishable_data.csv" 
df = pd.read_csv(csv_file_path)

# Preview the first rows.
df.head()

Unnamed: 0,ID,PDF,Label,Conference,Rationale
0,R007.pdf,Advancements in 3D Food Modeling: A Review of ...,1,CVPR,The MetaFood Workshop Challenge focuses on int...
1,R011.pdf,Addressing Popularity Bias with Popularity-Con...,1,KDD,Contemporary recommender systems face populari...
2,R002.pdf,Synergistic Convergence of Photosynthetic Path...,0,Non Publishable,This experimental research employs whimsical m...
3,R009.pdf,The Importance of Written Explanations in\nAgg...,1,EMNLP,"A study of the ""wisdom of the crowd"""
4,R004.pdf,AI-Driven Personalization in Online Education\...,0,Non Publishable,We propose an unconventional method for incorp...


In [15]:
# tolist() on the 2-D prediction array yields one single-element list per row,
# so each cell of this column holds a list such as [0.79] rather than a bare float.
df['predicted_label'] = prediction.tolist()

In [16]:
# Display the frame with the raw per-row scores attached.
df

Unnamed: 0,ID,PDF,Label,Conference,Rationale,predicted_label
0,R007.pdf,Advancements in 3D Food Modeling: A Review of ...,1,CVPR,The MetaFood Workshop Challenge focuses on int...,[0.7942856550216675]
1,R011.pdf,Addressing Popularity Bias with Popularity-Con...,1,KDD,Contemporary recommender systems face populari...,[0.7322335243225098]
2,R002.pdf,Synergistic Convergence of Photosynthetic Path...,0,Non Publishable,This experimental research employs whimsical m...,[0.484770804643631]
3,R009.pdf,The Importance of Written Explanations in\nAgg...,1,EMNLP,"A study of the ""wisdom of the crowd""",[0.8507500886917114]
4,R004.pdf,AI-Driven Personalization in Online Education\...,0,Non Publishable,We propose an unconventional method for incorp...,[0.6077845692634583]
5,R010.pdf,Detecting Medication Usage in Parkinson’s Dise...,1,KDD,A transformer-based method for indoor localiza...,[0.8016510605812073]
6,R015.pdf,Examining the Convergence of Denoising Diffusi...,1,TMLR,The results of a deep generative model are bas...,[0.8603323698043823]
7,R003.pdf,Deciphering the Enigmatic Properties of Metals...,0,Non Publishable,"The ephemeral and the mundane, as the luminesc...",[0.07095776498317719]
8,R005.pdf,Analyzing Real-Time Group Coordination in\nAug...,0,Non Publishable,AR enhances synchronization and fostering gest...,[0.537226676940918]
9,R014.pdf,Addressing Min-Max Challenges in Nonconvex-Non...,1,TMLR,We propose a new convergence method for saddle...,[0.8397592306137085]


In [17]:
# Derive both columns from the raw model output instead of reading back
# df['predicted_label']. The previous version indexed x[0] on that column and then
# overwrote it with integers, so running the cell a second time raised an error.
# Threshold and label strings are unchanged: score > 0.5 means 'Publishable'.
publishable_mask = pd.Series(prediction[:, 0] > 0.5, index=df.index)
df['predicted_label'] = publishable_mask.astype('int64')
df['status'] = publishable_mask.map({True: 'Publishable', False: 'Non-publishable'})

df


Unnamed: 0,ID,PDF,Label,Conference,Rationale,predicted_label,status
0,R007.pdf,Advancements in 3D Food Modeling: A Review of ...,1,CVPR,The MetaFood Workshop Challenge focuses on int...,1,Publishable
1,R011.pdf,Addressing Popularity Bias with Popularity-Con...,1,KDD,Contemporary recommender systems face populari...,1,Publishable
2,R002.pdf,Synergistic Convergence of Photosynthetic Path...,0,Non Publishable,This experimental research employs whimsical m...,0,Non-publishable
3,R009.pdf,The Importance of Written Explanations in\nAgg...,1,EMNLP,"A study of the ""wisdom of the crowd""",1,Publishable
4,R004.pdf,AI-Driven Personalization in Online Education\...,0,Non Publishable,We propose an unconventional method for incorp...,1,Publishable
5,R010.pdf,Detecting Medication Usage in Parkinson’s Dise...,1,KDD,A transformer-based method for indoor localiza...,1,Publishable
6,R015.pdf,Examining the Convergence of Denoising Diffusi...,1,TMLR,The results of a deep generative model are bas...,1,Publishable
7,R003.pdf,Deciphering the Enigmatic Properties of Metals...,0,Non Publishable,"The ephemeral and the mundane, as the luminesc...",0,Non-publishable
8,R005.pdf,Analyzing Real-Time Group Coordination in\nAug...,0,Non Publishable,AR enhances synchronization and fostering gest...,1,Publishable
9,R014.pdf,Addressing Min-Max Challenges in Nonconvex-Non...,1,TMLR,We propose a new convergence method for saddle...,1,Publishable


In [18]:
# NOTE(review): absolute, machine-specific output path — consider a configurable data directory.
updated_csv_file = r"C:\Users\Debajyoti\OneDrive\Desktop\project task-1\data\updated_csv_file_with_classification.csv"
# index=False keeps the row index out of the file.
df.to_csv(updated_csv_file,index=False)
print(f"Csv file successfully saved at --->{updated_csv_file}")

Csv file successfully saved at --->C:\Users\Debajyoti\OneDrive\Desktop\project task-1\data\updated_csv_file_with_classification.csv


In [19]:
# Root directory that is walked recursively to find the papers to classify.
# NOTE(review): absolute, machine-specific path.
test_file_path = r"C:\Users\Debajyoti\OneDrive\Desktop\project task-1\data\Papers\Papers"

import PyPDF2

def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file`` concatenated in page order.

    Args:
        pdf_file: Path of the PDF to read; opened in binary mode.

    Returns:
        A single string with the pages' extracted text joined without a
        separator. Pages that yield no text contribute nothing.
    """
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join once rather than growing a string with repeated `+=`. The `or ''`
        # guards against a page whose extraction returns None, which would
        # otherwise break the concatenation. The join runs while the file is
        # still open because extraction reads from it.
        return ''.join(page.extract_text() or '' for page in reader.pages)


In [20]:
import os
texts = []
pdf_names = []

for root, dirs, files in os.walk(test_file_path):
    # Sorting the directory list in place fixes the traversal order, and sorting the
    # file names fixes the row order, so it no longer depends on the file system.
    # Later cells line rows up by position.
    dirs.sort()
    for file in sorted(files):
        # Skip anything that is not a PDF (e.g. OS metadata files) instead of
        # handing it to the PDF parser.
        if not file.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(root, file)
        text = extract_text_from_pdf(file_path)
        # Record the name only after extraction succeeded so both lists stay aligned.
        pdf_names.append(file)
        texts.append(text)

print("Text perfectly saved at texts and names also")

Text perfectly saved at texts and names also


In [21]:
# One row per extracted file. Here 'PDF' holds the file name and 'Text' the extracted
# content, whereas in the labelled CSV the 'PDF' column is the one that gets tokenized.
new_df = pd.DataFrame({'PDF':pdf_names,
                       'Text':texts})
new_df

Unnamed: 0,PDF,Text
0,P001.pdf,Leveraging Clustering Techniques for Enhanced\...
1,P002.pdf,Virus Propagation and their Far-Reaching\nImpl...
2,P003.pdf,Explainable Reinforcement Learning for Financi...
3,P004.pdf,Graph Neural Networks Without Training: Harnes...
4,P005.pdf,Collaborative Clothing Segmentation and\nIdent...
...,...,...
130,P131.pdf,Enhancing Disentanglement through Learned\nAgg...
131,P132.pdf,Analyzing Fermentation Patterns with Multi-Mod...
132,P133.pdf,Discontinuous Constituent Parsing as Sequence\...
133,P134.pdf,Unraveling the Enigmatic Parallels Between DNA...


In [22]:
# Tokenize the extracted texts with the same helper (and default max_length) used for training.
test_texts = new_df['Text'].tolist()
tokenized_test_data = tokenize_texts(test_texts)

In [23]:

# Pull the two model inputs out of the tokenizer output for the test set.
test_input_ids, test_attention_masks = tokenized_test_data['input_ids'], tokenized_test_data['attention_mask']

In [24]:
# Same (input_ids, attention_masks) pairing as X_train; X_test is reused later for
# the categorical model as well.
X_test = (test_input_ids, test_attention_masks)
prediction_test = model.predict({'input_ids': X_test[0], 'attention_mask': X_test[1]})


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 5s/step


In [25]:
# Derive both columns directly from the raw model output. The previous version stored
# lists in new_df['predicted_label'], unpacked them, then overwrote the column with
# integers — three passes over the same column. Reading from prediction_test keeps the
# cell re-runnable and does it in one step.
# Threshold and label strings are unchanged: score >= 0.5 means 'Publishable'.
test_publishable_mask = pd.Series(prediction_test[:, 0] >= 0.5, index=new_df.index)
new_df['predicted_label'] = test_publishable_mask.astype('int64')
new_df['status'] = test_publishable_mask.map({True: 'Publishable', False: 'Non-Publishable'})
new_df

Unnamed: 0,PDF,Text,predicted_label,status
0,P001.pdf,Leveraging Clustering Techniques for Enhanced\...,1,Publishable
1,P002.pdf,Virus Propagation and their Far-Reaching\nImpl...,0,Non-Publishable
2,P003.pdf,Explainable Reinforcement Learning for Financi...,1,Publishable
3,P004.pdf,Graph Neural Networks Without Training: Harnes...,1,Publishable
4,P005.pdf,Collaborative Clothing Segmentation and\nIdent...,1,Publishable
...,...,...,...,...
130,P131.pdf,Enhancing Disentanglement through Learned\nAgg...,1,Publishable
131,P132.pdf,Analyzing Fermentation Patterns with Multi-Mod...,0,Non-Publishable
132,P133.pdf,Discontinuous Constituent Parsing as Sequence\...,1,Publishable
133,P134.pdf,Unraveling the Enigmatic Parallels Between DNA...,1,Publishable


In [26]:
# Count the rows whose thresholded prediction is 0.
no_of_nonPublishable_paper = (new_df['predicted_label'] == 0).sum()
print("total no. of non publishable paper:",no_of_nonPublishable_paper)

total no. of non publishable paper: 47


In [27]:
# NOTE(review): absolute, machine-specific output path — consider a configurable data directory.
test_csv_file = r"C:\Users\Debajyoti\OneDrive\Desktop\project task-1\data\test_dataframe.csv"
# index=False matches the other to_csv calls in this notebook and keeps the row index
# out of the file; without it the index comes back as an extra 'Unnamed: 0' column
# whenever the CSV is read again.
new_df.to_csv(test_csv_file, index=False)
print(f"Test csv file successfully transforted to {test_csv_file}")

Test csv file successfully transforted to C:\Users\Debajyoti\OneDrive\Desktop\project task-1\data\test_dataframe.csv


In [28]:
# NOTE(review): this file is not written anywhere in the visible notebook — presumably
# test_dataframe.csv with a 'Rationale' column added by a separate step; confirm its
# rows are in the same order as new_df, because a later cell assigns by position.
test_with_rationale_csv = r"C:\Users\Debajyoti\OneDrive\Desktop\project task-1\data\test_with_rationale.csv"
test_df = pd.read_csv(test_with_rationale_csv)
test_df


Unnamed: 0.1,Unnamed: 0,PDF,Text,predicted_label,status,Rationale
0,0,P001.pdf,Leveraging Clustering Techniques for Enhanced\...,1,Publishable,A clustering-based learning detection strategy...
1,1,P002.pdf,Virus Propagation and their Far-Reaching\nImpl...,0,Non-Publishable,The cellular mechanisms underlying viral repli...
2,2,P003.pdf,Explainable Reinforcement Learning for Financi...,1,Publishable,A new approach to financial market simulation ...
3,3,P004.pdf,Graph Neural Networks Without Training: Harnes...,1,Publishable,GNNs are a reliable yet relatively unexplored ...
4,4,P005.pdf,Collaborative Clothing Segmentation and\nIdent...,1,Publishable,The system is a novel image co-segmentation sy...
...,...,...,...,...,...,...
130,130,P131.pdf,Enhancing Disentanglement through Learned\nAgg...,1,Publishable,We use the implicit inductive bias in ImageNet...
131,131,P132.pdf,Analyzing Fermentation Patterns with Multi-Mod...,0,Non-Publishable,A new generative model of sourdough bread has ...
132,132,P133.pdf,Discontinuous Constituent Parsing as Sequence\...,1,Publishable,Discontinuous constituent parsing is a method ...
133,133,P134.pdf,Unraveling the Enigmatic Parallels Between DNA...,1,Publishable,The Flumplenook hypothesis is a theoretical fr...


In [29]:
from categorical_model_load import categorical_model
# categorical_model is imported from the project-local categorical_model_load module;
# it is fed the same test tensors as the binary model.
prediction_categorical_test = categorical_model.predict({'input_ids': X_test[0], 'attention_mask': X_test[1]})


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 5s/step


In [30]:
# Inspect the output shape (rows x per-class scores) before taking the argmax.
print(prediction_categorical_test.shape)

(135, 6)


In [31]:
# For each row, take the index of the highest score along axis 1 (the class axis).
final_prediction = tf.argmax(prediction_categorical_test,axis=1)
print(final_prediction.shape)

(135,)


In [33]:
# Class index -> conference name.
# NOTE(review): this ordering must match the label encoding used when the categorical
# model was trained — not verifiable from this notebook; confirm.
mapping = {
    0: 'Null',
    1: 'TMLR', 2: 'EMNLP', 3: 'NeurIPS', 4: 'KDD', 5: 'CVPR',
}
# Convert the index tensor to plain Python ints once, then look each one up.
final_labels = [mapping[class_index] for class_index in final_prediction.numpy().tolist()]

In [34]:
# Positional assignment: final_labels is ordered like the texts fed to the model, so
# this relies on test_df's rows being in that same order — TODO confirm.
test_df['Conference'] = final_labels
test_df

Unnamed: 0.1,Unnamed: 0,PDF,Text,predicted_label,status,Rationale,Conference
0,0,P001.pdf,Leveraging Clustering Techniques for Enhanced\...,1,Publishable,A clustering-based learning detection strategy...,CVPR
1,1,P002.pdf,Virus Propagation and their Far-Reaching\nImpl...,0,Non-Publishable,The cellular mechanisms underlying viral repli...,Null
2,2,P003.pdf,Explainable Reinforcement Learning for Financi...,1,Publishable,A new approach to financial market simulation ...,Null
3,3,P004.pdf,Graph Neural Networks Without Training: Harnes...,1,Publishable,GNNs are a reliable yet relatively unexplored ...,NeurIPS
4,4,P005.pdf,Collaborative Clothing Segmentation and\nIdent...,1,Publishable,The system is a novel image co-segmentation sy...,CVPR
...,...,...,...,...,...,...,...
130,130,P131.pdf,Enhancing Disentanglement through Learned\nAgg...,1,Publishable,We use the implicit inductive bias in ImageNet...,CVPR
131,131,P132.pdf,Analyzing Fermentation Patterns with Multi-Mod...,0,Non-Publishable,A new generative model of sourdough bread has ...,Null
132,132,P133.pdf,Discontinuous Constituent Parsing as Sequence\...,1,Publishable,Discontinuous constituent parsing is a method ...,NeurIPS
133,133,P134.pdf,Unraveling the Enigmatic Parallels Between DNA...,1,Publishable,The Flumplenook hypothesis is a theoretical fr...,CVPR


In [36]:
# NOTE(review): absolute, machine-specific output path — consider a configurable data directory.
final_csv_file = r"C:\Users\Debajyoti\OneDrive\Desktop\project task-1\data\final_result.csv"
# index=False keeps the row index out of the final results file.
test_df.to_csv(final_csv_file, index=False)