In [1]:
# Install necessary packages
# Use a requirements.txt file instead for better environment management
# !pip install transformers datasets torch PyPDF2 scikit-learn nltk seaborn matplotlib

In [2]:
import os
import PyPDF2
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Fetch NLTK's 'punkt' package into the local nltk_data directory; NLTK
# reports it as already up to date when it has been downloaded before.
# NOTE(review): none of the cells visible here use nltk afterwards; confirm
# this download is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aditi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Define paths
# The defaults are the original Google Drive (Colab) mount points. They do not
# exist when the notebook runs on another machine, so each root can be
# overridden through an environment variable instead of editing this cell.
reference_dir = os.environ.get(
    'RESEARCH_PAPER_REFERENCE_DIR',
    '/content/drive/MyDrive/Research-paper/references',
)
papers_dir = os.environ.get(
    'RESEARCH_PAPER_PAPERS_DIR',
    '/content/drive/MyDrive/Research-paper/papers',
)

# Labeled reference papers are read from these two sub-folders of reference_dir.
publishable_dir = os.path.join(reference_dir, 'Publishable')
non_publishable_dir = os.path.join(reference_dir, 'Non-Publishable')

# NOTE(review): presumably the venue sub-folder names under 'Publishable';
# confirm against the data layout.
categories = ['CVPR', 'EMNLP', 'KDD', 'NeurIPS', 'TMLR']

In [5]:
# Extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of each page followed by a newline, concatenated
        in page order. If reading fails, the error is printed and whatever
        was extracted before the failure is returned (an empty string when
        nothing could be read).
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # NOTE(review): a page without a text layer may yield None
                # rather than '' depending on the extractor version; treat
                # it as empty so one such page does not abort the document.
                page_texts.append((page.extract_text() or "") + "\n")
    except Exception as e:
        # Best effort: report the problem and keep the pages read so far.
        print(f"Error reading {pdf_path}: {e}")
    # Join once instead of growing a string page by page.
    return "".join(page_texts)

In [6]:
# Load labeled data
def load_labeled_data(folder, label):
    """Recursively collect every PDF under ``folder`` as a labeled record.

    Args:
        folder: Root directory to walk.
        label: Value stored under 'Label' for every record. The string
            'Publishable' additionally enables category tagging.

    Returns:
        A list of dicts with keys 'Text', 'Label' and 'Category', ordered by
        directory and then file name. 'Category' is the name of the directory
        that holds the PDF when ``label`` is 'Publishable', otherwise None.
        A missing or empty ``folder`` yields an empty list, because os.walk
        ignores listing errors by default.
    """
    is_publishable = label == 'Publishable'
    records = []
    for root, dirs, files in os.walk(folder):
        # Sorting in place fixes the order in which os.walk descends, so the
        # record order (and any seeded split built on it) is reproducible.
        dirs.sort()
        for filename in sorted(files):
            # Case-insensitive so files named '*.PDF' are not silently skipped.
            if not filename.lower().endswith('.pdf'):
                continue
            text = extract_text_from_pdf(os.path.join(root, filename))
            category = os.path.basename(root) if is_publishable else None
            records.append({'Text': text, 'Label': label, 'Category': category})
    return records

In [7]:
# Read both labeled folders, then keep the publishable records first and the
# non-publishable ones after them in a single list.
publishable_records = load_labeled_data(publishable_dir, 'Publishable')
non_publishable_records = load_labeled_data(non_publishable_dir, 'Non-Publishable')
labeled_data = publishable_records + non_publishable_records

In [8]:
# Load unlabeled data
def load_unlabeled_data(folder):
    """Collect every PDF directly inside ``folder`` as an unlabeled record.

    Sub-directories are not searched.

    Args:
        folder: Directory whose PDF files are read.

    Returns:
        A list of dicts with keys 'Text', 'Label' and 'Category', ordered by
        file name; 'Label' and 'Category' are always None.

    Raises:
        FileNotFoundError: If ``folder`` does not exist (from os.listdir).
    """
    # Sorted for a reproducible record order; the extension check is
    # case-insensitive so files named '*.PDF' are not silently skipped.
    pdf_names = sorted(
        name for name in os.listdir(folder) if name.lower().endswith('.pdf')
    )
    return [
        {
            'Text': extract_text_from_pdf(os.path.join(folder, name)),
            'Label': None,
            'Category': None,
        }
        for name in pdf_names
    ]

In [9]:
# Read every PDF found directly inside papers_dir as an unlabeled record.
# This raises FileNotFoundError when papers_dir does not exist on the machine
# running the notebook.
unlabeled_data = load_unlabeled_data(papers_dir)

FileNotFoundError: [WinError 3] The system cannot find the path specified: '/content/drive/MyDrive/Research-paper/papers'

In [None]:
# Create DataFrame
# One row per paper, labeled records first. 'Binary_Label' is the numeric
# target: 1 for 'Publishable', 0 for 'Non-Publishable', and NaN for rows whose
# 'Label' is neither (the unlabeled papers carry None).
all_records = labeled_data + unlabeled_data
label_to_binary = {'Publishable': 1, 'Non-Publishable': 0}
data = pd.DataFrame(all_records)
data['Binary_Label'] = data['Label'].map(label_to_binary)

In [None]:
# Split data
# Keep only the rows that carry a label; unlabeled papers have NaN in
# 'Binary_Label'.
labeled_data = data[data['Binary_Label'].notnull()]

# 'Binary_Label' is float64 whenever NaN rows share the column, so the targets
# would reach the model as 0.0 / 1.0. BertForSequenceClassification only picks
# its single-label cross-entropy loss for integer labels, so cast back to int
# before splitting.
binary_labels = labeled_data['Binary_Label'].astype(int)

# Hold out 20% of the labeled papers; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    labeled_data['Text'], binary_labels, test_size=0.2, random_state=42
)

In [None]:
# Initialize tokenizer
# Loads the tokenizer for the pretrained 'bert-base-uncased' checkpoint, the
# same checkpoint name the classification model is created from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_function(examples):
    """Tokenize the 'Text' field of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a 'Text' entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for those texts, with truncation
        enabled and padding set to 'max_length'.
    """
    texts = examples['Text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

In [None]:
# Prepare datasets
# Wrap each split as a Dataset with 'Text' and 'Label' columns, then tokenize
# it in batches with tokenize_function.
train_columns = {'Text': X_train.tolist(), 'Label': y_train.tolist()}
test_columns = {'Text': X_test.tolist(), 'Label': y_test.tolist()}

train_dataset = Dataset.from_dict(train_columns).map(tokenize_function, batched=True)
test_dataset = Dataset.from_dict(test_columns).map(tokenize_function, batched=True)

In [None]:
# The raw text is no longer needed once the splits are tokenized; drop it from both.
train_dataset, test_dataset = (
    split.remove_columns(['Text']) for split in (train_dataset, test_dataset)
)

In [None]:
# Rename the target column from 'Label' to 'labels' in both splits.
# NOTE(review): presumably because the model's forward pass takes the targets
# as `labels`; confirm.
train_dataset = train_dataset.rename_column(
    original_column_name='Label', new_column_name='labels'
)
test_dataset = test_dataset.rename_column(
    original_column_name='Label', new_column_name='labels'
)

In [None]:
# Switch both splits to the 'torch' output format (applied in place).
train_dataset.set_format(type='torch')
test_dataset.set_format(type='torch')

In [None]:
# Initialize model
# Pretrained 'bert-base-uncased' with a sequence-classification head sized for
# two classes, matching the 0/1 values of 'Binary_Label'.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [None]:
# Training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=50,
    # Schedule and batch sizes.
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    # NOTE(review): load_best_model_at_end is documented to need matching
    # save and evaluation strategies; confirm for the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Trainer setup
# NOTE(review): the held-out test split doubles as the evaluation set here, and
# training_args asks for the best checkpoint to be reloaded at the end, so the
# scores later reported on test_dataset may be optimistic. Confirm whether a
# separate validation split is wanted.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Train the model
# Runs the training loop configured by training_args on train_dataset.
trainer.train()

In [None]:
# Evaluate
# Run the trained model over the held-out split.
preds_output = trainer.predict(test_dataset)

# Predicted class = index of the largest value along axis 1.
# NOTE(review): assumes predictions is an (n_examples, n_classes) array; confirm.
class_scores = preds_output.predictions
y_pred = class_scores.argmax(axis=1)

# Ground-truth targets of the same split, as a tensor.
true_values = y_test.tolist()
y_true = torch.tensor(true_values)

In [None]:
# Metrics
# Headline scores first, then the per-class breakdown.
scorers = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for heading, scorer in scorers:
    print(heading, scorer(y_true, y_pred))
print(classification_report(y_true, y_pred))

In [None]:
# Confusion Matrix
class_names = ['Non-Publishable', 'Publishable']

# Pin the class order to 0 (Non-Publishable), 1 (Publishable). Without an
# explicit `labels`, a small test split in which only one class occurs yields
# a 1x1 matrix, and building the 2x2 labeled frame below would raise.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
cm_frame = pd.DataFrame(cm, index=class_names, columns=class_names)

# Rows are the actual classes, columns the predicted ones.
sns.heatmap(cm_frame, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()