<a href="https://colab.research.google.com/github/DPravallika2005/SmartPaper/blob/main/Classification_of_Conferences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torch transformers pdfplumber sklearn nltk


Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
!pip install pdfplumber


Collecting pdfplumber
  Using cached pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
import os
import nltk
import pdfplumber
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset

# --- Environment setup (Google Colab) ---
# Make the user's Google Drive visible under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Drive folder that holds the labelled reference PDFs.
folder_path = '/content/drive/My Drive/Reference_Papers'

# Report up front whether that folder is usable.
if not os.path.exists(folder_path):
    print("The folder does not exist.")
elif os.path.isdir(folder_path):
    print("The folder exists.")
else:
    print("The path exists, but it is not a folder.")

# Fetch the NLTK stop-word corpus and keep the English list as a set,
# which preprocess_text uses for membership tests.
nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

# === Step 1: Preprocessing ===

def preprocess_text(text):
    """Normalise raw text before tokenization.

    The text is lower-cased, every character that is neither alphanumeric
    nor whitespace is dropped, and words found in the module-level
    ``stop_words`` set are removed. The remaining words are returned joined
    by single spaces.
    """
    lowered = text.lower()
    kept_chars = (ch for ch in lowered if ch.isalnum() or ch.isspace())
    tokens = ''.join(kept_chars).split()
    return ' '.join(tok for tok in tokens if tok not in stop_words)

# === Step 2: Load Data from PDF Files ===

def load_data_from_pdf(folder_path, labels_dict):
    """Extract and preprocess the text of every labelled PDF.

    Args:
        folder_path: Directory that contains the PDF files.
        labels_dict: Mapping of PDF file name -> integer class label.

    Returns:
        ``(texts, labels)``: two parallel lists with the preprocessed text and
        the label of each file that could be read. Files that are missing or
        raise while being read are reported on stdout and skipped.
    """
    texts, labels = [], []

    for file_name, label in labels_dict.items():
        file_path = os.path.join(folder_path, file_name)

        if not os.path.exists(file_path):
            print(f"File {file_name} does not exist in the folder.")
            continue

        try:
            with pdfplumber.open(file_path) as pdf:
                # Join pages with a newline so the last word of one page is
                # not fused with the first word of the next. `or ''` guards
                # against pages for which extract_text() yields None, which
                # would otherwise make the join raise and drop the whole file.
                text = '\n'.join(page.extract_text() or '' for page in pdf.pages)
                texts.append(preprocess_text(text.strip()))
                labels.append(label)
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole load.
            print(f"Error reading {file_name}: {e}")

    return texts, labels

# === Step 3: Define Dataset and Model ===

class ResearchPapersDataset(Dataset):
    """Torch dataset that tokenizes one paper per item.

    Every item is a dict with ``input_ids`` and ``attention_mask`` (the
    flattened tokenizer output, padded/truncated to ``max_length``) and
    ``label`` (a ``torch.long`` scalar tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        paper_text, paper_label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer(
            paper_text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )

        # The tokenizer returns a leading batch axis; flatten it away so the
        # DataLoader can stack individual samples itself.
        sample = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(paper_label, dtype=torch.long)
        return sample

class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output logits. Defaults to 2, the binary
            Publishable/Non-Publishable set-up used in this notebook.
        dropout: Dropout probability applied to the pooled encoder output.
        pretrained_name: Hugging Face checkpoint used for the encoder.

    The attribute names ``bert``, ``drop`` and ``out`` are part of the saved
    ``state_dict`` keys and must not be renamed.
    """

    def __init__(self, num_classes=2, dropout=0.3, pretrained_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.drop = nn.Dropout(p=dropout)
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class logits for a batch of token ids."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Index 1 is BERT's pooled output: the [CLS] hidden state after the
        # pooler layer, not the raw [CLS] vector.
        pooled_output = outputs[1]
        return self.out(self.drop(pooled_output))

# === Step 4: Train the Model ===

def train_model(train_loader, val_loader, model, optimizer, criterion, device, epochs=3):
    """Fine-tune ``model`` and print validation accuracy after every epoch.

    Args:
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``label`` entries used for training.
        val_loader: Iterable of batches of the same form used for validation.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The model (on ``device``) after the last epoch.
    """
    model = model.to(device)

    def _on_device(batch):
        # Unpack one batch dict and move its tensors to the target device.
        return (
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['label'].to(device),
        )

    for epoch_idx in range(epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()
            ids, mask, targets = _on_device(batch)

            logits = model(input_ids=ids, attention_mask=mask)
            batch_loss = criterion(logits, targets)

            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        print(f"Epoch {epoch_idx + 1}/{epochs}, Loss: {running_loss / len(train_loader)}")

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        predicted, expected = [], []

        with torch.no_grad():
            for batch in val_loader:
                ids, mask, targets = _on_device(batch)
                logits = model(input_ids=ids, attention_mask=mask)

                predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
                expected.extend(targets.cpu().numpy())

        print("Validation Accuracy:", accuracy_score(expected, predicted))

    return model

# === Step 5: Inference ===

def predict(model, test_loader, device):
    """Return the arg-max class index for every sample in ``test_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must provide ``input_ids`` and ``attention_mask``; predictions
    are collected in loader order as NumPy integer scalars.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            batch_classes = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(batch_classes)

    return predictions

# === Main Workflow ===

def main():
    """Train the BERT classifier on the labelled reference PDFs and save it."""
    # Folder in Google Drive that holds the labelled PDFs.
    folder_path = '/content/drive/My Drive/Reference_Papers'

    # R001.pdf-R005.pdf carry label 0, R006.pdf-R015.pdf carry label 1.
    # NOTE(review): the earlier per-file "Publishable"/"Non-Publishable"
    # comments contradicted each other, so which label means "Publishable"
    # still has to be confirmed against the dataset description.
    labels_dict = {f"R{n:03d}.pdf": (0 if n <= 5 else 1) for n in range(1, 16)}

    # Read and preprocess every PDF that is present.
    texts, labels = load_data_from_pdf(folder_path, labels_dict)
    if not texts:
        print("No data found. Please check the folder path and files.")
        return

    # Hold out 20% of the papers for validation (fixed seed for repeatability).
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Tokenizer plus the two datasets/loaders; only training data is shuffled.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    max_length = 512
    train_loader = DataLoader(
        ResearchPapersDataset(train_texts, train_labels, tokenizer, max_length),
        batch_size=8,
        shuffle=True,
    )
    val_loader = DataLoader(
        ResearchPapersDataset(val_texts, val_labels, tokenizer, max_length),
        batch_size=8,
    )

    # Model, optimizer, loss and target device.
    model = BERTClassifier()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = train_model(train_loader, val_loader, model, optimizer, criterion, device, epochs=3)

    # Persist only the weights.
    torch.save(model.state_dict(), "bert_classifier.pth")
    print("Model saved as bert_classifier.pth")

if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The folder exists.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/3, Loss: 0.8153330683708191
Validation Accuracy: 0.3333333333333333
Epoch 2/3, Loss: 0.7370777428150177
Validation Accuracy: 0.6666666666666666
Epoch 3/3, Loss: 0.6673690378665924
Validation Accuracy: 0.6666666666666666
Model saved as bert_classifier.pth


In [None]:
import os
import nltk
import pdfplumber
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset

# --- Environment setup (Google Colab) ---
# Make the user's Google Drive visible under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Drive folder that holds the labelled reference PDFs.
folder_path = '/content/drive/My Drive/Reference_Papers'

# Report up front whether that folder is usable.
if not os.path.exists(folder_path):
    print("The folder does not exist.")
elif os.path.isdir(folder_path):
    print("The folder exists.")
else:
    print("The path exists, but it is not a folder.")

# Fetch the NLTK stop-word corpus and keep the English list as a set,
# which preprocess_text uses for membership tests.
nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

# === Step 1: Preprocessing ===

def preprocess_text(text):
    """Normalise raw text before tokenization.

    The text is lower-cased, every character that is neither alphanumeric
    nor whitespace is dropped, and words found in the module-level
    ``stop_words`` set are removed. The remaining words are returned joined
    by single spaces.
    """
    lowered = text.lower()
    kept_chars = (ch for ch in lowered if ch.isalnum() or ch.isspace())
    tokens = ''.join(kept_chars).split()
    return ' '.join(tok for tok in tokens if tok not in stop_words)

# === Step 2: Load Data from PDF Files ===

def load_data_from_pdf(folder_path, labels_dict):
    """Extract and preprocess the text of every labelled PDF.

    Args:
        folder_path: Directory that contains the PDF files.
        labels_dict: Mapping of PDF file name -> integer class label.

    Returns:
        ``(texts, labels)``: two parallel lists with the preprocessed text and
        the label of each file that could be read. Files that are missing or
        raise while being read are reported on stdout and skipped.
    """
    texts, labels = [], []

    for file_name, label in labels_dict.items():
        file_path = os.path.join(folder_path, file_name)

        if not os.path.exists(file_path):
            print(f"File {file_name} does not exist in the folder.")
            continue

        try:
            with pdfplumber.open(file_path) as pdf:
                # Join pages with a newline so the last word of one page is
                # not fused with the first word of the next. `or ''` guards
                # against pages for which extract_text() yields None, which
                # would otherwise make the join raise and drop the whole file.
                text = '\n'.join(page.extract_text() or '' for page in pdf.pages)
                texts.append(preprocess_text(text.strip()))
                labels.append(label)
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole load.
            print(f"Error reading {file_name}: {e}")

    return texts, labels

# === Step 3: Define Dataset and Model ===

class ResearchPapersDataset(Dataset):
    """Torch dataset that tokenizes one paper per item.

    Every item is a dict with ``input_ids`` and ``attention_mask`` (the
    flattened tokenizer output, padded/truncated to ``max_length``) and
    ``label`` (a ``torch.long`` scalar tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        paper_text, paper_label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer(
            paper_text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )

        # The tokenizer returns a leading batch axis; flatten it away so the
        # DataLoader can stack individual samples itself.
        sample = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(paper_label, dtype=torch.long)
        return sample

class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output logits. Defaults to 2, the binary
            Publishable/Non-Publishable set-up used in this notebook.
        dropout: Dropout probability applied to the pooled encoder output.
        pretrained_name: Hugging Face checkpoint used for the encoder.

    The attribute names ``bert``, ``drop`` and ``out`` are part of the saved
    ``state_dict`` keys and must not be renamed.
    """

    def __init__(self, num_classes=2, dropout=0.3, pretrained_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.drop = nn.Dropout(p=dropout)
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class logits for a batch of token ids."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Index 1 is BERT's pooled output: the [CLS] hidden state after the
        # pooler layer, not the raw [CLS] vector.
        pooled_output = outputs[1]
        return self.out(self.drop(pooled_output))

# === Step 4: Train the Model ===

def train_model(train_loader, val_loader, model, optimizer, criterion, device, epochs=3):
    """Fine-tune ``model`` and print validation accuracy after every epoch.

    Args:
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``label`` entries used for training.
        val_loader: Iterable of batches of the same form used for validation.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The model (on ``device``) after the last epoch.
    """
    model = model.to(device)

    def _on_device(batch):
        # Unpack one batch dict and move its tensors to the target device.
        return (
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['label'].to(device),
        )

    for epoch_idx in range(epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()
            ids, mask, targets = _on_device(batch)

            logits = model(input_ids=ids, attention_mask=mask)
            batch_loss = criterion(logits, targets)

            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        print(f"Epoch {epoch_idx + 1}/{epochs}, Loss: {running_loss / len(train_loader)}")

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        predicted, expected = [], []

        with torch.no_grad():
            for batch in val_loader:
                ids, mask, targets = _on_device(batch)
                logits = model(input_ids=ids, attention_mask=mask)

                predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
                expected.extend(targets.cpu().numpy())

        print("Validation Accuracy:", accuracy_score(expected, predicted))

    return model

# === Step 5: Inference ===

def predict(model, test_loader, device):
    """Return the arg-max class index for every sample in ``test_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must provide ``input_ids`` and ``attention_mask``; predictions
    are collected in loader order as NumPy integer scalars.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            batch_classes = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(batch_classes)

    return predictions

# === Step 6: Test the Model on New PDF ===

def test_new_file(file_path, model, tokenizer, device, max_length=512):
    """Classify a single PDF with a trained model.

    Args:
        file_path: Path of the PDF to classify.
        model: Trained classifier, called as
            ``model(input_ids=..., attention_mask=...)``.
        tokenizer: Tokenizer matching the one used for training.
        device: Device the encoded input is moved to (the model is expected
            to be on that device already).
        max_length: Token length the text is padded/truncated to.

    Returns:
        ``"Publishable"`` when the predicted class is 0, ``"Non-Publishable"``
        otherwise, or ``None`` when the PDF could not be read.
    """
    # Extract and preprocess the text of the PDF.
    try:
        with pdfplumber.open(file_path) as pdf:
            # Join pages with a newline so words are not fused across page
            # boundaries; `or ''` guards against pages for which
            # extract_text() yields None (the join would otherwise raise).
            text = '\n'.join(page.extract_text() or '' for page in pdf.pages)
            processed_text = preprocess_text(text.strip())
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

    # Tokenize exactly as during training.
    encoding = tokenizer(
        processed_text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors="pt",
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Single forward pass without gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs, dim=1).item()

    # NOTE(review): class 0 is reported as "Publishable", yet the training
    # labels give 0 to R001-R005 and 1 to R006-R015 with contradictory
    # comments; confirm this mapping matches the intended label semantics.
    return "Publishable" if prediction == 0 else "Non-Publishable"


# === Main Workflow ===

def main():
    """Train the classifier, save and reload its weights, then classify one PDF."""
    # Folder in Google Drive that holds the labelled PDFs.
    folder_path = '/content/drive/My Drive/Reference_Papers'

    # R001.pdf-R005.pdf carry label 0, R006.pdf-R015.pdf carry label 1
    # (same mapping and order as the previous literal).
    # NOTE(review): the earlier per-file "Publishable"/"Non-Publishable"
    # comments contradicted each other; confirm which label means
    # "Publishable" and that test_new_file reports it the same way.
    labels_dict = {f"R{n:03d}.pdf": (0 if n <= 5 else 1) for n in range(1, 16)}

    # Load and preprocess data directly from the PDF files.
    texts, labels = load_data_from_pdf(folder_path, labels_dict)

    if not texts:
        print("No data found. Please check the folder path and files.")
        return

    # Split data into training and validation sets.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Initialize tokenizer and datasets.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    max_length = 512

    train_dataset = ResearchPapersDataset(train_texts, train_labels, tokenizer, max_length)
    val_dataset = ResearchPapersDataset(val_texts, val_labels, tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)

    # Initialize model, optimizer, and loss function.
    model = BERTClassifier()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Train the model.
    model = train_model(train_loader, val_loader, model, optimizer, criterion, device, epochs=3)

    # Save the model weights.
    torch.save(model.state_dict(), "bert_classifier.pth")
    print("Model saved as bert_classifier.pth")

    # Reload the weights into a fresh model. map_location keeps this working
    # when the checkpoint was written on a GPU but is read on a CPU-only
    # runtime. weights_only=True limits unpickling to tensors and plain
    # containers, which is all a state_dict needs, and avoids the torch.load
    # warning that shows up in this cell's output.
    model = BERTClassifier()
    state_dict = torch.load("bert_classifier.pth", map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.to(device)

    # Classify one file.
    # NOTE(review): a file named R007.pdf is also part of the training set, so
    # this is a smoke test and not an evaluation on unseen data.
    new_file_path = '/content/R007.pdf'  # Specify the new file here
    result = test_new_file(new_file_path, model, tokenizer, device)
    if result is None:
        # test_new_file already printed the read error; do not report "None"
        # as if it were a class.
        print(f"Could not classify {new_file_path}.")
    else:
        print(f"The file is classified as: {result}")


if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The folder exists.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/3, Loss: 0.6551213264465332
Validation Accuracy: 0.6666666666666666
Epoch 2/3, Loss: 0.6568460166454315
Validation Accuracy: 0.6666666666666666
Epoch 3/3, Loss: 0.7432746887207031
Validation Accuracy: 0.6666666666666666
Model saved as bert_classifier.pth


  model.load_state_dict(torch.load("bert_classifier.pth"))


The file is classified as: Publishable


In [None]:
pip install imbalanced-learn




In [None]:
pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
pip install faiss-gpu


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


**WORKING CODE**

In [None]:
import re
import numpy as np
import PyPDF2
from transformers import pipeline, AutoTokenizer, AutoModel
import faiss
from collections import defaultdict
import torch

# Step 1: Initialize the Embedding Model and Text Generation Model
# The tokenizer and encoder come from the same SciBERT checkpoint and are used
# together by create_embedding; all three objects are module-level globals
# read by the functions below.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
embedding_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
text_generation_model = pipeline("text-generation", model="gpt2")  # GPT-2 for rationale generation

# Step 2: Function to Extract Text from PDFs
def preprocess_text(text):
    """Clean text extracted from a PDF.

    Works on the raw, line-structured text in three steps:

    1. Everything from the last stand-alone "References" heading line
       (optionally numbered, case-insensitive) to the end is removed.
    2. Lines that start with "Figure <n>" or "Table <n>" (captions) are
       removed.
    3. All runs of whitespace, including newlines, collapse to one space.

    The regexes must run *before* whitespace is collapsed: on a single-line
    string ``.*`` reaches the end of the document, so the first in-text
    mention of "references" or "Figure 1" would delete the rest of the paper.

    Args:
        text: Raw extracted text, ideally still containing line breaks.

    Returns:
        The cleaned text as a single line.
    """
    # Only a line consisting of the heading itself counts, so in-sentence
    # uses such as "this references prior work" are left alone.
    headings = list(
        re.finditer(r"(?im)^[ \t]*(?:\d+\.?[ \t]*)?references[ \t]*:?[ \t]*$", text)
    )
    if headings:
        text = text[:headings[-1].start()]

    # Drop caption lines only; "as shown in Figure 2" inside a sentence stays.
    text = re.sub(r"(?m)^[ \t]*(?:Figure|Table) \d+.*$", "", text)

    return " ".join(text.split())

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and preprocess it.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, passed through ``preprocess_text``.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page for which no text is returned.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    return preprocess_text("\n".join(page_texts).strip())

# Step 3: Text Chunking Function (Handle Large Texts)
def chunk_text(text, chunk_size=512):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are whitespace-separated substrings, not tokenizer tokens. Returns
    a list of strings, each re-joined with single spaces; the list is empty
    when ``text`` contains no words.
    """
    words = text.split()
    pieces = []
    for start in range(0, len(words), chunk_size):
        pieces.append(" ".join(words[start:start + chunk_size]))
    return pieces

def create_embedding(text):
    """Embed a document as the mean of its chunk embeddings.

    The text is split into 512-word chunks. Each chunk is encoded with the
    module-level ``tokenizer``/``embedding_model`` and mean-pooled over the
    sequence axis; the chunk vectors are then averaged.

    Args:
        text: Document text to embed.

    Returns:
        A 2-D NumPy array with a single row.

    Raises:
        ValueError: If ``text`` contains no words.
    """
    chunks = chunk_text(text, chunk_size=512)
    if not chunks:
        # np.mean over an empty list would yield a NaN scalar that reshapes
        # to (1, 1) and only fails later, inside FAISS, with an opaque error.
        raise ValueError("Cannot create an embedding: the text contains no words.")

    chunk_vectors = []
    for chunk in chunks:
        # NOTE(review): chunks are 512 *words* but the tokenizer truncates at
        # 512 *tokens*; if a chunk tokenizes to more than that, its tail is
        # dropped from the embedding. Confirm whether that loss is acceptable.
        tokens = tokenizer(chunk, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            embeddings = embedding_model(**tokens).last_hidden_state.mean(dim=1).detach().numpy()
        chunk_vectors.append(embeddings)

    # Average over chunks and make sure the result is 2-D.
    return np.mean(chunk_vectors, axis=0).reshape(1, -1)

# Step 4: Custom VectorStore Implementation with FAISS for Similarity Search
class SimpleVectorStore:
    """Minimal vector store backed by a flat L2 FAISS index.

    ``self.vectors`` holds one dict per stored vector (``key``, ``vector``,
    ``metadata``) in insertion order, so position ``i`` in the list
    corresponds to row ``i`` of the FAISS index.
    """

    def __init__(self, dimension=768):
        self.vectors = []  # entries aligned with the rows of self.index
        self.metadata = []  # not used by this class; kept for compatibility
        self.index = faiss.IndexFlatL2(dimension)  # 768-dim vectors for SciBERT

    def add_vector(self, key, vector, metadata=None):
        """Add one vector under ``key`` with optional ``metadata``.

        A 1-D vector is reshaped to a single row before it is added.
        """
        if len(vector.shape) == 1:
            vector = vector.reshape(1, -1)
        # Add to the index first: if FAISS rejects the vector (for example a
        # wrong dimension) the list must not grow, or list positions and
        # index rows would no longer line up.
        self.index.add(vector.astype(np.float32))
        self.vectors.append({"key": key, "vector": vector, "metadata": metadata})

    def search(self, query_vector=None, key=None, top_k=10):
        """Look up entries by key or by similarity.

        Args:
            query_vector: Vector to compare against the index (1-D or 2-D).
            key: When truthy, return every stored entry with exactly this
                key and ignore ``query_vector``.
            top_k: Maximum number of neighbours for a similarity search.

        Returns:
            For a key lookup, the stored entry dicts. For a similarity
            search, dicts with ``key``, ``score`` (``1 / (1 + distance)``)
            and ``metadata``, nearest first. An empty list when neither
            argument is given.
        """
        if key:
            return [entry for entry in self.vectors if entry["key"] == key]
        if query_vector is None:
            return []

        if query_vector.ndim == 1:
            query_vector = query_vector.reshape(1, -1)
        distances, indices = self.index.search(query_vector.astype(np.float32), top_k)

        results = []
        for rank, idx in enumerate(indices[0]):
            # FAISS pads the result with -1 when fewer than top_k vectors are
            # stored; indexing the list with -1 would silently return the
            # last entry as a bogus hit.
            if idx < 0:
                continue
            entry = self.vectors[idx]
            results.append({
                "key": entry["key"],
                "score": 1 / (1 + distances[0][rank]),
                "metadata": entry["metadata"],
            })
        return results
# Initialize the custom vector store
# Module-level instance shared by store_conference_papers,
# match_to_conference and generate_rationale.
vector_store = SimpleVectorStore()

# Step 5: Store Conference Papers in the VectorStore
def store_conference_papers(conference_papers):
    """Embed every listed PDF and add it to the module-level ``vector_store``.

    Args:
        conference_papers: Mapping of conference name -> list of PDF paths.
            Each paper is stored under its conference name, with the
            extracted text kept as ``metadata["text"]``.
    """
    labelled_paths = (
        (conference, path)
        for conference, paths in conference_papers.items()
        for path in paths
    )
    for conference, path in labelled_paths:
        body = extract_text_from_pdf(path)
        embedding = create_embedding(body)
        vector_store.add_vector(key=conference, vector=embedding, metadata={"text": body})
    print("All conference papers have been stored in the VectorStore.")

# Step 6: Define Conference Keywords
# Conference name -> lower-case key phrases. compute_keyword_overlap counts
# how many of a conference's phrases occur as substrings of a paper's
# lower-cased text, so the phrases must stay lower-case to be matchable.
conference_keywords = {
    "CVPR": ["object detection", "image segmentation", "computer vision tasks", "convolutional networks"],
    "EMNLP": ["language models", "semantic parsing", "text classification", "token embeddings"],
    "KDD": ["data clustering", "knowledge discovery", "graph mining", "recommendation systems"],
    "NeurIPS": ["stochastic gradient descent", "adversarial training", "multi-agent systems"],
    "TMLR": ["optimization techniques", "mathematical proofs", "theoretical guarantees", "learning rates"]
}


def compute_keyword_overlap(text, conference):
    """Count how many of a conference's key phrases occur in ``text``.

    Matching is a case-insensitive substring test against the phrases listed
    for ``conference`` in the module-level ``conference_keywords``.

    Args:
        text: Paper text to scan.
        conference: Conference name to look up.

    Returns:
        The number of matching phrases; 0 when the conference has no
        keyword list.
    """
    # Lower-case once instead of once per keyword.
    lowered = text.lower()
    # A conference without a keyword entry contributes no overlap instead of
    # aborting the whole match with a KeyError.
    phrases = conference_keywords.get(conference, ())
    return sum(1 for phrase in phrases if phrase in lowered)

# Step 7: Match a New Paper to a Conference
def match_to_conference(new_pdf_path):
    """Pick the conference whose stored papers best match a new PDF.

    The score of a conference is the sum of the similarity scores of its
    papers among the 10 nearest neighbours, plus the number of that
    conference's key phrases found in the new paper. Only conferences that
    appear among the neighbours are considered.

    Args:
        new_pdf_path: Path of the PDF to match.

    Returns:
        ``(best_conference, best_score, rationale)``.
    """
    new_text = extract_text_from_pdf(new_pdf_path)
    neighbours = vector_store.search(query_vector=create_embedding(new_text), top_k=10)

    # Accumulate neighbour similarity per conference.
    similarity_sums = defaultdict(float)
    for hit in neighbours:
        similarity_sums[hit['key']] += hit['score']

    # Add the keyword bonus for every conference seen so far.
    for name in similarity_sums:
        similarity_sums[name] += compute_keyword_overlap(new_text, name)

    # Highest total wins; ties go to the conference encountered first.
    best_conference, best_score = max(similarity_sums.items(), key=lambda pair: pair[1])

    return best_conference, best_score, generate_rationale(best_conference, new_text)

# Step 8: Generate Rationale for the Matched Conference
def generate_rationale(conference, new_paper_text):
    """Generate free text explaining why the paper matches ``conference``.

    The prompt combines the first 500 characters of the new paper with the
    first 500 characters of the concatenated texts stored for the conference,
    and is passed to the module-level text-generation pipeline.

    Returns:
        The ``generated_text`` of the first returned sequence.
    """
    stored_entries = vector_store.search(query_vector=None, key=conference, top_k=10)
    conference_text = " ".join(entry['metadata']['text'] for entry in stored_entries)

    paper_excerpt = new_paper_text[:500]
    conference_excerpt = conference_text[:500]
    input_text = f"The new paper's content is as follows: {paper_excerpt}... The conference {conference} focuses on {conference_excerpt}..."

    generations = text_generation_model(input_text, max_new_tokens=200, num_return_sequences=1)
    return generations[0]['generated_text']

# Step 9: Define File Paths for Conference Papers
# Conference name -> PDFs stored as examples of that conference. The names
# must match the keys of conference_keywords, because
# compute_keyword_overlap looks the conference up there.
conference_papers = {
    "CVPR": ["/content/R006.pdf", "/content/R007.pdf", "/content/cvpr7.pdf" , "/content/cvpr6.pdf", "/content/cvpr5.pdf"],
    "EMNLP": ["/content/R008.pdf", "/content/R009.pdf", "/content/emnlp5.pdf", "/content/emnlp6.pdf", "/content/emnlp7.pdf"],
    "KDD": ["/content/R010.pdf", "/content/R011.pdf", "/content/kdd6.pdf", "/content/kdd7.pdf", "/content/kdd5.pdf"],
    "NeurIPS": ["/content/R012.pdf", "/content/R013.pdf", "/content/neurlps7.pdf", "/content/neurlps5.pdf", "/content/neurlps6.pdf"],
    "TMLR": ["/content/R014.pdf", "/content/R015.pdf", "/content/tmlr7.pdf", "/content/tmlr5.pdf", "/content/tmlr6.pdf"]
}

# Step 10: Store the Papers in the VectorStore
# Reads and embeds every PDF listed above; runs when the cell is executed.
store_conference_papers(conference_papers)

# Step 11: Match a New Paper to a Conference and Generate Rationale
new_paper_path = "/content/P104.pdf"
matched_conference, total_score, rationale = match_to_conference(new_paper_path)

# total_score is the summed neighbour similarity plus the keyword-overlap count.
print(f"The new paper is most similar to Conference {matched_conference} with a total similarity score of {total_score:.2f}")
print(f"Rationale for matching the paper to Conference {matched_conference}: {rationale}")


Device set to use cuda:0


All conference papers have been stored in the VectorStore.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The new paper is most similar to Conference EMNLP with a total similarity score of 1.11
Rationale for matching the paper to Conference EMNLP: The new paper's content is as follows: Enhancing Self-Consistency and Performance of Pre-Trained Language Models through Natural Language Inference Abstract While large pre-trained language models are powerful, their predictions often lack logical consistency across test inputs. For example, a state-of-the-art Macaw question-answering (QA) model answers Yes to Is a sparrow a bird? and Does a bird have feet? but answers No to Does a sparrow have feet?. To address this failure mode, we propose a framework, Consistency Correction throug... The conference EMNLP focuses on Advanced techniques for through and contextually Interpreting Noun-Noun Compounds Abstract This study examines the effectiveness of transfer learning and multi-task learning in the context of a complex semantic classification problem: understanding the meaning of noun-noun compounds

**WORKING CODE 1**

In [None]:
import re
import numpy as np
import PyPDF2
from transformers import pipeline, AutoTokenizer, AutoModel
import faiss
from collections import defaultdict
import torch

# Step 1: Initialize the Embedding Model and Text Generation Model
# The tokenizer and encoder come from the same SciBERT checkpoint and are used
# together by create_embedding; the GPT-2 pipeline is used by
# generate_rationale. All three are module-level globals.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
embedding_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
text_generation_model = pipeline("text-generation", model="gpt2")

# Step 2: Function to Extract Text from PDFs
def preprocess_text(text):
    """Clean text extracted from a PDF.

    Works on the raw, line-structured text in three steps:

    1. Everything from the last stand-alone "References" heading line
       (optionally numbered, case-insensitive) to the end is removed.
    2. Lines that start with "Figure <n>" or "Table <n>" (captions) are
       removed.
    3. All runs of whitespace, including newlines, collapse to one space.

    The regexes must run *before* whitespace is collapsed: on a single-line
    string ``.*`` reaches the end of the document, so the first in-text
    mention of "references" or "Figure 1" would delete the rest of the paper.

    Args:
        text: Raw extracted text, ideally still containing line breaks.

    Returns:
        The cleaned text as a single line.
    """
    # Only a line consisting of the heading itself counts, so in-sentence
    # uses such as "this references prior work" are left alone.
    headings = list(
        re.finditer(r"(?im)^[ \t]*(?:\d+\.?[ \t]*)?references[ \t]*:?[ \t]*$", text)
    )
    if headings:
        text = text[:headings[-1].start()]

    # Drop caption lines only; "as shown in Figure 2" inside a sentence stays.
    text = re.sub(r"(?m)^[ \t]*(?:Figure|Table) \d+.*$", "", text)

    return " ".join(text.split())

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and preprocess it.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, passed through ``preprocess_text``.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page for which no text is returned.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    return preprocess_text("\n".join(page_texts).strip())

# Step 3: Text Chunking Function (Handle Large Texts)
def chunk_text(text, chunk_size=512):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are whitespace-separated substrings, not tokenizer tokens. Returns
    a list of strings, each re-joined with single spaces; the list is empty
    when ``text`` contains no words.
    """
    words = text.split()
    pieces = []
    for start in range(0, len(words), chunk_size):
        pieces.append(" ".join(words[start:start + chunk_size]))
    return pieces

def create_embedding(text):
    """Embed a document as the mean of its chunk embeddings.

    The text is split into 512-word chunks. Each chunk is encoded with the
    module-level ``tokenizer``/``embedding_model`` and mean-pooled over the
    sequence axis; the chunk vectors are then averaged.

    Args:
        text: Document text to embed.

    Returns:
        A 2-D NumPy array with a single row.

    Raises:
        ValueError: If ``text`` contains no words.
    """
    chunks = chunk_text(text, chunk_size=512)
    if not chunks:
        # np.mean over an empty list would yield a NaN scalar that reshapes
        # to (1, 1) and only fails later, inside FAISS, with an opaque error.
        raise ValueError("Cannot create an embedding: the text contains no words.")

    chunk_vectors = []
    for chunk in chunks:
        # NOTE(review): chunks are 512 *words* but the tokenizer truncates at
        # 512 *tokens*; if a chunk tokenizes to more than that, its tail is
        # dropped from the embedding. Confirm whether that loss is acceptable.
        tokens = tokenizer(chunk, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            embeddings = embedding_model(**tokens).last_hidden_state.mean(dim=1).detach().numpy()
        chunk_vectors.append(embeddings)

    # Average over chunks and make sure the result is 2-D.
    return np.mean(chunk_vectors, axis=0).reshape(1, -1)

# Step 4: Custom VectorStore Implementation with FAISS for Similarity Search
class SimpleVectorStore:
    """Minimal in-memory vector store backed by a FAISS ``IndexFlatL2``.

    Every stored vector has one record in ``self.vectors`` and one row in
    ``self.index``; record ``i`` describes index row ``i``. Lookups by
    similarity rely on that positional correspondence.
    """

    def __init__(self, dimension=768):
        """Create an empty store for vectors of length ``dimension``."""
        # Records of the form {"key", "vector", "metadata"}, in insertion order.
        self.vectors = []
        # Not used by this class; kept so existing attribute access still works.
        self.metadata = []
        self.index = faiss.IndexFlatL2(dimension)

    def add_vector(self, key, vector, metadata=None):
        """Store one vector under ``key``.

        Args:
            key: Label returned with search hits (several vectors may share
                a key).
            vector: NumPy array, either 1-D or 2-D with exactly one row.
            metadata: Arbitrary payload returned with search hits.

        Raises:
            ValueError: If ``vector`` holds more than one row, which would
                break the one-record-per-index-row correspondence.
        """
        if vector.ndim == 1:
            vector = vector.reshape(1, -1)
        if vector.ndim != 2 or vector.shape[0] != 1:
            raise ValueError(
                f"Expected a single vector, got an array of shape {vector.shape}."
            )
        # Add to the index first: if FAISS rejects the vector (e.g. wrong
        # dimension) no record is appended and both sides stay aligned.
        self.index.add(vector.astype(np.float32))
        self.vectors.append({"key": key, "vector": vector, "metadata": metadata})

    def search(self, query_vector=None, key=None, top_k=10):
        """Look records up by key, or find the nearest stored vectors.

        Args:
            query_vector: NumPy array to search with; used only when ``key``
                is not given.
            key: If truthy, return every stored record with this key
                (``top_k`` is ignored in this mode).
            top_k: Maximum number of nearest neighbours to return.

        Returns:
            With ``key``: the matching records as stored (``key``,
            ``vector``, ``metadata``). With ``query_vector``: dicts with
            ``key``, ``score`` and ``metadata``, nearest first, where
            ``score`` is ``1 / (1 + distance)``. With neither: an empty
            list.
        """
        if key:
            return [v for v in self.vectors if v["key"] == key]
        if query_vector is None:
            return []

        if query_vector.ndim == 1:
            query_vector = query_vector.reshape(1, -1)
        distances, indices = self.index.search(query_vector.astype(np.float32), top_k)

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            # FAISS pads the result with label -1 when the index holds fewer
            # than top_k vectors; indexing self.vectors with it would
            # silently return the last record instead.
            if idx < 0:
                continue
            record = self.vectors[idx]
            results.append(
                {
                    "key": record["key"],
                    "score": 1 / (1 + distance),
                    "metadata": record["metadata"],
                }
            )
        return results

# Module-level store shared by the indexing and matching functions below;
# 768 is the store's default dimension, spelled out here for visibility.
vector_store = SimpleVectorStore(dimension=768)

# Step 5: Store Conference Papers in the VectorStore
def store_conference_papers(conference_papers):
    """Embed every listed PDF and add it to ``vector_store``.

    Args:
        conference_papers: Mapping from a conference label to an iterable of
            PDF paths. Each paper is stored under its conference label, with
            its extracted text as metadata.
    """
    labelled_paths = (
        (label, path)
        for label, paths in conference_papers.items()
        for path in paths
    )
    for label, path in labelled_paths:
        body = extract_text_from_pdf(path)
        embedding = create_embedding(body)
        vector_store.add_vector(key=label, vector=embedding, metadata={"text": body})
    print("All conference papers have been stored in the VectorStore.")

# Step 6: Define Conference Keywords
# Keyword phrases per conference; compute_keyword_overlap() counts how many of
# a conference's phrases occur in a paper's text.
conference_keywords = {
    "CVPR": ["object detection", "image segmentation", "computer vision tasks", "convolutional networks"],
    "EMNLP": ["language models", "semantic parsing", "text classification", "token embeddings"],
    "KDD": ["data clustering", "knowledge discovery", "graph mining", "recommendation systems"],
    "NeurIPS": ["stochastic gradient descent", "adversarial training", "multi-agent systems", "gradient stability"],
    "TMLR": ["optimization techniques", "mathematical proofs", "theoretical guarantees", "learning rates"]
}

def compute_keyword_overlap(text, conference):
    """Count how many of a conference's keyword phrases occur in ``text``.

    Matching is a case-insensitive substring test, so each phrase counts at
    most once however often it appears.

    Args:
        text: Paper text to scan.
        conference: Conference label to look up in ``conference_keywords``.

    Returns:
        The number of matching phrases; 0 for a conference that has no entry
        in ``conference_keywords`` (a label without keywords cannot overlap,
        and should not abort the matching of a paper).
    """
    # Lower-case once instead of once per keyword.
    lowered = text.lower()
    keywords = conference_keywords.get(conference, ())
    return sum(1 for phrase in keywords if phrase in lowered)

# Step 7: Match a New Paper to a Conference
def match_to_conference(new_pdf_path):
    """Pick the conference whose stored papers best match a new PDF.

    The PDF is embedded and its 10 nearest stored papers are retrieved. Only
    conferences that appear among those neighbours are scored: the score is
    the sum of the neighbours' similarity scores, plus 0.5 for every keyword
    phrase found in the paper, plus a fixed 0.3 bonus for NeurIPS. The
    highest score wins, except that an EMNLP win over a second-placed
    NeurIPS by a margin below 0.1 is handed to NeurIPS.

    Args:
        new_pdf_path: Path of the PDF to classify.

    Returns:
        A ``(conference, score, rationale)`` tuple, where ``rationale`` is
        the text produced by ``generate_rationale`` for the chosen
        conference.
    """
    paper_text = extract_text_from_pdf(new_pdf_path)
    paper_vector = create_embedding(paper_text)
    neighbours = vector_store.search(query_vector=paper_vector, top_k=10)

    # Accumulate neighbour similarity per conference label.
    similarity_sums = defaultdict(float)
    for hit in neighbours:
        similarity_sums[hit['key']] += hit['score']

    # Apply the keyword boost first and the NeurIPS bias second, so each
    # conference's total is built up in the same order as before.
    for name in similarity_sums:
        similarity_sums[name] += 0.5 * compute_keyword_overlap(paper_text, name)
        if name == "NeurIPS":
            similarity_sums[name] += 0.3

    ranking = sorted(similarity_sums.items(), key=lambda item: item[1], reverse=True)
    best_conference, best_score = ranking[0]

    if len(ranking) > 1:
        runner_up, runner_up_score = ranking[1]
        near_tie = (best_score - runner_up_score) < 0.1
        # Tie-break rule: a narrow EMNLP lead over NeurIPS goes to NeurIPS.
        if best_conference == "EMNLP" and runner_up == "NeurIPS" and near_tie:
            best_conference, best_score = runner_up, runner_up_score

    return best_conference, best_score, generate_rationale(best_conference, paper_text)

# Step 8: Generate Rationale for the Matched Conference
def generate_rationale(conference, new_paper_text):
    """Generate free text relating a paper to the chosen conference.

    A prompt is built from the first 500 characters of the new paper and the
    first 500 characters of the conference's stored paper texts (joined with
    spaces), then passed to ``text_generation_model``.

    NOTE(review): the returned ``generated_text`` appears to begin with the
    prompt itself rather than holding only newly generated text — confirm
    whether that is intended.

    Args:
        conference: Label whose stored papers describe the conference.
        new_paper_text: Full text of the paper being matched.

    Returns:
        The ``generated_text`` of the first sequence the pipeline returns.
    """
    stored = vector_store.search(query_vector=None, key=conference, top_k=10)
    conference_text = " ".join(entry['metadata']['text'] for entry in stored)

    paper_excerpt = new_paper_text[:500]
    conference_excerpt = conference_text[:500]
    input_text = (
        f"The new paper's content is as follows: {paper_excerpt}... "
        f"The conference {conference} focuses on {conference_excerpt}..."
    )

    outputs = text_generation_model(input_text, max_new_tokens=200, num_return_sequences=1)
    first_sequence = outputs[0]
    return first_sequence['generated_text']

# Step 9: Define File Paths for Conference Papers
# Maps each conference label to the PDFs used as its reference examples; the
# label is the key stored alongside each paper's embedding.
# NOTE(review): the NeurIPS files are spelled "neurlps" below — confirm that
# this matches the names of the uploaded files.
conference_papers = {
    "CVPR": ["/content/R006.pdf", "/content/R007.pdf", "/content/cvpr7.pdf", "/content/cvpr6.pdf", "/content/cvpr5.pdf"],
    "EMNLP": ["/content/R008.pdf", "/content/R009.pdf", "/content/emnlp5.pdf", "/content/emnlp6.pdf", "/content/emnlp7.pdf"],
    "KDD": ["/content/R010.pdf", "/content/R011.pdf", "/content/kdd6.pdf", "/content/kdd7.pdf", "/content/kdd5.pdf"],
    "NeurIPS": ["/content/R012.pdf", "/content/R013.pdf", "/content/neurlps7.pdf", "/content/neurlps5.pdf", "/content/neurlps6.pdf"],
    "TMLR": ["/content/R014.pdf", "/content/R015.pdf", "/content/tmlr7.pdf", "/content/tmlr5.pdf", "/content/tmlr6.pdf"]
}

# Step 10: Store the Papers in the VectorStore
# Must run before Step 11: match_to_conference() searches the store filled here.
store_conference_papers(conference_papers)

# Step 11: Match a New Paper to a Conference and Generate Rationale
new_paper_path = "/content/P010.pdf"
matched_conference, total_score, rationale = match_to_conference(new_paper_path)

# total_score is the aggregated value returned by match_to_conference (summed
# neighbour similarities plus its keyword and bias adjustments), not a single
# pairwise similarity.
print(f"The new paper is most similar to Conference {matched_conference} with a total similarity score of {total_score:.2f}")
print(f"Rationale for matching the paper to Conference {matched_conference}: {rationale}")


Device set to use cpu


All conference papers have been stored in the VectorStore.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The new paper is most similar to Conference NeurIPS with a total similarity score of 0.36
Rationale for matching the paper to Conference NeurIPS: The new paper's content is as follows: Enhanced Reinforcement Learning for Recommender Systems: Maximizing Sample Efficiency and Minimizing Variance Abstract Optimizing long-term user satisfaction in recommender systems, such as news feeds, is crucial during continuous user-system interactions. Reinforcement learning has shown promise in addressing this challenge. However, practical hurdles like low sample efficiency, potential risks, and high variance hinder the implementation of deep reinforcement learning in online systems. We int... The conference NeurIPS focuses on Safe Predictors for Input-Output Specification Enforcement Abstract This paper presents an approach for designing neural networks, along with other machine learning models, which adhere to a collection of input-output specifica- tions. Our method involves the construction of a