<a href="https://colab.research.google.com/github/DPravallika2005/SmartPaper/blob/main/Publishable_or_not.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import re
import numpy as np
import PyPDF2
import joblib
from transformers import pipeline, AutoTokenizer, AutoModel
import faiss
from collections import defaultdict
import torch
import csv

from google.colab import drive

# Mount Google Drive so the pickled models and the test papers under
# "/content/drive/My Drive" are reachable from this runtime.
drive.mount('/content/drive')

# Step 1: Initialize Models and Utilities
# Load the pre-trained classifier and its vectorizer used for the
# publishability check.
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load
# files from a trusted source.
classifier = joblib.load("/content/drive/My Drive/research_paper_classifier.pkl")
vectorizer = joblib.load("/content/drive/My Drive/tfidf_vectorizer.pkl")

# Tokenizer and encoder used by create_embedding to embed paper text.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
embedding_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
# NOTE(review): text_generation_model is not referenced anywhere in the
# visible part of this file - confirm it is still needed before keeping the
# download.
text_generation_model = pipeline("text-generation", model="gpt2")
# Summarization pipeline used by generate_rationale to produce the rationale text.
summarization_model = pipeline("summarization", model="facebook/bart-large-cnn")

# Keyword phrases per conference. compute_keyword_overlap tests each phrase as
# a substring of the lower-cased paper text, so phrases are written in lower
# case.
conference_keywords = {
    "CVPR": ["object detection", "image segmentation", "computer vision tasks", "convolutional networks"],
    "EMNLP": ["language models", "semantic parsing", "text classification", "token embeddings"],
    "KDD": ["data clustering", "knowledge discovery", "graph mining", "recommendation systems"],
    "NeurIPS": ["stochastic gradient descent", "adversarial training", "multi-agent systems", "gradient stability"],
    "TMLR": ["optimization techniques", "mathematical proofs", "theoretical guarantees", "learning rates"]
}

# Reference papers for each conference (provided by the user). The dictionary
# keys are used as the vector-store keys when the papers are indexed, and as
# the conference labels written to the result CSV.
# NOTE(review): the NeurIPS file names are spelled "neurlps" - presumably this
# matches the uploaded files; verify before renaming anything.
conference_papers = {
    "CVPR": ["/content/R006.pdf", "/content/R007.pdf", "/content/cvpr7.pdf", "/content/cvpr6.pdf", "/content/cvpr5.pdf"],
    "EMNLP": ["/content/R008.pdf", "/content/R009.pdf", "/content/emnlp5.pdf", "/content/emnlp6.pdf", "/content/emnlp7.pdf"],
    "KDD": ["/content/R010.pdf", "/content/R011.pdf", "/content/kdd6.pdf", "/content/kdd7.pdf", "/content/kdd5.pdf"],
    "NeurIPS": ["/content/R012.pdf", "/content/R013.pdf", "/content/neurlps7.pdf", "/content/neurlps5.pdf", "/content/neurlps6.pdf"],
    "TMLR": ["/content/R014.pdf", "/content/R015.pdf", "/content/tmlr7.pdf", "/content/tmlr5.pdf", "/content/tmlr6.pdf"]
}

# Custom VectorStore Implementation with FAISS
class SimpleVectorStore:
    """Minimal in-memory vector store backed by a flat L2 FAISS index.

    Every vector is kept twice: in ``self.index`` for nearest-neighbour
    search, and in ``self.vectors`` (same insertion order) so that a row
    number returned by FAISS can be mapped back to its key and metadata.
    """

    def __init__(self, dimension=768):
        """Create an empty store for vectors of length ``dimension``."""
        self.vectors = []
        self.metadata = []  # never populated by this class; kept for compatibility
        self.index = faiss.IndexFlatL2(dimension)

    # The initializer was previously spelled ``init`` and therefore never ran
    # on construction. The old name stays as an alias so code that called
    # ``store.init()`` explicitly keeps working.
    init = __init__

    def add_vector(self, key, vector, metadata=None):
        """Store one vector under ``key``.

        Args:
            key: Label returned with search hits (several vectors may share it).
            vector: Array-like holding a single vector, either flat or as a
                one-row matrix.
            metadata: Optional payload returned alongside search hits.
        """
        # Force exactly one row per call so that index rows and the entries
        # of self.vectors can never drift out of step.
        row = np.asarray(vector).reshape(1, -1)
        self.vectors.append({"key": key, "vector": row, "metadata": metadata})
        self.index.add(np.ascontiguousarray(row, dtype=np.float32))

    def search(self, query_vector=None, key=None, top_k=10):
        """Look up entries by exact key or by vector similarity.

        Args:
            query_vector: Vector to search with; ignored when ``key`` is given.
            key: When truthy, return every stored entry with this exact key.
            top_k: Maximum number of neighbours for a vector search.

        Returns:
            For a key lookup, the stored entry dicts (``key``, ``vector``,
            ``metadata``). For a vector search, a list of dicts with ``key``,
            ``score`` (``1 / (1 + L2 distance)``, so larger is closer) and
            ``metadata``, nearest first. An empty list when neither argument
            is supplied.
        """
        if key:
            return [entry for entry in self.vectors if entry["key"] == key]
        if query_vector is None:
            return []

        query = np.ascontiguousarray(query_vector, dtype=np.float32)
        if query.ndim == 1:
            query = query.reshape(1, -1)
        distances, indices = self.index.search(query, top_k)

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            # FAISS pads the result with -1 when the index holds fewer than
            # top_k vectors; indexing self.vectors with -1 would silently
            # return the last stored entry.
            if idx < 0:
                continue
            entry = self.vectors[idx]
            results.append(
                {"key": entry["key"], "score": 1 / (1 + distance), "metadata": entry["metadata"]}
            )
        return results

# Shared store instance: classify_and_match_papers adds the reference-paper
# embeddings to it and match_to_conference queries it.
vector_store = SimpleVectorStore()

# Step 2: Function to Extract Text from PDFs
# A caption is a line that begins with "Figure N" or "Table N".
_CAPTION_LINE_RE = re.compile(r"^[ \t]*(?:Figure|Table) \d+.*$", re.MULTILINE)
# The bibliography starts at a line holding only an (optionally numbered)
# "References" heading.
_REFERENCES_HEADING_RE = re.compile(
    r"^[ \t]*(?:\d+\.?[ \t]+)?References[ \t\r]*$", re.IGNORECASE | re.MULTILINE
)


def preprocess_text(text):
    """Clean the raw text extracted from a PDF.

    The references section and figure/table caption lines are removed while
    the text still has its line breaks, and only then is all whitespace
    collapsed to single spaces. Collapsing first would turn the paper into
    one line, so a pattern ending in ``.*`` would delete everything from the
    first figure mention (or the first use of the word "references") to the
    end of the paper.

    Args:
        text: Raw text, expected to still contain its line breaks.

    Returns:
        The cleaned text as a single line. Input that has no line breaks is
        only whitespace-normalised, because headings and captions cannot be
        told apart from running text in that case.
    """
    heading = _REFERENCES_HEADING_RE.search(text)
    if heading:
        text = text[:heading.start()]  # drop the heading and everything after it
    text = _CAPTION_LINE_RE.sub("", text)
    return " ".join(text.split())  # collapse runs of spaces and newlines

def extract_text_from_pdf(pdf_path):
    """Extract and preprocess the text of a PDF file.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of all pages, cleaned by ``preprocess_text``.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Pages are read while the file is still open. ``or ""`` keeps the
        # join safe should a page yield no text object.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page is not glued to the
    # first word of the next.
    return preprocess_text("\n".join(page_texts).strip())

# Step 3: Function to Generate Embeddings
def create_embedding(text):
    """Embed ``text`` as one mean-pooled vector.

    Whitespace is normalised, the text is tokenized with truncation to 512
    tokens (anything beyond that is ignored; the text is not split into
    chunks), and the encoder's last hidden state is averaged over dimension 1.

    Returns:
        The pooled embedding as a NumPy array - presumably of shape
        (1, hidden_size); confirm against the loaded model.
    """
    normalized = " ".join(text.split())
    encoded = tokenizer(normalized, truncation=True, max_length=512, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = embedding_model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.detach().numpy()

# Step 4: Conference Matching Function
def compute_keyword_overlap(text, conference):
    """Count how many of a conference's keyword phrases occur in ``text``.

    Matching is a case-insensitive substring test; each phrase counts at most
    once, however often it appears.

    Args:
        text: Paper text to scan.
        conference: Key into ``conference_keywords``.

    Returns:
        Number of distinct keyword phrases found.

    Raises:
        KeyError: If ``conference`` has no keyword list.
    """
    lowered = text.lower()  # lower-case the (long) paper text once, not per keyword
    return sum(1 for phrase in conference_keywords[conference] if phrase.lower() in lowered)

def generate_rationale(conference, new_paper_text):
    """Generate a short rationale for matching a paper to a conference.

    A prompt is built from the first 300 characters of the new paper and the
    first 300 characters of the conference's reference-paper text, then run
    through the summarization pipeline. The pipeline is asked for exactly
    100 tokens (``min_length == max_length == 100``), which is not the same
    as 100 words.

    Args:
        conference: Key into ``conference_papers``.
        new_paper_text: Preprocessed text of the paper being matched.

    Returns:
        The generated rationale string.
    """
    excerpt_chars = 300

    # Only the first `excerpt_chars` characters of the combined reference
    # text reach the prompt, so stop parsing reference PDFs as soon as that
    # much text is available instead of extracting every paper on each call.
    reference_texts = []
    conference_text = ""
    for paper in conference_papers[conference]:
        reference_texts.append(extract_text_from_pdf(paper))
        conference_text = " ".join(reference_texts)
        if len(conference_text) >= excerpt_chars:
            break

    # Input for rationale generation
    input_text = (
        f"The proposed paper introduces key concepts that align with the topics emphasized by {conference}. "
        f"It highlights: {new_paper_text[:excerpt_chars]}... "
        f"The conference focuses on areas like: {conference_text[:excerpt_chars]}. "
        f"Generate a coherent, formal, and concise 100-word paragraph explaining why this paper is suitable for the conference."
    )

    # Deterministic decoding (do_sample=False) keeps reruns reproducible.
    rationale_output = summarization_model(
        input_text, max_length=100, min_length=100, do_sample=False
    )
    return rationale_output[0]['summary_text']

# Pick the best-fitting conference for a paper and explain the choice.
def match_to_conference(new_pdf_path):
    """Match the paper at ``new_pdf_path`` to a conference.

    The paper's embedding is compared with the stored reference embeddings
    (10 nearest neighbours). Neighbour scores are summed per conference, and
    every conference that appears among the neighbours receives a bonus of
    0.5 per matching keyword phrase. The highest total wins.

    Returns:
        A tuple ``(conference, score, rationale)``.
    """
    paper_text = extract_text_from_pdf(new_pdf_path)
    neighbours = vector_store.search(query_vector=create_embedding(paper_text), top_k=10)

    # Accumulate the similarity of every neighbour under its conference.
    scores = defaultdict(float)
    for hit in neighbours:
        scores[hit['key']] += hit['score']

    # Keyword bonus, only for conferences that showed up among the neighbours.
    for name in scores:
        scores[name] += 0.5 * compute_keyword_overlap(paper_text, name)

    ranking = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    best_conference, best_score = ranking[0]

    return best_conference, best_score, generate_rationale(best_conference, paper_text)

# Classify every PDF in a folder and write the results to a CSV file.
def classify_and_match_papers(
    input_folder,
    output_csv="/content/drive/My Drive/research_paper_classification.csv",
):
    """Classify all PDFs in ``input_folder`` and save the results as CSV.

    The reference papers are first embedded and added to the shared vector
    store. Each PDF in the folder is then run through the publishability
    classifier; publishable papers are matched to a conference and given a
    rationale, all others are recorded with "NA".

    Note that every call adds the reference papers to the shared store again.

    Args:
        input_folder: Directory containing the PDFs to classify.
        output_csv: Destination of the result file. Defaults to the location
            on Google Drive that was previously hard-coded.
    """
    # Index the reference papers under their conference name.
    for conference, papers in conference_papers.items():
        for paper_path in papers:
            paper_embedding = create_embedding(extract_text_from_pdf(paper_path))
            vector_store.add_vector(conference, paper_embedding, metadata={"path": paper_path})

    rows = []
    # Sorted so the row order in the CSV is reproducible; os.listdir returns
    # entries in arbitrary order.
    for filename in sorted(os.listdir(input_folder)):
        # splitext keeps dots inside the name ("P001.v2.pdf" -> "P001.v2"),
        # and the lower-cased extension also accepts ".PDF".
        paper_id, extension = os.path.splitext(filename)
        if extension.lower() != ".pdf":
            continue
        pdf_path = os.path.join(input_folder, filename)

        # Extract text and predict publishability
        pdf_text = extract_text_from_pdf(pdf_path)
        pdf_features = vectorizer.transform([pdf_text])
        # One sample goes in, so take its single label rather than comparing
        # the whole prediction array.
        prediction = classifier.predict(pdf_features)[0]

        if prediction == 1:
            # Publishable paper: match to a conference and generate rationale
            conference, _score, rationale = match_to_conference(pdf_path)
            rows.append([paper_id, 1, conference, rationale])
        else:
            # Non-publishable paper: label as 'NA'
            rows.append([paper_id, 0, "NA", "NA"])

    # Explicit UTF-8 so generated rationales with non-ASCII characters are
    # written the same way on every platform.
    with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Paper ID", "Publishable", "Conference", "Rationale"])
        writer.writerows(rows)


# Step 5: Run the Process
# Classify every PDF in the test folder on Google Drive; the function writes
# the result CSV itself.
input_folder = "/content/drive/My Drive/Test_Papers"
classify_and_match_papers(input_folder)

print("CSV file with results has been saved.")