In [None]:
import torch
import torch.nn as nn

class BiLSTM(nn.Module):
    """Bidirectional LSTM classifier over sequences of token ids.

    Token ids are embedded, passed through a stacked bidirectional LSTM, and
    the final hidden states of the top layer (one per direction) are
    concatenated and projected to class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_size: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits (default 2).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence hidden_size * 2 input features.
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids laid out as (batch, sequence),
                matching ``batch_first=True`` on the LSTM.

        Returns:
            Tensor of logits with one row per batch element and
            ``num_classes`` columns.
        """
        embedded = self.embedding(x)
        # No explicit (h0, c0): nn.LSTM starts from zeros on the input's device.
        _, (h_n, _) = self.lstm(embedded)
        # h_n stacks (layer, direction) pairs; the last two entries are the top
        # layer's forward and backward final states. Indexing the per-step
        # output at the last position instead would pair the forward final
        # state with a backward state that has only seen the last token.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(summary)


In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
import torch.nn as nn

# Read the research-paper CSV (replace the path with your own file) and keep
# only the rows at positions 5 through 14.
df = pd.read_csv("/content/iitkgpResearchPapers_final (1).csv").iloc[5:15]
# Tokenizer shared by the chunking helper and the dataset below
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Chunking Function
def chunk_text(text, tokenizer, max_length=64):
    """
    Split ``text`` into consecutive lists of at most ``max_length`` token ids.

    The text is encoded once with ``tokenizer.encode`` (no truncation, no
    special tokens) and the id list is cut into back-to-back slices; only the
    final slice may be shorter than ``max_length``. Returns a list of lists,
    empty when the text encodes to no tokens.
    """
    token_ids = tokenizer.encode(text, truncation=False, add_special_tokens=False)
    pieces = []
    for start in range(0, len(token_ids), max_length):
        stop = start + max_length
        pieces.append(token_ids[start:stop])
    return pieces

# Custom Dataset for Chunks
class ResearchPaperChunkDataset(Dataset):
    """Dataset that exposes one research paper as a sequence of token chunks.

    Each chunk becomes one sample and carries the label of the whole paper.
    """

    def __init__(self, text, label, tokenizer, max_length=64):
        """
        Args:
            text: Full text of the research paper.
            label: Label of the research paper.
            tokenizer: Tokenizer to use.
            max_length: Maximum length of each chunk.
        """
        self.chunks = chunk_text(text, tokenizer, max_length)
        self.label = label
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of chunks the paper was split into."""
        return len(self.chunks)

    def __getitem__(self, idx):
        """Return the padded tensors and label for chunk ``idx``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (leading batch
            dimension squeezed away) and ``label`` as a ``torch.long`` tensor.
        """
        chunk = self.chunks[idx]

        # Decode with the tokenizer handed to this dataset. Reaching for a
        # module-level `tokenizer` here would silently use a different
        # tokenizer, or raise NameError when no such global exists.
        # NOTE(review): re-tokenizing with truncation at the chunk length may
        # drop trailing tokens if the tokenizer adds special tokens — confirm.
        encoded = self.tokenizer(
            self.tokenizer.decode(chunk),
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(self.label, dtype=torch.long),
        }


In [None]:
# Hyperparameters
input_size = 64  # Token length per chunk
embedding_dim = 32
hidden_size = 32
num_layers = 2
# CrossEntropyLoss requires every target to be < num_classes, so cover the
# largest label as well in case the selected rows do not contain every class
# (assumes non-negative integer labels).
num_classes = max(len(df["label"].unique()), int(df["label"].max()) + 1)
# NOTE(review): 0.1 is far above Adam's documented default of 1e-3; left
# unchanged here, but worth revisiting if training is unstable.
learning_rate = 0.1
num_epochs = 1

# Use the GPU when one is present, otherwise fall back to CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, loss function, and optimizer
model = BiLSTM(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training Loop
# NOTE(review): all chunks of a paper share one label and are trained
# back-to-back, which presumably contributes to the chunk loss collapsing to
# 0.0 in the recorded output; consider shuffling chunks across papers.
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    # `index` is the DataFrame label (it does not start at 0 after slicing), so
    # progress is reported with a separate 1-based position counter.
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        print(f"Processing research paper {position}/{len(df)}")

        # Prepare dataset and dataloader for the current research paper
        paper_dataset = ResearchPaperChunkDataset(
            text=row["text"],
            label=row["label"],
            tokenizer=tokenizer,
            max_length=input_size,
        )
        paper_dataloader = DataLoader(paper_dataset, batch_size=1, shuffle=True)

        for batch in paper_dataloader:
            # The model consumes token ids only; the attention mask is not used.
            input_ids = batch["input_ids"].to(device)
            label = batch["label"].to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, label)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            print(f"Chunk Loss: {loss.item()}")

        # Save the model weights after processing all chunks of a research paper.
        # The file name keeps the original `index + 1` numbering because the
        # inference cell loads a checkpoint by that name.
        torch.save(model.state_dict(), f"model_weights_paper_{index + 1}.pt")
        print(f"Saved model weights for paper {index + 1}")


Epoch 1/1
Processing research paper 6/10
Chunk Loss: 0.7101012468338013
Chunk Loss: 0.0018701935186982155
Chunk Loss: 5.960446742392378e-06
Chunk Loss: 2.3841855067985307e-07
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Loss: 0.0
Chunk Los

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Use the GPU when one is present, otherwise fall back to CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Architecture must match the one used when the checkpoint was saved.
model = BiLSTM(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=32,
    hidden_size=32,
    num_layers=2,
    num_classes=2,
).to(device)
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
# weights_only=True restricts unpickling to tensor data, which is all a
# state_dict contains.
model.load_state_dict(
    torch.load("model_weights_paper_15.pt", map_location=device, weights_only=True)
)  # Replace with your trained model weights
model.eval()

# Function to read and extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.

    Returns:
        The page texts joined with newlines; a page that yields no text
        contributes an empty string.
    """
    reader = PdfReader(pdf_path)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; `or ""` guards against a page returning None.
    return "\n".join((page.extract_text() or "") for page in reader.pages)

# Chunking function
def chunk_text(text, tokenizer, max_length=64):
    """Cut the token ids of ``text`` into consecutive lists of at most ``max_length`` ids.

    Encoding is done without truncation and without special tokens; the last
    list may be shorter than ``max_length``.
    """
    ids = tokenizer.encode(text, truncation=False, add_special_tokens=False)
    starts = range(0, len(ids), max_length)
    return [ids[begin: begin + max_length] for begin in starts]

# Prediction function
def predict_research_paper(pdf_path, max_length=64):
    """Classify a PDF paper by majority vote over per-chunk model predictions.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        pdf_path: Path of the PDF to classify.
        max_length: Number of tokens per chunk and the padded length fed to
            the model. Defaults to 64, the chunk length used by the training
            loop in this notebook.

    Returns:
        Tuple ``(final_prediction, predictions)``: the winning class id and
        the list of class ids predicted for each chunk.

    Raises:
        ValueError: If the PDF yields no tokens, so there is nothing to vote on.
    """
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)

    # Chunk the text
    chunks = chunk_text(text, tokenizer, max_length)
    if not chunks:
        raise ValueError(f"No text could be extracted from {pdf_path!r}; nothing to classify.")

    # Run on whichever device the model already lives on rather than assuming CUDA.
    device = next(model.parameters()).device

    predictions = []
    with torch.no_grad():
        for chunk in chunks:
            # Pad to the chunk length. Padding to a larger size would append
            # padding positions the model never saw at this length in training.
            tokenized = tokenizer(
                tokenizer.decode(chunk),
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            outputs = model(tokenized["input_ids"].to(device))
            predictions.append(torch.argmax(outputs, dim=1).item())

    # Majority vote. Iterating classes in ascending order makes ties resolve to
    # the smallest class id, matching the previous round(0.5) == 0 outcome for
    # two classes while also working for more than two.
    final_prediction = max(sorted(set(predictions)), key=predictions.count)
    return final_prediction, predictions

# Classify one sample PDF, then show the paper-level verdict and the per-chunk votes
pdf_path = "/content/R007.pdf"
final_prediction, chunk_predictions = predict_research_paper(pdf_path)

print(
    f"Final Prediction for the Paper: {final_prediction} (1: Publishable, 0: Non-Publishable)",
    f"Chunk Predictions: {chunk_predictions}",
    sep="\n",
)

  model.load_state_dict(torch.load("model_weights_paper_15.pt"))  # Replace with your trained model weights
Token indices sequence length is longer than the specified maximum sequence length for this model (7255 > 512). Running this sequence through the model will result in indexing errors


Final Prediction for the Paper: 0 (1: Publishable, 0: Non-Publishable)
Chunk Predictions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# LinearSVM Classifier

Run the notebook from this point onward.

In [None]:
pip install transformers torch PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import torch
from transformers import AutoTokenizer
from PyPDF2 import PdfReader

In [None]:
import pandas as pd
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Fetch the NLTK resources needed by word_tokenize and the stop-word list
for _resource in ("punkt_tab", "stopwords"):
    nltk.download(_resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Download a file from Google Drive by its file id using the gdown CLI.
# NOTE(review): presumably this is the papers CSV read by the next cell — confirm.
!gdown 1V_TnVPXCNhPwAJG3GBIEZG4fiQxH1Mpx

In [None]:
# Load the labelled research-paper table; the cells below read its "text" and
# "label" columns.
df = pd.read_csv("/content/iitkgpResearchPapers_final (1).csv")

def preprocess_text(text):
    """Normalise raw text into a space-joined string of stemmed content words.

    Steps: tokenise with ``word_tokenize``, lower-case, drop tokens made up
    solely of punctuation, drop English stop words, then Porter-stem.

    Args:
        text: Raw document text. Non-string values (for example ``None`` or
            the NaN pandas uses for a missing cell) are treated as empty.

    Returns:
        The surviving stems joined by single spaces ("" when nothing survives).
    """
    if not isinstance(text, str):
        return ""

    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    kept = []
    for token in word_tokenize(text):
        token = token.lower()
        # A token counts as punctuation when every character is a punctuation
        # mark. Testing `token in string.punctuation` is a substring check, so
        # multi-character tokens such as "...", "``" and "''" slipped through.
        if all(ch in punctuation for ch in token):
            continue
        if token in stop_words:
            continue
        kept.append(stemmer.stem(token))

    return " ".join(kept)


df["processed_text"] = df["text"].apply(preprocess_text)
y = df["label"]

# Split the documents BEFORE fitting TF-IDF so that vocabulary and IDF weights
# are learned from the training fold only; fitting on all rows first leaks
# test-set statistics into the features. The same random_state and stratify
# target select the same rows for each fold as before.
text_train, text_test, y_train, y_test = train_test_split(
    df["processed_text"], y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)
# Whole corpus in the train-fitted feature space, kept under the original name.
X = tfidf.transform(df["processed_text"])

# LinearSVM Model
model = LinearSVC(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# zero_division=0 reports 0 for a class that is never predicted without
# emitting an undefined-metric warning per metric.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average="binary", zero_division=0
)
# labels=[0, 1] pins the row order to the two target names even if one class
# is missing from the test fold (0: Non-Publishable, 1: Publishable).
classification_report_table = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=["Non-Publishable", "Publishable"],
    zero_division=0,
)

print("Classification Report:")
print(classification_report_table)

# Tabular Summary of Results
results = pd.DataFrame({
    "Metric": ["Precision", "Recall", "F1-Score"],
    "Value": [precision, recall, f1]
})
print("\nEvaluation Summary:")
print(results)

Classification Report:
                 precision    recall  f1-score   support

Non-Publishable       0.00      0.00      0.00         1
    Publishable       0.50      1.00      0.67         1

       accuracy                           0.50         2
      macro avg       0.25      0.50      0.33         2
   weighted avg       0.25      0.50      0.33         2


Evaluation Summary:
      Metric     Value
0  Precision  0.500000
1     Recall  1.000000
2   F1-Score  0.666667


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Preview only the first rows instead of rendering the entire frame.
df.head(10)

Unnamed: 0,text,label,processed_text
5,Advancements in 3D Food Modeling: A Review of ...,1,advanc 3d food model review metafood challeng ...
6,Addressing Min-Max Challenges in Nonconvex-Non...,1,address min-max challeng nonconvex-nonconcav p...
7,Examining the Convergence of Denoising Diffusi...,1,examin converg denois diffus probabilist model...
8,Detecting Medication Usage in Parkinson’s Dise...,1,detect medic usag parkinson ’ diseas multi-mod...
9,Addressing Popularity Bias with Popularity-Con...,1,address popular bia popularity-consci align co...
10,Analyzing Real-Time Group Coordination in Augm...,0,analyz real-tim group coordin augment danc per...
11,Transdimensional Properties of Graphite in Rel...,0,transdimension properti graphit relat chees co...
12,AI-Driven Personalization in Online Education ...,0,ai-driven person onlin educ platform har power...
13,Synergistic Convergence of Photosynthetic Path...,0,synergist converg photosynthet pathway subterr...


In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.

    Returns:
        The page texts joined with newlines; a page that yields no text
        contributes an empty string.
    """
    reader = PdfReader(pdf_path)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; `or ""` guards against a page returning None.
    return "\n".join((page.extract_text() or "") for page in reader.pages)

In [None]:
def predict_new_research_paper(pdf_path, tfidf, model):
    """Classify one PDF with an already-fitted vectorizer and classifier.

    The PDF text is extracted, run through ``preprocess_text``, vectorised
    with ``tfidf.transform`` and passed to ``model.predict``.

    Args:
        pdf_path: Path of the PDF to classify.
        tfidf: Fitted vectorizer exposing ``transform``.
        model: Fitted classifier exposing ``predict``.

    Returns:
        The first (and only) element of the classifier's prediction.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    features = tfidf.transform([preprocess_text(raw_text)])
    predicted_labels = model.predict(features)
    return predicted_labels[0]

In [None]:
#pdf_path="/content/R006.pdf"
#
#prediction=predict_new_research_paper(pdf_path, tfidf, model)
#
#if prediction==1:
#    print("Prediction: Publishable")
#else:
#    print("Prediction: Non-Publishable")

Prediction: Publishable


In [None]:
# Download the zipped papers archive from Google Drive by file id; the recorded
# output shows it saved as /content/Papers-20250112T183415Z-001.zip.
!gdown 1xl_-og0wjkEghOdhmW2oOwX6IyDFbqdp

Downloading...
From: https://drive.google.com/uc?id=1xl_-og0wjkEghOdhmW2oOwX6IyDFbqdp
To: /content/Papers-20250112T183415Z-001.zip
100% 12.5M/12.5M [00:00<00:00, 82.5MB/s]


In [None]:
import os
from zipfile import ZipFile

In [None]:
zip_path = "/content/Papers-20250112T183415Z-001.zip"
extracted_folder = "/content/extracted_papers"

os.makedirs(extracted_folder, exist_ok=True)

with ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extracted_folder)

# Collect every PDF under the extraction folder. The extension is matched
# case-insensitively so files named "*.PDF" are not skipped, and the list is
# sorted so the prediction order does not depend on filesystem walk order.
pdf_files = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(extracted_folder)
    for file in files
    if file.lower().endswith(".pdf")
)

print(f"Found {len(pdf_files)} PDF files.")

In [None]:
# One record per PDF: its file name and the human-readable predicted class
results = [
    {
        "PDF File": os.path.basename(pdf_file),
        "Prediction": (
            "Publishable"
            if predict_new_research_paper(pdf_file, tfidf, model) == 1
            else "Non-Publishable"
        ),
    }
    for pdf_file in pdf_files
]
results_df = pd.DataFrame(results)

print("\nPredictions:")
print(results_df)

# Tally how many papers fell into each predicted class
value_counts = results_df["Prediction"].value_counts()
print("\nValue Counts:")
print(value_counts)

In [None]:
# Persist the per-PDF predictions. With pandas' default index=True the row
# index is written as an extra, unnamed first column.
results_df.to_csv("/content/predictions.csv")