# Introduction
This project processes a collection of PDF papers, extracts relevant sections, and applies machine learning models to predict suitable conferences for the papers. The process includes data extraction, sentiment analysis, grammar checking, and classification.




### 1. Data Collection and Setup
Here, we import the necessary modules and mount Google Drive to access the dataset stored in the specified folder. We list all files to ensure we have access to the paper files.


In [None]:
# Import the Google Colab drive module
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset folders used by the later
# cells are all addressed through paths under /content/drive/MyDrive.
drive.mount('/content/drive')

# Once executed, this will prompt you to authorize access to your Google Drive.

Mounted at /content/drive


In [None]:
# Import required modules
import os
import pandas as pd

# Define the main folder path
folder_path = '/content/drive/MyDrive/Reference 2'

# List all files in the main folder (raises FileNotFoundError if Drive is not
# mounted or the folder does not exist)
files = os.listdir(folder_path)
print("Files in folder:", files)

# One dict per PDF found: Paper_ID, Publishable label, Conference (or None)
data = []

# Loop through the folders ('Publishable' and 'Non-Publishable')
for category in ['Publishable', 'Non-Publishable']:
    category_path = os.path.join(folder_path, category)

    # Report a missing category folder instead of silently producing no rows
    if not os.path.exists(category_path):
        print(f"Category folder not found, skipping: {category_path}")
        continue

    if category == 'Publishable':
        # Each sub-directory of 'Publishable' is named after a conference
        for conference in os.listdir(category_path):
            conference_path = os.path.join(category_path, conference)
            if not os.path.isdir(conference_path):
                continue
            data.extend(
                {
                    'Paper_ID': file_name,
                    'Publishable': 'Publishable',
                    'Conference': conference,
                }
                for file_name in os.listdir(conference_path)
                if file_name.endswith('.pdf')  # Filter for PDF files
            )
    else:
        # Non-publishable PDFs sit directly in the category folder
        data.extend(
            {
                'Paper_ID': file_name,
                'Publishable': 'Non-Publishable',
                'Conference': None,  # No conference for non-publishable papers
            }
            for file_name in os.listdir(category_path)
            if file_name.endswith('.pdf')  # Filter for PDF files
        )

# Create a DataFrame from the collected data; naming the columns explicitly
# keeps them present even when no PDF was found
df = pd.DataFrame(data, columns=['Paper_ID', 'Publishable', 'Conference'])

# Display the first 15 rows of the DataFrame
print(df.head(15))

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Reference 2'

### 2. Text Extraction and Section Parsing
This step involves reading the text content of the papers and extracting key sections: Abstract, Introduction, and Conclusion. The text is cleaned by removing punctuation and converting it to lowercase for uniformity.



### 3. Data Processing and DataFrame Creation
This step processes the PDF papers, extracts the Abstract, Introduction, and Conclusion sections, and classifies them into categories like 'Publishable' or 'Non-Publishable'. The data is stored in a pandas DataFrame for further processing.


In [None]:
import os
import pdfplumber  # For extracting text from PDFs
import string  # For text cleaning
import pandas as pd  # For data manipulation
from textblob import TextBlob  # For advanced text analysis (optional)

# Clean text function
def clean_text(text):
    """
    Normalize text for vectorization.

    The input is lowercased, every character listed in string.punctuation
    is deleted, and leading/trailing whitespace is stripped.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    lowered = text.lower()
    return lowered.translate(punctuation_remover).strip()

# Extract sections function
def extract_sections(text):
    """
    Split a paper's text into Abstract, Introduction, and Conclusion.

    Each section starts at the first case-insensitive occurrence of its
    keyword. The abstract ends at the next "introduction", the introduction
    ends at the next "conclusion", and the conclusion runs to the end of
    the text. A section whose keyword is absent is returned as "".
    Returns the three sections as a tuple, each passed through clean_text.
    """
    lowered = text.lower()

    def section(start_keyword, end_keyword=None):
        # Offsets are looked up in the lowercased copy and applied to the
        # original text, exactly as the keyword search requires.
        start = lowered.find(start_keyword)
        if start == -1:
            return ""
        end = -1 if end_keyword is None else lowered.find(end_keyword, start)
        if end == -1:
            return text[start:]
        return text[start:end]

    abstract = section("abstract", "introduction")
    intro = section("introduction", "conclusion")
    conclusion = section("conclusion")

    return clean_text(abstract), clean_text(intro), clean_text(conclusion)

# Define the main folder path
folder_path = '/content/drive/MyDrive/Reference 2'


def _read_pdf_text(pdf_path):
    """Return the text of all pages of the PDF at pdf_path, one newline after each page."""
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Pages for which extract_text() returns nothing are skipped
        return "".join(page_text + "\n" for page_text in page_texts if page_text)


def _collect_papers(directory, publishable_label, conference):
    """Return one record (ID, sections, labels) per readable PDF directly inside directory."""
    records = []
    for file_name in os.listdir(directory):
        if not file_name.endswith('.pdf'):  # Process only PDFs
            continue
        try:
            text = _read_pdf_text(os.path.join(directory, file_name))
            abstract, intro, conclusion = extract_sections(text)
        except Exception as e:
            # Best effort: report the unreadable file and keep going
            print(f"Error processing {file_name}: {e}")
            continue
        records.append({
            # Same key for both categories, so all IDs land in a single column
            'Paper_ID': file_name,
            'abstract': abstract,
            'introduction': intro,
            'conclusion': conclusion,
            'Publishable': publishable_label,
            'Conference': conference,
        })
    return records


# Initialize data list
data = []

# Loop through the folders (Publishable and Non-Publishable)
for category in ['Publishable', 'Non-Publishable']:
    category_path = os.path.join(folder_path, category)

    # Skip categories whose folder does not exist
    if not os.path.exists(category_path):
        continue

    if category == 'Publishable':
        # Each sub-directory is named after the conference of its papers
        for conference in os.listdir(category_path):
            conference_path = os.path.join(category_path, conference)
            if os.path.isdir(conference_path):
                data.extend(_collect_papers(conference_path, 'Publishable', conference))
    else:
        # NOTE(review): the inventory cell labels these rows 'Non-Publishable';
        # the label used here is kept unchanged - confirm which one is intended.
        data.extend(_collect_papers(category_path, 'Unpublishable', None))

# Create a DataFrame from the collected data. The column list uses the exact
# keys stored above ('conclusion' is lowercase) and keeps the columns present
# even when no PDF could be processed.
df = pd.DataFrame(
    data,
    columns=['Paper_ID', 'abstract', 'introduction', 'conclusion', 'Publishable', 'Conference'],
)
print(f"DataFrame created with {len(df)} entries.")
print(df.head(15))

DataFrame created with 15 entries.
   paper_name                                           abstract  \
0    R015.pdf  abstract\ndeepgenerativemodelsparticularlydiff...   
1    R014.pdf  abstract\nthisresearchexaminesaspecificcategor...   
2    R009.pdf  abstract\nthisstudydemonstratesthatincorporati...   
3    R008.pdf  abstract\nthisstudyexaminestheeffectivenessoft...   
4    R011.pdf  abstract\ncollaborativefilteringcfoftenencount...   
5    R010.pdf  abstract\nparkinson’sdiseasepdisaprogressivene...   
6    R006.pdf  abstract\nthisresearchintroducesmlbyoutubeanew...   
7    R007.pdf  abstract\nthegrowingfocusonleveragingcomputerv...   
8    R012.pdf  abstract\nthispaperpresentsanapproachfordesign...   
9    R013.pdf  abstract\nregressiontaskswhileaimingtomodelrel...   
10   R005.pdf  abstract\ntheconvergenceofaugmentedrealityaran...   
11   R001.pdf  abstract\ngraphite research has led to discove...   
12   R002.pdf  abstract\ntheperpetualoscillationsofquantumflu...   
13   R003.pdf

### 4. Feature Extraction and Model Training
This step vectorizes the text data using TF-IDF and trains a Logistic Regression model on the features to predict the suitable conference for each paper. The model is evaluated using classification metrics such as accuracy and precision.


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Example conference mapping
conference_mapping = {
    'TMLR': 1,
    'EMNLP': 2,
    'KDD': 3,
    'CVPR': 4,
    'NeurIPS': 5,
    None: 0  # Map None or unpublishable papers to 0
}

# Map the conferences to numerical values. The column is named 'Conference'
# (capital C); anything that does not map to a known conference becomes 0.
df['conference_mapped'] = df['Conference'].map(conference_mapping).fillna(0).astype(int)

# Combine text features into a single column
df['combined_text'] = df['abstract'] + " " + df['introduction'] + " " + df['conclusion']

# Filter rows to include only papers with valid conference mappings
df_filtered = df[df['conference_mapped'] > 0]

# Define features (X) and labels (y)
X = df_filtered['combined_text']  # Combined text features
y = df_filtered['conference_mapped']  # Target labels (conference IDs)

# Split the data into training and testing sets (50% test data)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y
)

# Vectorize the text data using TF-IDF (Limit features to 5000 for efficiency).
# The vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression model
classifier = LogisticRegression(random_state=42, max_iter=1000)
classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)

# Print evaluation metrics
print("Classification Report:")

# Report every conference ID > 0 in ascending order. Passing `labels` keeps
# the names aligned with the IDs even if a class is absent from the test
# split, and zero_division=0 reports 0.0 for classes that were never predicted.
report_labels = sorted(value for value in conference_mapping.values() if value > 0)
id_to_name = {value: key for key, value in conference_mapping.items()}
target_names = [id_to_name[label] for label in report_labels]
print(classification_report(
    y_test, y_pred, labels=report_labels, target_names=target_names, zero_division=0
))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Classification Report:
              precision    recall  f1-score   support

        TMLR       0.00      0.00      0.00         1
       EMNLP       0.50      1.00      0.67         1
         KDD       0.00      0.00      0.00         1
        CVPR       0.50      1.00      0.67         1
     NeurIPS       1.00      1.00      1.00         1

    accuracy                           0.60         5
   macro avg       0.40      0.60      0.47         5
weighted avg       0.40      0.60      0.47         5

Accuracy: 0.60


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 5. Model Saving
After training the model, we save it using `joblib` for later use. This allows us to make predictions on new, unseen data without retraining the model.


In [None]:
import joblib

# Define the paths to save the model and its TF-IDF vectorizer.
# The prediction cell loads '/content/tfidf_vectorizer.joblib', so the fitted
# vectorizer has to be persisted alongside the classifier.
# NOTE(review): assumes the working directory is /content, as the load paths imply.
model_path = "IITKGP(3).joblib"
vectorizer_path = "tfidf_vectorizer.joblib"

# Save the trained Logistic Regression model and the fitted vectorizer
joblib.dump(classifier, model_path)
joblib.dump(vectorizer, vectorizer_path)

# Confirmation messages
print(f"Model saved to {model_path}")
print(f"Vectorizer saved to {vectorizer_path}")


Model saved to IITKGP(3).joblib


In [None]:
import os

# Folder holding the unlabeled PDFs that the later cells run predictions on
folder_path = '/content/drive/MyDrive/Papers'

# List all entries in the folder (order is whatever os.listdir returns)
files = os.listdir(folder_path)

# Print the listing as a quick check that the folder is reachable
print("Files in folder:", files)

Files in folder: ['P062.pdf', 'P126.pdf', 'P068.pdf', 'P008.pdf', 'P064.pdf', 'P121.pdf', 'P018.pdf', 'P024.pdf', 'P116.pdf', 'P090.pdf', 'P041.pdf', 'P016.pdf', 'P038.pdf', 'P081.pdf', 'P132.pdf', 'P130.pdf', 'P078.pdf', 'P022.pdf', 'P026.pdf', 'P033.pdf', 'P125.pdf', 'P133.pdf', 'P028.pdf', 'P118.pdf', 'P048.pdf', 'P039.pdf', 'P069.pdf', 'P077.pdf', 'P070.pdf', 'P134.pdf', 'P047.pdf', 'P027.pdf', 'P129.pdf', 'P105.pdf', 'P043.pdf', 'P036.pdf', 'P032.pdf', 'P073.pdf', 'P097.pdf', 'P086.pdf', 'P094.pdf', 'P128.pdf', 'P104.pdf', 'P031.pdf', 'P102.pdf', 'P002.pdf', 'P080.pdf', 'P100.pdf', 'P053.pdf', 'P023.pdf', 'P056.pdf', 'P066.pdf', 'P096.pdf', 'P119.pdf', 'P075.pdf', 'P050.pdf', 'P074.pdf', 'P007.pdf', 'P045.pdf', 'P013.pdf', 'P060.pdf', 'P110.pdf', 'P087.pdf', 'P005.pdf', 'P067.pdf', 'P017.pdf', 'P029.pdf', 'P051.pdf', 'P071.pdf', 'P108.pdf', 'P123.pdf', 'P001.pdf', 'P003.pdf', 'P107.pdf', 'P076.pdf', 'P006.pdf', 'P098.pdf', 'P106.pdf', 'P020.pdf', 'P035.pdf', 'P124.pdf', 'P037.pdf'

### 6. Prediction and Output Generation
This section involves generating predictions for the classification of research papers into relevant conferences. The text data (Abstract, Introduction, and Conclusion) from the papers is processed, and the trained model is used to predict the most suitable conference for each paper.

Key steps include:
- **Text Preprocessing**: Cleaning the text by converting it to lowercase, removing punctuation, and extracting relevant sections.
- **Sentiment Analysis**: Analyzing the sentiment of the text to gather insights about its tone and relevance.
- **Grammar Checking**: Using LanguageTool to check for grammatical errors in the paper text.
- **Prediction**: Using the trained model to predict the conference classification for each paper.

The output will include the predicted conference for each research paper along with the associated sections and analysis results.


In [None]:
import os
import pdfplumber
import string
import pandas as pd
import textstat
from textblob import TextBlob
import language_tool_python

# Initialize LanguageTool for grammar checking
tool = language_tool_python.LanguageTool('en-US')

# Function to clean text by converting to lowercase and removing punctuation
def clean_text(text):
    """Lowercase text, delete all string.punctuation characters, and strip outer whitespace."""
    # A dict mapping each punctuation character to None deletes it on translate
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(deletions).strip()

# Function to extract specific sections (Abstract, Introduction, Conclusion) from the text
def extract_sections(text):
    """
    Return (abstract, introduction, conclusion), each cleaned with clean_text.

    A section begins at the first case-insensitive match of its keyword and
    ends at the next occurrence of the following section's keyword, or at
    the end of the text when there is none. Missing sections come back as "".
    """
    haystack = text.lower()
    text_end = len(text)

    # (section keyword, keyword that terminates it); the conclusion has no terminator
    boundaries = (
        ("abstract", "introduction"),
        ("introduction", "conclusion"),
        ("conclusion", None),
    )

    sections = []
    for opener, closer in boundaries:
        raw = ""
        begin = haystack.find(opener)
        if begin != -1:
            finish = haystack.find(closer, begin) if closer is not None else -1
            raw = text[begin:finish if finish != -1 else text_end]
        sections.append(clean_text(raw))

    return tuple(sections)

# Function to perform sentiment analysis on the text using TextBlob
def sentiment_analysis(text):
    """Return TextBlob's polarity score for text (-1 is negative, 1 is positive)."""
    return TextBlob(text).sentiment.polarity

# Function to check writing quality (grammar issues and passive voice count)
def check_writing_quality(text):
    """
    Run LanguageTool over text and summarize the findings.

    Returns a dict with 'grammar_errors' (number of matches reported) and
    'passive_voice_count' (matches whose message contains 'Passive voice').
    """
    issues = tool.check(text)
    passive_issues = [issue for issue in issues if 'Passive voice' in issue.message]
    return {
        'grammar_errors': len(issues),
        'passive_voice_count': len(passive_issues),
    }

# Define the folder containing the PDF files
folder_path = '/content/drive/MyDrive/Papers'

# Initialize an empty list to store processed data
data = []

# Process all PDF files in the folder
for file_name in os.listdir(folder_path):
    # Skip anything that is not a PDF
    if not file_name.endswith('.pdf'):
        continue

    file_path = os.path.join(folder_path, file_name)
    try:
        with pdfplumber.open(file_path) as pdf:
            # Concatenate the text of all pages; pages without text are skipped
            text = "".join(
                page_text + "\n"
                for page_text in (page.extract_text() for page in pdf.pages)
                if page_text
            )

        # Extract Abstract, Introduction, and Conclusion sections
        abstract, intro, conclusion = extract_sections(text)

        # Analyze sentiment for the entire document (optional)
        sentiment_score = sentiment_analysis(text)

        # Check grammar and writing quality (optional)
        quality_metrics = check_writing_quality(text)

        # Append data to the list. The key is lowercase 'conclusion' because
        # the prediction cell reads df['conclusion'] when it builds combined_text.
        data.append({
            'Paper_ID': file_name,
            'abstract': abstract,
            'introduction': intro,
            'conclusion': conclusion,
            'sentiment_score': sentiment_score,
            'grammar_errors': quality_metrics['grammar_errors'],
            'passive_voice_count': quality_metrics['passive_voice_count']
        })
    except Exception as e:
        # Best effort: report the failing file and continue with the rest
        print(f"Error processing {file_name}: {e}")

# Create a DataFrame with the processed data
df = pd.DataFrame(data)

# Display a summary of the created DataFrame
print(f"DataFrame created with {len(df)} entries.")
print(df.head(15))


Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:04<00:00, 50.7MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmplqwrh_ow.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /root/.cache/language_tool_python.


DataFrame created with 135 entries.
   paper_name                                           abstract  \
0    P062.pdf  abstract\nthispaperexplorestheadaptationoflarg...   
1    P126.pdf  abstract\ntheobjectiveofthisresearchistodevelo...   
2    P068.pdf  abstract\nto address the challenges of tempora...   
3    P008.pdf  abstract\nthisresearchinvestigatesthemechanism...   
4    P064.pdf  abstract\nthegoalofthispaperistoempoweropensou...   
5    P121.pdf  abstract\ntheobjectiveofthisresearchistoaddres...   
6    P018.pdf  abstract\ntheobjectiveofthisresearchistoaddres...   
7    P024.pdf  abstract\nthis paper investigates the feasibil...   
8    P116.pdf  abstract\ntoenhancetheaccuracyandscalabilityof...   
9    P090.pdf  abstract\nthispaperexplorestheadaptationoflarg...   
10   P041.pdf  abstract\nmetaversearchaeologyrepresentsaparad...   
11   P016.pdf  abstract\nbayesian theology for extraterrestri...   
12   P038.pdf  abstract\ngraph neural networks gnns for predi...   
13   P081.pd

In [None]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Restore the trained Logistic Regression classifier and the TF-IDF vectorizer from disk
classifier = joblib.load('/content/IITKGP(3).joblib')
vectorizer = joblib.load('/content/tfidf_vectorizer.joblib')

# df must provide the 'abstract', 'introduction', and 'conclusion' columns.
# Join the three sections with single spaces into one text per paper.
abstract_and_intro = df['abstract'] + " " + df['introduction']
df['combined_text'] = abstract_and_intro + " " + df['conclusion']

# Turn the combined text into TF-IDF features with the loaded vectorizer
X_tfidf = vectorizer.transform(df['combined_text'])

# Predict one label per paper with the loaded classifier
y_pred = classifier.predict(X_tfidf)

### 7. Mapping Predicted Labels to Conferences
Here, we map the predicted numeric labels (1–5) back to their conference names using the inverse of the mapping used during training. Papers predicted as non-publishable are assigned `"NA"` as the conference later, in the post-processing step.


In [None]:
# Inverse of the training-time mapping: numeric label -> conference name.
# The order of this list must match the IDs 1..5 assigned during training.
conference_mapping_inv = dict(
    enumerate(['TMLR', 'EMNLP', 'KDD', 'CVPR', 'NeurIPS'], start=1)
)

# Translate each predicted label to its conference name ('Unknown' if unmapped)
df['Conference'] = list(
    map(lambda label: conference_mapping_inv.get(label, 'Unknown'), y_pred)
)

# Keep only the paper identifier and its predicted conference
df = df[['Paper_ID', 'Conference']]
# Show the first 15 predictions
print(df.head(15))

In [None]:
# output_path = '/content/conference.csv'
# df.to_csv(output_path, index=False)

In [None]:
# Import necessary libraries
import joblib  # For loading the saved model
import pandas as pd  # For handling data in DataFrame format
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # For model evaluation

# Define the path to the saved model file
# NOTE(review): this absolute path points at the filesystem root, and the recorded
# run of this cell failed with FileNotFoundError for it - set it to wherever the
# model file actually lives before running the cells that use `loaded_model`.
model_path = "/IITKGP.joblib"  # Make sure to update this path if the model is stored elsewhere

# Load the saved model using joblib; a later cell calls loaded_model.predict on
# the numeric feature columns.
# NOTE(review): the original comment called this an AdaBoost model; nothing in this
# notebook shows how it was trained - confirm.
loaded_model = joblib.load(model_path)

# Print a confirmation message indicating that the model was loaded successfully
print("Model loaded successfully.")

FileNotFoundError: [Errno 2] No such file or directory: '/IITKGP.joblib'

In [None]:
# Import necessary libraries
import os
import pdfplumber  # For extracting text from PDFs
import string  # For string manipulation (removing punctuation)
import pandas as pd  # For working with DataFrames
import textstat  # For readability score calculations
from textblob import TextBlob  # For sentiment analysis
import language_tool_python  # For grammar and passive voice analysis

# Initialize LanguageTool for grammar checking
tool = language_tool_python.LanguageTool('en-US')

# Clean text by making it lowercase and removing punctuation
def clean_text(text):
    """Return text lowercased, without string.punctuation characters, and stripped of outer whitespace."""
    # Translation table keyed by code point; a None value deletes the character
    drop_punctuation = {ord(symbol): None for symbol in string.punctuation}
    normalized = text.lower().translate(drop_punctuation)
    return normalized.strip()

# Extract sections (abstract, introduction, conclusion) from the text
def extract_sections(text):
    """
    Extract the abstract, introduction, and conclusion from a paper's text.

    Keywords are searched case-insensitively. The abstract spans from the
    first "abstract" to the next "introduction"; the introduction spans from
    the first "introduction" to the next "conclusion"; the conclusion spans
    from the first "conclusion" to the end. A section is "" when its keyword
    is missing, and runs to the end of the text when its end keyword is missing.
    Returns the three sections, each cleaned with clean_text.
    """
    searchable = text.lower()  # Lowercased copy used only for locating keywords

    def cut(first_keyword, next_keyword):
        # str.index raises ValueError where str.find would return -1
        try:
            left = searchable.index(first_keyword)
        except ValueError:
            return ""
        if next_keyword is None:
            return text[left:]
        try:
            right = searchable.index(next_keyword, left)
        except ValueError:
            return text[left:]
        return text[left:right]

    return (
        clean_text(cut("abstract", "introduction")),
        clean_text(cut("introduction", "conclusion")),
        clean_text(cut("conclusion", None)),
    )

# Perform sentiment analysis using TextBlob (range from -1 to 1)
def sentiment_analysis(text):
    """Compute the TextBlob sentiment of text and return its polarity component."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.polarity

# Function to check grammar errors and passive voice in the text
def check_writing_quality(text):
    """
    Count LanguageTool findings for text.

    Returns a dict: 'grammar_errors' is the total number of matches and
    'passive_voice_count' is how many of them mention 'Passive voice' in
    their message.
    """
    findings = tool.check(text)
    # Each True counts as 1 when summed
    passive_total = sum('Passive voice' in finding.message for finding in findings)
    return dict(grammar_errors=len(findings), passive_voice_count=passive_total)

# Define folder path where PDF papers are stored
folder_path = '/content/drive/MyDrive/Papers'

# Initialize an empty list to store data
data = []

# Build one feature row per PDF in the folder
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.pdf'):  # Process only PDF files
        continue

    file_path = os.path.join(folder_path, file_name)  # Get the full file path
    try:
        with pdfplumber.open(file_path) as pdf:  # Open the PDF using pdfplumber
            # Concatenate the text of all pages; pages without text are skipped
            text = "".join(
                page_text + "\n"
                for page_text in (page.extract_text() for page in pdf.pages)
                if page_text
            )

        # Extract the sections: abstract, introduction, and conclusion
        abstract, intro, conclusion = extract_sections(text)

        # NOTE(review): `conclusion` has already been through clean_text, which
        # removes all punctuation; that likely explains the extreme readability
        # values in the recorded output. Left unchanged because loaded_model was
        # presumably trained on features computed the same way - confirm.
        readability_score = textstat.flesch_kincaid_grade(conclusion)  # Flesch-Kincaid grade level
        flesch_score = textstat.flesch_reading_ease(conclusion)  # Flesch Reading Ease score

        # Perform sentiment analysis on the conclusion text
        sentiment_score = sentiment_analysis(conclusion)

        # Analyze writing quality (grammar and passive voice)
        writing_quality_features = check_writing_quality(conclusion)

        # Append the extracted features and analysis results into the data list
        data.append({
            'paper_name': file_name,  # Name of the paper
            'abstract': abstract,  # Extracted abstract
            'introduction': intro,  # Extracted introduction
            'conclusion': conclusion,  # Extracted conclusion
            'conclusion_length': len(conclusion.split()),  # Word count of the conclusion
            'readability_score': readability_score,  # Flesch-Kincaid readability score
            'flesch_score': flesch_score,  # Flesch Reading Ease score
            'sentiment_score': sentiment_score,  # Sentiment score of the conclusion
            'grammar_errors': writing_quality_features['grammar_errors'],  # Grammar errors count
            'passive_voice_count': writing_quality_features['passive_voice_count'],  # Passive voice count
            'label': 0  # Placeholder label (0 = Non-Publishable), adjust if needed
        })
    except Exception as e:
        print(f"Error processing {file_name}: {e}")  # Report the failing file and continue

# Convert the list of dictionaries into a pandas DataFrame. The records already
# use the final column names ('paper_name', 'label', ...), so no rename is needed.
df = pd.DataFrame(data)

# Display the resulting DataFrame
print(f"DataFrame created with {len(df)} entries.")  # Print the number of entries
print(df.head(15))  # Display the first 15 rows of the DataFrame

DataFrame created with 135 entries.
   paper_name                                           abstract  \
0    P062.pdf  abstract\nthispaperexplorestheadaptationoflarg...   
1    P126.pdf  abstract\ntheobjectiveofthisresearchistodevelo...   
2    P068.pdf  abstract\nto address the challenges of tempora...   
3    P008.pdf  abstract\nthisresearchinvestigatesthemechanism...   
4    P064.pdf  abstract\nthegoalofthispaperistoempoweropensou...   
5    P121.pdf  abstract\ntheobjectiveofthisresearchistoaddres...   
6    P018.pdf  abstract\ntheobjectiveofthisresearchistoaddres...   
7    P024.pdf  abstract\nthis paper investigates the feasibil...   
8    P116.pdf  abstract\ntoenhancetheaccuracyandscalabilityof...   
9    P090.pdf  abstract\nthispaperexplorestheadaptationoflarg...   
10   P041.pdf  abstract\nmetaversearchaeologyrepresentsaparad...   
11   P016.pdf  abstract\nbayesian theology for extraterrestri...   
12   P038.pdf  abstract\ngraph neural networks gnns for predi...   
13   P081.pd

In [None]:
# Select only the numeric feature columns
selected_columns = ['conclusion_length', 'readability_score', 'flesch_score', 'sentiment_score', 'grammar_errors']

# Take an independent copy so that adding columns to new_df later does not
# write into a slice of df (which triggers pandas' SettingWithCopyWarning)
new_df = df[selected_columns].copy()

# Display the first rows of the feature table
print(new_df.head())

   conclusion_length  readability_score  flesch_score  sentiment_score  \
0                262              140.9       -448.26         0.123773   
1                428              221.0       -726.73         0.086320   
2                 89              124.1       -636.45         0.096429   
3                  0              -15.7        206.84         0.000000   
4                124              130.7       -621.22         0.282857   

   grammar_errors  
0              56  
1             150  
2              44  
3               0  
4              74  


### 8. Model Prediction
In this step, we load the pre-trained model and make predictions on the prepared features (readability scores, sentiment score, etc.) of the papers. The predicted labels are added to the DataFrame.


In [None]:
# Predict from the feature columns only, so that re-running this cell after
# 'Publishable' has been added does not feed the prediction back into the model
predictions = loaded_model.predict(new_df[selected_columns])

# Attach the predictions; assign() returns a new DataFrame, so nothing is
# written into a slice of df (avoids pandas' SettingWithCopyWarning)
new_df = new_df.assign(Publishable=predictions)

print(new_df.head())

   conclusion_length  readability_score  flesch_score  sentiment_score  \
0                262              140.9       -448.26         0.123773   
1                428              221.0       -726.73         0.086320   
2                 89              124.1       -636.45         0.096429   
3                  0              -15.7        206.84         0.000000   
4                124              130.7       -621.22         0.282857   

   grammar_errors  predicted_label  
0              56                0  
1             150                0  
2              44                0  
3               0                1  
4              74                0  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['predicted_label'] = predictions


### 9. Combining DataFrames
In this step, we concatenate the paper identifiers (`Paper_ID`) from the original DataFrame with the predicted labels (`Publishable`) and the predicted conference names (`Conference`) into a final DataFrame.


In [None]:
# Concatenate the required columns (Paper_ID, Publishable, Conference)
# from the original dataframe (df) and the new dataframe (new_df) along the columns axis (axis=1).
# The identifier column is spelled 'Paper_ID' here so that the later sort on
# 'Paper_ID' finds it in final_df.
final_df = pd.concat([df['Paper_ID'], new_df['Publishable'], df['Conference']], axis=1)

# Display the first few rows of the final concatenated DataFrame
final_df.head()

Unnamed: 0,paper_name,predicted_label,predicted_conference
0,P062.pdf,0,CVPR
1,P126.pdf,0,CVPR
2,P068.pdf,0,CVPR
3,P008.pdf,1,EMNLP
4,P064.pdf,0,CVPR


### 10. Post-Processing
We set the `Conference` column to `"NA"` for papers predicted as non-publishable (`Publishable == 0`). The DataFrame is then sorted by `Paper_ID`.


In [None]:
# Papers whose 'Publishable' value is 0 have no target conference:
# overwrite their 'Conference' entry with the literal string "NA".
final_df.loc[final_df['Publishable'].eq(0), 'Conference'] = "NA"

# Show the DataFrame after the 'Conference' values were modified (still in the original row order)
print(final_df)

# Order the rows by 'Paper_ID', ascending; the original index labels are kept.
final_df = final_df.sort_values('Paper_ID')

    paper_name  predicted_label predicted_conference
0     P062.pdf                0                   NA
1     P126.pdf                0                   NA
2     P068.pdf                0                   NA
3     P008.pdf                1                EMNLP
4     P064.pdf                0                   NA
..         ...              ...                  ...
130   P059.pdf                1                 CVPR
131   P061.pdf                1                 CVPR
132   P004.pdf                1                 CVPR
133   P089.pdf                1              NeurIPS
134   P058.pdf                1              NeurIPS

[135 rows x 3 columns]


### 11. Saving the Results
Finally, we save the final DataFrame with predicted labels and conference names to a CSV file for further use and display a preview of the results.


In [None]:
# Destination of the exported predictions
output_path = '/content/conference_predictions.csv'

# Write the sorted DataFrame to CSV; index=False leaves the index column out of the file
final_df.to_csv(path_or_buf=output_path, index=False)

# Preview the first 50 rows of the sorted DataFrame to verify the changes made
print(final_df.head(n=50))

    paper_name  predicted_label predicted_conference
71    P001.pdf                1                 CVPR
45    P002.pdf                0                   NA
72    P003.pdf                1                 CVPR
132   P004.pdf                1                 CVPR
63    P005.pdf                1                 CVPR
75    P006.pdf                0                   NA
57    P007.pdf                1                 CVPR
3     P008.pdf                1                EMNLP
95    P009.pdf                0                   NA
105   P010.pdf                1                EMNLP
93    P011.pdf                0                   NA
129   P012.pdf                0                   NA
59    P013.pdf                1                EMNLP
126   P014.pdf                1                 CVPR
87    P015.pdf                1                 CVPR
11    P016.pdf                0                   NA
65    P017.pdf                1                 CVPR
6     P018.pdf                0               

### 12. Logging into Hugging Face Hub
This command logs you into the Hugging Face Hub using your credentials. It prompts you to enter a Hugging Face API token, granting access to private repositories and model sharing functionalities.
```bash
!huggingface-cli login
```


In [None]:
# Log in to the Hugging Face Hub from the notebook using your credentials.
# The CLI prompts for a Hugging Face API token (the input is not echoed) and asks whether to
# also save it as a git credential. The token allows access to your private repositories
# and model sharing functionalities.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineG

### 13. Configuring Git for Credential Storage
This command configures Git to store credentials in plain text to avoid re-entering them for future operations.

In [None]:
# Configures Git (globally, for the current user) to store credentials in plain text
# to avoid re-entering them for future operations.
# NOTE(review): plain-text storage leaves the credentials readable on disk — confirm this
# is acceptable for the environment the notebook runs in.
!git config --global credential.helper store

### 14. Text Extraction, Cleaning, and Data Processing
This step involves extracting the abstract section from PDF papers, cleaning the text by removing punctuation and converting it to lowercase, and then storing the paper names and cleaned abstracts in a pandas DataFrame for further analysis.

In [None]:
import os
import pdfplumber
import string
import pandas as pd

# Clean text function: removes punctuation and converts to lowercase
def clean_text(text):
    """Normalize *text* for downstream processing.

    The text is lowercased, every character listed in ``string.punctuation``
    is removed, and leading/trailing whitespace is stripped.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    lowered = text.lower()
    return lowered.translate(punctuation_table).strip()

# Extract abstract function: finds and returns abstract text from the document
def extract_abstract(text):
    """Return the cleaned abstract section of *text*.

    The abstract is taken to run from the first case-insensitive occurrence of
    "abstract" up to the next occurrence of "introduction" (or to the end of
    the text when no "introduction" follows). The extracted span is passed
    through ``clean_text`` before being returned.

    Returns an empty string when "abstract" does not occur in *text*.
    """
    # Search and slice the SAME (lowercased) string. Lowercasing can change the
    # length of a string for some Unicode characters, so offsets found in the
    # lowercased text are not guaranteed to be valid offsets into the original.
    # The result is lowercased by clean_text anyway, so nothing is lost.
    text_lower = text.lower()

    abstract_start = text_lower.find("abstract")
    if abstract_start == -1:
        return ""

    # Look for "introduction" only after the start of the abstract
    abstract_end = text_lower.find("introduction", abstract_start)
    if abstract_end == -1:
        abstract_end = len(text_lower)

    return clean_text(text_lower[abstract_start:abstract_end])

# Folder that holds the PDF papers to process
folder_path = '/content/drive/MyDrive/Papers'

# One record (paper ID + cleaned abstract) per successfully processed PDF
data = []

for file_name in os.listdir(folder_path):
    # Only PDF files are processed; everything else in the folder is skipped
    if not file_name.endswith('.pdf'):
        continue

    file_path = os.path.join(folder_path, file_name)
    try:
        with pdfplumber.open(file_path) as pdf:
            # Concatenate the text of every page that yields text, one newline after each page
            page_texts = (page.extract_text() for page in pdf.pages)
            text = "".join(page_text + "\n" for page_text in page_texts if page_text)

            # Pull the abstract out of the full document text and record it
            abstract = extract_abstract(text)
            data.append({'Paper_ID': file_name, 'abstract': abstract})
    except Exception as e:
        # Best effort: report the failing file and continue with the next one
        print(f"Error processing {file_name}: {e}")

# Turn the collected records into a DataFrame (the name `data` is reused)
data = pd.DataFrame(data)

# Report how many papers were processed and preview the first 15 rows
print(f"DataFrame created with {len(data)} entries.")
print(data.head(15))

DataFrame created with 135 entries.
   paper_name                                           abstract
0    P062.pdf  abstract\nthispaperexplorestheadaptationoflarg...
1    P126.pdf  abstract\ntheobjectiveofthisresearchistodevelo...
2    P068.pdf  abstract\nto address the challenges of tempora...
3    P008.pdf  abstract\nthisresearchinvestigatesthemechanism...
4    P064.pdf  abstract\nthegoalofthispaperistoempoweropensou...
5    P121.pdf  abstract\ntheobjectiveofthisresearchistoaddres...
6    P018.pdf  abstract\ntheobjectiveofthisresearchistoaddres...
7    P024.pdf  abstract\nthis paper investigates the feasibil...
8    P116.pdf  abstract\ntoenhancetheaccuracyandscalabilityof...
9    P090.pdf  abstract\nthispaperexplorestheadaptationoflarg...
10   P041.pdf  abstract\nmetaversearchaeologyrepresentsaparad...
11   P016.pdf  abstract\nbayesian theology for extraterrestri...
12   P038.pdf  abstract\ngraph neural networks gnns for predi...
13   P081.pdf  abstract\nthispaperdelvesintotheunchart

### 15. DataFrame Concatenation
This step combines columns from multiple DataFrames (`df`, `data`, `new_df`, and `final_df`) into a new DataFrame, `data2`, and displays the first few rows.

In [None]:
# Assemble data2 column-wise from the paper IDs, the cleaned abstracts,
# the predicted labels and the post-processed conference names.
# NOTE(review): concat aligns rows by index label, so this presumably relies on
# `data` sharing its row index/order with `df` — confirm.
data2 = pd.concat(
    objs=[df['Paper_ID'], data['abstract'], new_df['Publishable'], final_df['Conference']],
    axis="columns",
)

data2.head()

Unnamed: 0,paper_name,abstract,predicted_label,predicted_conference
0,P062.pdf,abstract\nthispaperexplorestheadaptationoflarg...,0,CVPR
1,P126.pdf,abstract\ntheobjectiveofthisresearchistodevelo...,0,CVPR
2,P068.pdf,abstract\nto address the challenges of tempora...,0,CVPR
3,P008.pdf,abstract\nthisresearchinvestigatesthemechanism...,1,EMNLP
4,P064.pdf,abstract\nthegoalofthispaperistoempoweropensou...,0,CVPR


### 16. Model Initialization and Rationale Generation
This step loads the T5 model (`t5-small`) from Hugging Face and sets up a text-to-text generation pipeline. It then generates a rationale for each paper in the DataFrame based on its abstract and predicted conference, storing the results in a new column.

In [None]:
import os
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

import torch

# Read the Hugging Face access token from the environment instead of hard-coding it.
# SECURITY: a literal access token used to be embedded here; a token that has been
# committed to a notebook must be treated as leaked and revoked in the Hugging Face settings.
# When HUGGINGFACE_TOKEN is not set, `token` is None and the libraries fall back to
# their default behavior (e.g. the credentials saved by `huggingface-cli login`).
token = os.environ.get("HUGGINGFACE_TOKEN")

# Load T5 model and tokenizer
model_name = "t5-small"

# `token=` replaces the deprecated `use_auth_token=` argument
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=token)

# Use the first GPU when one is available, otherwise run on CPU (-1)
device = 0 if torch.cuda.is_available() else -1
llama_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=device)

# Build the per-paper table used for rationale generation.
# `df`, `data`, `new_df` and `final_df` must already be defined by the earlier steps.
# The column names match the ones used throughout the pipeline (Paper_ID, Publishable,
# Conference); 'Conference' is taken from final_df so that non-publishable papers carry
# the "NA" marker assigned during post-processing.
data2 = pd.concat([df['Paper_ID'], data['abstract'], new_df['Publishable'], final_df['Conference']], axis=1)
data2.columns = ['Paper_ID', 'abstract', 'Publishable', 'Conference']

# One rationale string per row of data2, in row order
rationales = []

for _, row in data2.iterrows():
    paper_name = row['Paper_ID']
    abstract = row['abstract']
    predicted_conference = row['Conference']

    # Skip papers without a conference: either a missing value or the literal "NA"
    # string that post-processing assigns to non-publishable papers.
    if pd.isna(predicted_conference) or predicted_conference == "NA":
        rationale = "No rationale generated because predicted_conference is NaN."
    else:
        try:
            # Generate rationale using T5
            input_text = f"""
            Based on the abstract and the predicted conference classification, provide a rationale:

            Paper_ID: {paper_name}
            Abstract: {abstract}
            Conference: {predicted_conference}

            The rationale should explain why this paper is suitable for the predicted conference, summarizing the paper's importance, addressed problem, and contribution.
            """
            llama_response = llama_pipeline(input_text, max_length=500, do_sample=True, temperature=0.7)
            rationale = llama_response[0]['generated_text'].strip()
        except Exception as e:
            # Best effort: a failure for one paper must not abort the whole run
            print(f"Error generating rationale for paper '{paper_name}': {e}")
            rationale = "Error generating rationale."

    rationales.append(rationale)

# Add rationales to the new DataFrame
data2['Rationale'] = rationales

# Display the updated DataFrame
print(f"Rationales generated for {len(data2)} papers.")
print(data2.head())


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Keep only the identifier, label and conference columns.
# Selecting several columns requires ONE list of column names inside the indexing
# brackets; a tuple mixing a string with nested lists is not a valid key.
# NOTE(review): this drops the 'Rationale' column before the CSV export — confirm that is intended.
data2 = data2[['Paper_ID', 'Publishable', 'Conference']]

In [None]:
# Export data2 to CSV; index=False leaves the index column out of the file
data2.to_csv(path_or_buf='/content/results.csv', index=False)