In [1]:
pip install python-docx


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from PyPDF2 import PdfReader
from docx import Document
from autogluon.tabular import TabularPredictor
from sklearn.metrics import classification_report

# Function to extract text from PDFs
def extract_text_from_pdfs(folder_path):
    data = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.pdf'):
                file_path = os.path.join(root, file)
                reader = PdfReader(file_path)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()
                # Extract category and conference from folder structure
                parts = root.split(os.sep)
                if len(parts) >= 2:
                    publishable_status = parts[-2]
                    conference = parts[-1] if publishable_status == "Publishable" else "Non-Publishable"
                else:
                    publishable_status = "Unknown"
                    conference = "Unknown"
                paper_id = os.path.splitext(file)[0]
                is_publishable = 1 if publishable_status == "Publishable" else 0
                data.append({'Paper ID': paper_id, 'Publishable': is_publishable, 'Conference': conference, 'Rationale': text})
    return pd.DataFrame(data)

# Function to extract text from DOCX files
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

# Path to the folder containing PDFs and DOCX files
pdf_folder_path = r"C:\Users\harsh\OneDrive\Desktop\hack iitk\KDSH_2025_Dataset-20250106T145006Z-001\KDSH_2025_Dataset"

# Extract text and create dataset
data = extract_text_from_pdfs(pdf_folder_path)

# Split the data into training and testing sets
X = data[['Rationale']]
y = data[['Publishable', 'Conference']]

# Create a train-test split (stratify on 'Publishable' as it is the binary target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y['Publishable'])

# Combine the target variables into the training set for AutoGluon
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Train a model with AutoGluon for 'Publishable'
publishable_predictor = TabularPredictor(label='Publishable').fit(train_data)

# Train a model with AutoGluon for 'Conference'
conference_predictor = TabularPredictor(label='Conference').fit(train_data)

# Function to prepare data for prediction
def prepare_data_for_prediction(df, predictor):
    """Ensure the input DataFrame matches the expected columns."""
    expected_columns = predictor.feature_metadata_in.get_features()
    # Add missing columns with default values
    for col in expected_columns:
        if col not in df.columns:
            df.loc[:, col] = None  # Use .loc to avoid SettingWithCopyWarning
    return df[expected_columns]

# Prepare data for predictions
X_test_publishable = prepare_data_for_prediction(test_data[['Rationale']].copy(), publishable_predictor)
X_test_conference = prepare_data_for_prediction(test_data[['Rationale']].copy(), conference_predictor)

# Make predictions
y_pred_publishable = publishable_predictor.predict(X_test_publishable)
y_pred_conference = conference_predictor.predict(X_test_conference)

# Evaluation
print("Publishability Classification Report:")
print(classification_report(test_data['Publishable'], y_pred_publishable))

print("Conference Classification Report:")
print(classification_report(test_data['Conference'], y_pred_conference))

# Function for inference on a single document (PDF or DOCX)
def classify_document(file_path):
    # Check the file extension and process accordingly
    file_extension = os.path.splitext(file_path)[1].lower()
    
    if file_extension == '.pdf':
        reader = PdfReader(file_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    elif file_extension == '.docx':
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file type: Only PDF and DOCX are supported.")
    
    # Create a DataFrame with the text for prediction
    sample_df = pd.DataFrame({'Rationale': [text]})
    
    # Prepare data for predictions
    sample_df_publishable = prepare_data_for_prediction(sample_df.copy(), publishable_predictor)
    sample_df_conference = prepare_data_for_prediction(sample_df.copy(), conference_predictor)
    
    # Predict publishability
    publishable_prediction = publishable_predictor.predict(sample_df_publishable)
    
    if publishable_prediction.iloc[0] == 1:  # If it is publishable, predict the conference
        conference_prediction = conference_predictor.predict(sample_df_conference)
        return f"Publishable, Conference: {conference_prediction.iloc[0]}"
    else:
        return "Non-Publishable"

# Example usage for a single document (PDF or DOCX)
example_document_path = r"C:\Users\harsh\OneDrive\Desktop\paper\14.pdf"
result = classify_document(example_document_path)
print("Classification Result for Example Document:", result)


No path specified. Models will be saved in: "AutogluonModels\ag-20250114_060234"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          16
Memory Avail:       3.22 GB / 15.69 GB (20.5%)
Disk Space Avail:   451.92 GB / 815.10 GB (55.4%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	preset

Publishability Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.68      0.79        28
           1       0.10      0.50      0.17         2

    accuracy                           0.67        30
   macro avg       0.53      0.59      0.48        30
weighted avg       0.89      0.67      0.75        30

Conference Classification Report:
                 precision    recall  f1-score   support

           CVPR       0.00      0.00      0.00         1
          EMNLP       0.00      0.00      0.00         1
Non-Publishable       0.93      1.00      0.97        28

       accuracy                           0.93        30
      macro avg       0.31      0.33      0.32        30
   weighted avg       0.87      0.93      0.90        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Result for Example Document: Publishable, Conference: Non-Publishable
