# Task 1: Predicting the publishability of papers

### PDF Data Extraction from the directory


In [2]:
import os
from PyPDF2 import PdfReader

def extract_text_from_pdfs(directory):
    """Extract the text of every PDF found directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Folder to scan. Only entries whose name ends with ``.pdf`` are read;
        sub-directories are not traversed.

    Returns
    -------
    dict
        Maps each PDF file name (not the full path) to the concatenated text
        of all of its pages. Keys are inserted in sorted file-name order.
    """
    paper_texts = {}
    # os.listdir returns entries in arbitrary, platform-dependent order.
    # Sorting makes the key order deterministic, which matters because a later
    # cell pairs these papers with a purely positional list of labels.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".pdf"):
            continue
        pdf_path = os.path.join(directory, filename)
        reader = PdfReader(pdf_path)
        # Defensive: if a page yields None instead of a string (no extractable
        # text), count it as empty rather than letting str.join raise.
        paper_texts[filename] = "".join(
            page.extract_text() or "" for page in reader.pages
        )
    return paper_texts

# Read the text of every paper in the training folder, keyed by PDF file name.
pdf_directory = "./Dataset/Train"
papers = extract_text_from_pdfs(directory=pdf_directory)


### Printing the Texts from PDFs

In [3]:
papers

{'R001.pdf': 'Transdimensional Properties of Graphite in Relation\nto Cheese Consumption on Tuesday Afternoons\nAbstract\nGraphite research has led to discoveries about dolphins and their penchant for\ncollecting rare flowers, which bloom only under the light of a full moon, while\nsimultaneously revealing the secrets of dark matter and its relation to the perfect\nrecipe for chicken parmesan, as evidenced by the curious case of the missing socks\nin the laundry basket, which somehow correlates with the migration patterns of but-\nterflies and the art of playing the harmonica underwater, where the sounds produced\nare eerily similar to the whispers of ancient forests, whispering tales of forgotten\ncivilizations and their advanced understanding of quantum mechanics, applied to\nthe manufacture of sentient toasters that can recite Shakespearean sonnets, all of\nwhich is connected to the inherent properties of graphite and its ability to conduct\nthe thoughts of extraterrestrial beings, 

### Preprocessing and Embedding Generation

In [4]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence-embedding model that generate_embeddings() uses.
# NOTE(review): sentence-transformers models truncate inputs longer than their
# max_seq_length, so for full papers presumably only the beginning of each
# text influences the embedding — TODO confirm and consider chunking.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed each document with the shared sentence-transformer `model`
def generate_embeddings(texts):
    """Return a dict mapping every key of ``texts`` to its embedding.

    Each value is encoded on its own with
    ``model.encode(value, convert_to_tensor=True)``; the key order of the
    result follows ``texts``.
    """
    encoded = {}
    for name, document in texts.items():
        encoded[name] = model.encode(document, convert_to_tensor=True)
    return encoded

# Embed the extracted paper texts, keeping the file-name keys.
embeddings = generate_embeddings(texts=papers)


  from .autonotebook import tqdm as notebook_tqdm





In [5]:
embeddings

{'R001.pdf': tensor([-9.2503e-02, -6.6373e-02,  2.4385e-02,  1.9213e-02, -1.7798e-02,
         -4.1316e-02,  1.9997e-02, -6.5262e-02,  8.9183e-03,  5.2738e-02,
         -3.0771e-02, -7.4462e-02, -1.3196e-01,  5.4801e-02,  1.1418e-02,
          3.2658e-02,  4.1190e-04, -2.1285e-03, -1.7513e-02, -2.4025e-02,
          6.4416e-02, -4.3398e-02,  3.7561e-02,  3.1216e-02,  3.3247e-02,
          9.0791e-02, -5.4712e-02, -1.1331e-02, -3.2413e-03, -2.0788e-02,
         -7.3937e-03,  5.3922e-02, -3.0311e-02,  5.1328e-03, -1.7094e-02,
          3.4007e-02,  5.8677e-02, -5.9332e-02,  9.5708e-02,  1.9097e-02,
          2.2923e-02, -5.5836e-03, -4.9878e-03,  5.8896e-02, -5.6386e-02,
          9.4374e-03, -3.7051e-02,  7.0653e-02, -4.3074e-02, -2.1805e-02,
         -7.9256e-02,  8.1529e-03, -5.5520e-02, -6.6804e-02,  5.7210e-02,
          5.7920e-02,  1.0383e-02, -2.7979e-02,  8.9349e-02, -4.9612e-02,
          8.9862e-02, -6.3234e-03,  2.3751e-02, -4.8274e-03,  6.4086e-02,
         -3.3132e-02, -6.0

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [7]:

# Ground-truth labels for the training papers: 1 = publishable,
# 0 = non-publishable (the selection step further down keeps the '1' rows).
# The list is positional: labels[i] belongs to the i-th key of `embeddings`,
# so it is only valid while the papers are loaded in that same order.
labels = [0,0,0,0,0,1,1,1,1,1,1,1,1,1,1]

# Fail early with a clear message if the hand-written labels and the embedded
# papers have drifted apart (e.g. a PDF was added to or removed from the folder).
assert len(labels) == len(embeddings), (
    f"{len(labels)} labels for {len(embeddings)} embedded papers"
)

# Feature matrix: one embedding vector per paper, in `embeddings` key order.
X = list(embeddings.values())


In [8]:
X

[tensor([-9.2503e-02, -6.6373e-02,  2.4385e-02,  1.9213e-02, -1.7798e-02,
         -4.1316e-02,  1.9997e-02, -6.5262e-02,  8.9183e-03,  5.2738e-02,
         -3.0771e-02, -7.4462e-02, -1.3196e-01,  5.4801e-02,  1.1418e-02,
          3.2658e-02,  4.1190e-04, -2.1285e-03, -1.7513e-02, -2.4025e-02,
          6.4416e-02, -4.3398e-02,  3.7561e-02,  3.1216e-02,  3.3247e-02,
          9.0791e-02, -5.4712e-02, -1.1331e-02, -3.2413e-03, -2.0788e-02,
         -7.3937e-03,  5.3922e-02, -3.0311e-02,  5.1328e-03, -1.7094e-02,
          3.4007e-02,  5.8677e-02, -5.9332e-02,  9.5708e-02,  1.9097e-02,
          2.2923e-02, -5.5836e-03, -4.9878e-03,  5.8896e-02, -5.6386e-02,
          9.4374e-03, -3.7051e-02,  7.0653e-02, -4.3074e-02, -2.1805e-02,
         -7.9256e-02,  8.1529e-03, -5.5520e-02, -6.6804e-02,  5.7210e-02,
          5.7920e-02,  1.0383e-02, -2.7979e-02,  8.9349e-02, -4.9612e-02,
          8.9862e-02, -6.3234e-03,  2.3751e-02, -4.8274e-03,  6.4086e-02,
         -3.3132e-02, -6.0742e-02,  1.

### Examining individual models' performance (XGBoost & SVC)

In [10]:

import xgboost as xgb
# Hold out a third of the labelled papers for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.33, random_state=42
)

# Standardise the features using statistics learned from the training split only;
# `sc` is reused later to scale the unlabelled papers.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Two baselines: a linear SVM and an XGBoost classifier with default settings.
classifier_svc = SVC(kernel='linear', C=10)
classifier_xgb = xgb.XGBClassifier()
classifier_svc.fit(X_train, y_train)
classifier_xgb.fit(X_train, y_train)

# Predictions on both splits, per model.
y_train_pred_svc, y_test_pred_svc = (
    classifier_svc.predict(split) for split in (X_train, X_test)
)
y_train_pred_xgb, y_test_pred_xgb = (
    classifier_xgb.predict(split) for split in (X_train, X_test)
)

# Accuracy, reported as train/test for each model.
accuracy1, accuracy2 = (
    accuracy_score(y_train, y_train_pred_svc),
    accuracy_score(y_test, y_test_pred_svc),
)
print(f"Classification Accuracy train/test(SVC): {accuracy1:.4f}/{accuracy2:.4f}")

accuracy3, accuracy4 = (
    accuracy_score(y_train, y_train_pred_xgb),
    accuracy_score(y_test, y_test_pred_xgb),
)
print(f"Classification Accuracy train/test(XGB): {accuracy3:.4f}/{accuracy4:.4f}")


Classification Accuracy train/test(SVC): 1.0000/1.0000
Classification Accuracy train/test(XGB): 1.0000/1.0000


In [11]:
# F1 score of each baseline, reported as train/test like the accuracies above.
print(
    f"F1 Score train/test(SVC): "
    f"{f1_score(y_train, y_train_pred_svc):.4f}/{f1_score(y_test, y_test_pred_svc):.4f}"
)
print(
    f"F1 Score train/test(XGB): "
    f"{f1_score(y_train, y_train_pred_xgb):.4f}/{f1_score(y_test, y_test_pred_xgb):.4f}"
)

F1 Score train/test(SVC): 1.0000/1.0000
F1 Score train/test(XGB): 1.0000/1.0000


### GridSearch

                a)SVC

In [12]:
from sklearn.model_selection import GridSearchCV

# Estimator to tune, and its search space: regularisation strength crossed
# with each of the four listed kernels.
clf_svc = SVC()
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

# 2-fold cross-validated grid search on the training split, ranked by F1.
grid_search = GridSearchCV(clf_svc, param_grid, scoring='f1', cv=2)
grid_search.fit(X_train, y_train)

# Best-scoring SVC found by the search.
best_svc = grid_search.best_estimator_

                b) RandomForestClassifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest: ensemble size and maximum tree depth.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 20, 30, 40, 50]
}

# Fix the seed (same value as the train/test split) so that the forest's
# randomness — and therefore the configuration the search selects — is
# reproducible after a kernel restart.
clf_rf = RandomForestClassifier(random_state=42)

# 2-fold cross-validated grid search on the training split, scored by F1.
grid_search_rf = GridSearchCV(estimator=clf_rf, param_grid=param_grid, cv=2, scoring='f1')

grid_search_rf.fit(X_train, y_train)

# Best-scoring random forest found by the search.
best_rf = grid_search_rf.best_estimator_

                c)Neural Network

In [14]:
from sklearn.neural_network import MLPClassifier

# Seeded (same value as the train/test split) so that weight initialisation is
# repeatable and the grid search selects the same network on every run.
# Named like its siblings clf_svc / clf_rf.
clf_nn = MLPClassifier(random_state=42)
a = clf_nn  # previous single-letter name, kept as an alias for compatibility

# Search space: width of the single hidden layer and its activation function.
param_grid = {
    'hidden_layer_sizes': [(100,), (200,), (300,)],
    'activation': ['identity', 'relu', 'logistic', 'tanh'],
}

# 2-fold cross-validated grid search on the training split, scored by F1.
grid_search_nn = GridSearchCV(estimator=clf_nn, param_grid=param_grid, cv=2, scoring='f1')

grid_search_nn.fit(X_train, y_train)

# Best-scoring network found by the search.
best_nn = grid_search_nn.best_estimator_

### Combining in a VotingClassifier

In [15]:
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report

# Ensemble members: the best estimators from the three grid searches.
clf1, clf2, clf3 = best_svc, best_rf, best_nn

# Hard voting: the ensemble predicts the class chosen by most members.
voting_clf = VotingClassifier(
    estimators=[('svc', clf1), ('rf', clf2), ('nn', clf3)],
    voting='hard',
)
voting_clf.fit(X_train, y_train)

# Predictions of the ensemble on both splits.
y_train_pred_voting = voting_clf.predict(X_train)
y_test_pred_voting = voting_clf.predict(X_test)

# Accuracy as train/test. Note: accuracy1/accuracy2 are rebound here and no
# longer hold the SVC baseline values computed in the earlier cell.
accuracy1 = accuracy_score(y_train, y_train_pred_voting)
accuracy2 = accuracy_score(y_test, y_test_pred_voting)
print(f"Classification Accuracy train/test(Voting): {accuracy1:.4f}/{accuracy2:.4f}")

# Per-class precision / recall / F1 on the held-out split.
print("Classification Report (Voting Classifier):")
print(classification_report(y_test, y_test_pred_voting))

Classification Accuracy train/test(Voting): 1.0000/1.0000
Classification Report (Voting Classifier):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         4

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5



### Generate prediction for Test papers

In [16]:
# Switch to the papers that need a prediction.
# Note: this rebinds `papers`, replacing the training texts loaded earlier.
pdf_directory = "./Dataset/Papers"
papers = extract_text_from_pdfs(directory=pdf_directory)

In [17]:
# Embed the papers to be scored. Note: this rebinds `embeddings`, which until
# now held the training-set embeddings.
embeddings = generate_embeddings(texts=papers)


In [18]:
import numpy as np
# Column vector of paper file names, in the same order as the feature rows below.
p = np.array(list(embeddings)).reshape(-1, 1)
# Scale the embeddings with `sc`, the scaler already fitted on the training split.
test_papers = sc.transform(list(embeddings.values()))


### Prediction

In [19]:

# Predict a 0/1 class for every paper with the voting ensemble.
predictions_clf = voting_clf.predict(test_papers)

# Pair each file name with its prediction. Only the first column of `p` (the
# names) is reused, so re-running this cell replaces the prediction column
# instead of appending one more column each time. Joining a string column with
# integers makes NumPy store the predictions as strings ('0' / '1'), which the
# later `== '1'` selection relies on.
p = np.concatenate((p[:, :1], predictions_clf.reshape(-1, 1)), axis=1)

In [20]:
p

array([['P001.pdf', '1'],
       ['P002.pdf', '0'],
       ['P003.pdf', '1'],
       ['P004.pdf', '1'],
       ['P005.pdf', '1'],
       ['P006.pdf', '0'],
       ['P007.pdf', '1'],
       ['P008.pdf', '1'],
       ['P009.pdf', '1'],
       ['P010.pdf', '1'],
       ['P011.pdf', '1'],
       ['P012.pdf', '1'],
       ['P013.pdf', '1'],
       ['P014.pdf', '1'],
       ['P015.pdf', '1'],
       ['P016.pdf', '1'],
       ['P017.pdf', '1'],
       ['P018.pdf', '1'],
       ['P019.pdf', '1'],
       ['P020.pdf', '1'],
       ['P021.pdf', '1'],
       ['P022.pdf', '0'],
       ['P023.pdf', '1'],
       ['P024.pdf', '1'],
       ['P025.pdf', '1'],
       ['P026.pdf', '0'],
       ['P027.pdf', '1'],
       ['P028.pdf', '1'],
       ['P029.pdf', '1'],
       ['P030.pdf', '1'],
       ['P031.pdf', '1'],
       ['P032.pdf', '0'],
       ['P033.pdf', '1'],
       ['P034.pdf', '1'],
       ['P035.pdf', '1'],
       ['P036.pdf', '0'],
       ['P037.pdf', '1'],
       ['P038.pdf', '0'],
       ['P03

### Selecting the publishable papers for *Task 2*

In [24]:
# Boolean mask (as a plain Python list) marking the rows whose prediction
# column is the string '1', i.e. the papers predicted publishable.
index_1 = (p[:, 1] == '1').tolist()

print(len(index_1))
print(index_1)

# Keep only the [file name, prediction] rows selected by the mask.
p_publishable = p[index_1]

135
[True, False, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, False, True, True, True, True, True, False, True, True, True, False, True, False, False, True, False, True, False, True, True, True, False, False, True, True, True, True, False, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, False, True, False, True, True, False, True, True, True, True, False, True, True, True, True, True, True, True, False, True, False, False, False, True, False, True, True, True, True, False, True, True, True, True, True, True, True, True, False, True, True, True, True, False, True, True, True, True, True, True, True, True, True, False, True, True, True, True, False, True]


In [26]:
p_publishable

array([['P001.pdf', '1'],
       ['P003.pdf', '1'],
       ['P004.pdf', '1'],
       ['P005.pdf', '1'],
       ['P007.pdf', '1'],
       ['P008.pdf', '1'],
       ['P009.pdf', '1'],
       ['P010.pdf', '1'],
       ['P011.pdf', '1'],
       ['P012.pdf', '1'],
       ['P013.pdf', '1'],
       ['P014.pdf', '1'],
       ['P015.pdf', '1'],
       ['P016.pdf', '1'],
       ['P017.pdf', '1'],
       ['P018.pdf', '1'],
       ['P019.pdf', '1'],
       ['P020.pdf', '1'],
       ['P021.pdf', '1'],
       ['P023.pdf', '1'],
       ['P024.pdf', '1'],
       ['P025.pdf', '1'],
       ['P027.pdf', '1'],
       ['P028.pdf', '1'],
       ['P029.pdf', '1'],
       ['P030.pdf', '1'],
       ['P031.pdf', '1'],
       ['P033.pdf', '1'],
       ['P034.pdf', '1'],
       ['P035.pdf', '1'],
       ['P037.pdf', '1'],
       ['P040.pdf', '1'],
       ['P042.pdf', '1'],
       ['P044.pdf', '1'],
       ['P045.pdf', '1'],
       ['P046.pdf', '1'],
       ['P049.pdf', '1'],
       ['P050.pdf', '1'],
       ['P05

In [27]:
# Embeddings of the papers predicted publishable, in `p_publishable` row order.
task_2_data = [embeddings[paper_name] for paper_name in p_publishable[:, 0]]

In [29]:
len(task_2_data)

106

In [30]:
task_2_data

[tensor([ 6.5142e-02, -6.0072e-02, -4.0439e-02, -1.6616e-02,  1.2649e-01,
         -9.9796e-03,  8.5635e-02, -1.5612e-01,  6.0963e-02,  3.7569e-02,
          6.0542e-02, -8.8341e-03,  5.7360e-03,  1.3812e-02, -2.7283e-02,
         -8.5253e-03,  4.9679e-02,  3.8285e-02,  1.4050e-02,  5.9644e-02,
          1.3557e-03,  6.4241e-02,  4.9164e-02,  1.5774e-01, -3.0661e-02,
          1.1610e-01,  4.2304e-02,  8.5001e-03, -1.0400e-01, -3.3234e-02,
          8.6490e-02,  1.4310e-02, -1.1302e-02,  1.8762e-02,  1.8514e-02,
         -1.5397e-02,  1.2240e-02, -9.1556e-03, -1.8046e-02,  2.3114e-02,
          9.3080e-03,  1.0647e-02,  5.4799e-02, -1.6336e-02,  3.1754e-02,
          8.7522e-02,  6.6456e-03, -4.4311e-02,  7.1369e-02, -3.3341e-03,
         -5.1928e-02, -5.0255e-02, -1.2300e-02,  1.4918e-02,  3.1931e-02,
          4.5066e-02, -4.5985e-02, -4.3673e-02,  6.4325e-02, -8.7497e-02,
          5.1939e-02, -1.2300e-02,  4.2013e-02,  3.9701e-03, -7.1178e-02,
         -5.3118e-02, -3.5860e-02,  1.

In [69]:
import pandas as pd

# Tabulate the [file name, prediction] pairs under named columns.
# Note: this rebinds `p` from a NumPy array to a DataFrame.
p = pd.DataFrame(data=p, columns=['Paper', 'Category'])

In [70]:
p

Unnamed: 0,Paper,Category
0,P001.pdf,1
1,P002.pdf,0
2,P003.pdf,1
3,P004.pdf,1
4,P005.pdf,1
...,...,...
130,P131.pdf,1
131,P132.pdf,1
132,P133.pdf,1
133,P134.pdf,0


Generate CSV file

In [71]:
# Write the predictions into the project's Dataset folder. A path relative to
# the working directory (like "./Dataset/Train" and "./Dataset/Papers" used
# earlier) keeps the notebook runnable on other machines, unlike a
# user-specific absolute path.
outfile_path = "./Dataset/output.csv"
# index=False leaves the DataFrame's integer index out of the CSV.
p.to_csv(outfile_path, index=False)

****************************************** End of TASK-1 ***************************************************