In [1]:
import pandas as pd
import csv
import tqdm
import json
import re
import ast
import os
import torch

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Stage 1 - cluster filter.
# For every row, embed the submission and each work in the author's publication
# history, cluster those vectors with KMeans, and keep only the prior works that
# land in the same cluster as the submission. The result is cached on disk.
model = SentenceTransformer('all-MiniLM-L6-v2')
if os.path.exists('./data/cluster_filtering.csv'):
    # A previous run already produced the filtered dataset; reuse it unchanged.
    embeddings_dataset = pd.read_csv('./data/cluster_filtering.csv')
else:
    embeddings_dataset = pd.read_csv('./data/mini_dataset_v3.csv')
    embeddings_dataset['cluster_filter'] = None
    same = 0   # kept works whose DOI equals the row's own 'doi'
    total = 0  # rows processed
    for index, row in tqdm.tqdm(embeddings_dataset.iterrows(), total=embeddings_dataset.shape[0]):
        # Both history columns are stored as Python-literal strings in the CSV;
        # parse them and write the parsed objects back into the frame.
        history_embedding = ast.literal_eval(row['authorPublicationHistory_embedding'])
        embeddings_dataset.at[index, 'authorPublicationHistory_embedding'] = history_embedding
        history = ast.literal_eval(row['authorPublicationHistory'])
        embeddings_dataset.at[index, 'authorPublicationHistory'] = history

        # Position 0 is always the submission; positions 1.. follow `history` order.
        # Each vector is the title embedding concatenated with the abstract embedding.
        embeddings = [
            np.concatenate(
                (model.encode(row['SubmissionTitle']), model.encode(row['SubmissionAbstract'])),
                axis=None,
            )
        ]
        for work in history:
            embeddings.append(
                np.concatenate(
                    (model.encode(work['title']), model.encode(work['abstract'])),
                    axis=None,
                )
            )

        # Two clusters once there are at least three vectors, otherwise one.
        # (The original picked k = 0 for a row with an empty history, which
        # KMeans rejects; a single cluster is the smallest valid choice.)
        k = 2 if len(embeddings) >= 3 else 1

        kmeans = KMeans(n_clusters=k, random_state=42)
        clusters = kmeans.fit_predict(embeddings)

        submission_cluster = clusters[0]
        clusters_authors = []
        # A dedicated loop variable here: the original reused `i`, silently
        # overwriting its own outer counter.
        for position, label in enumerate(clusters[1:]):
            if label == submission_cluster:
                kept_work = history[position]
                clusters_authors.append(kept_work)
                if kept_work['doi'] == row['doi']:
                    same += 1

        embeddings_dataset.at[index, 'cluster_filter'] = clusters_authors
        total += 1

    # Persist only freshly computed results; a cache hit has nothing new to write.
    embeddings_dataset.to_csv('data/cluster_filtering.csv', index=False)

print(embeddings_dataset.head())
print(embeddings_dataset.info())

[WinError 2] The system cannot find the file specified
  File "c:\Users\hsawhney\anaconda3\envs\ml-project-2\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\hsawhney\anaconda3\envs\ml-project-2\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\hsawhney\anaconda3\envs\ml-project-2\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\hsawhney\anaconda3\envs\ml-project-2\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
100%|██████████| 50/50 [00:23<00:00,  2.09it/s]


                 SubmissionID  SubmissionYear  \
0             /10.1101/188524            2017   
1             /10.1101/188524            2017   
2             /10.1101/783175            2019   
3  /10.1101/2020.06.30.176537            2020   
4  /10.1101/2020.06.30.176537            2020   

                                     SubmissionTitle  \
0  characterizing highly dynamic conformational s...   
1  characterizing highly dynamic conformational s...   
2  dynamic reconfiguration fragmentation and inte...   
3  attenuated subcomponent vaccine design targeti...   
4  attenuated subcomponent vaccine design targeti...   

                                  SubmissionAbstract firstName middleName  \
0  biomacromolecules carry out complicated functi...     eitan        NaN   
1  biomacromolecules carry out complicated functi...  antonino        NaN   
2  general anesthetics are routinely used to indu...    corson         N.   
3  the novel coronavirus disease covid19 caused b...    onye

In [11]:
# Stage 2 - TF-IDF shortlist.
# Fit one TF-IDF vocabulary over every submission and every cluster-filtered
# work, score each candidate work against its submission, and keep the best few.
dataset = pd.read_csv('data/cluster_filtering.csv')
# 'cluster_filter' is serialised as a Python-literal string in the CSV; parse the
# whole column once instead of re-parsing it inside each loop.
dataset['cluster_filter'] = dataset['cluster_filter'].apply(ast.literal_eval)
dataset['tfidf_cosine_similarity'] = None

vectorizer = TfidfVectorizer()
# Combine all texts for fitting the vectorizer
all_texts = []
for index, row in tqdm.tqdm(dataset.iterrows(), total=dataset.shape[0]):
    all_texts.append(row['SubmissionTitle'])
    all_texts.append(row['SubmissionAbstract'])
    for authorWorks in row['cluster_filter']:
        all_texts.append(authorWorks['title'])
        all_texts.append(authorWorks['abstract'])

vectorizer.fit(all_texts)

for index, row in tqdm.tqdm(dataset.iterrows(), total=dataset.shape[0]):
    titletfidf = vectorizer.transform([row['SubmissionTitle']])
    abstracttfidf = vectorizer.transform([row['SubmissionAbstract']])

    all_cosine_similarities = {}  # DOI -> weighted similarity (plain scalar)
    first_work_by_doi = {}        # DOI -> first candidate object carrying that DOI
    for authorWorks in row['cluster_filter']:
        authDOI = authorWorks['doi']
        authorTitleTfidf = vectorizer.transform([authorWorks['title']])
        authorAbstractTfidf = vectorizer.transform([authorWorks['abstract']])

        # cosine_similarity returns a 1x1 matrix for a single pair; take the
        # scalar so the ranking below compares numbers rather than arrays.
        title_cosine_similarity = cosine_similarity(titletfidf, authorTitleTfidf)[0, 0]
        abstract_cosine_similarity = cosine_similarity(abstracttfidf, authorAbstractTfidf)[0, 0]

        # Weighted blend: 30% title similarity, 70% abstract similarity.
        all_cosine_similarities[authDOI] = 0.3 * title_cosine_similarity + 0.7 * abstract_cosine_similarity
        # If a DOI repeats, the score above is overwritten by the later entry
        # while the object kept is the first one, as in the original lookup.
        first_work_by_doi.setdefault(authDOI, authorWorks)

    # Keep the top k most similar works
    if len(all_cosine_similarities) == 1:
        k = 1
    elif len(all_cosine_similarities) < 4:
        k = 2
    else:
        k = 3

    # Rank DOIs by descending score and map the best k back to their objects.
    ranked = sorted(all_cosine_similarities.items(), key=lambda item: item[1], reverse=True)
    top_k_auths = [first_work_by_doi[ranked_doi] for ranked_doi, _ in ranked[:k]]

    dataset.at[index, 'tfidf_cosine_similarity'] = top_k_auths
dataset.to_csv('data/tfidf_cosine_similarity.csv', index=False)
print(dataset.head())
print(dataset.info())

100%|██████████| 50/50 [00:00<00:00, 1596.31it/s]
100%|██████████| 50/50 [00:00<00:00, 64.52it/s]


                 SubmissionID  SubmissionYear  \
0             /10.1101/188524            2017   
1             /10.1101/188524            2017   
2             /10.1101/783175            2019   
3  /10.1101/2020.06.30.176537            2020   
4  /10.1101/2020.06.30.176537            2020   

                                     SubmissionTitle  \
0  characterizing highly dynamic conformational s...   
1  characterizing highly dynamic conformational s...   
2  dynamic reconfiguration fragmentation and inte...   
3  attenuated subcomponent vaccine design targeti...   
4  attenuated subcomponent vaccine design targeti...   

                                  SubmissionAbstract firstName middleName  \
0  biomacromolecules carry out complicated functi...     eitan        NaN   
1  biomacromolecules carry out complicated functi...  antonino        NaN   
2  general anesthetics are routinely used to indu...    corson         N.   
3  the novel coronavirus disease covid19 caused b...    onye

In [16]:
# Stage 3 - dense re-ranking.
# Re-score the TF-IDF shortlist with sentence embeddings and record the DOI of
# the single best-scoring candidate as the predicted published work.
model = SentenceTransformer('all-mpnet-base-v2')
dataset['predicted_published_work'] = None
for index, row in tqdm.tqdm(dataset.iterrows(), total=dataset.shape[0]):
    candidates = row['tfidf_cosine_similarity']
    if isinstance(candidates, str):
        # The shortlist is still a Python-literal string when `dataset` was
        # reloaded from CSV rather than built in memory by the previous cell.
        candidates = ast.literal_eval(candidates)
        dataset.at[index, 'tfidf_cosine_similarity'] = candidates
    if not candidates:
        # Nothing to rank: keep the prediction as None instead of failing on
        # an empty ranking.
        continue

    # Encode the submission once per row; add a leading batch dimension so the
    # tensors line up with torch's cosine_similarity (default dim=1).
    title_embedding_tensor = torch.tensor(model.encode(row['SubmissionTitle'])).unsqueeze(0)
    abstract_embedding_tensor = torch.tensor(model.encode(row['SubmissionAbstract'])).unsqueeze(0)

    author_all_works_similarity = {}  # DOI -> weighted similarity
    for authorWorks in candidates:
        authorTitleEmbedding_tensor = torch.tensor(model.encode(authorWorks['title'])).unsqueeze(0)
        authorAbstractEmbedding_tensor = torch.tensor(model.encode(authorWorks['abstract'])).unsqueeze(0)

        title_cosine_similarity = torch.nn.functional.cosine_similarity(
            title_embedding_tensor, authorTitleEmbedding_tensor
        )
        abstract_cosine_similarity = torch.nn.functional.cosine_similarity(
            abstract_embedding_tensor, authorAbstractEmbedding_tensor
        )
        # Weighted blend: 30% title similarity, 70% abstract similarity.
        author_all_works_similarity[authorWorks['doi']] = (
            0.3 * title_cosine_similarity.item() + 0.7 * abstract_cosine_similarity.item()
        )

    # Highest score wins; on a tie the earliest-inserted DOI is chosen, which is
    # what a stable descending sort followed by taking element 0 would give.
    best_doi = max(author_all_works_similarity, key=author_all_works_similarity.get)
    dataset.at[index, 'predicted_published_work'] = best_doi

dataset.to_csv('data/predicted_published_work.csv', index=False)
print(dataset.head())
print(dataset.info())

100%|██████████| 50/50 [00:06<00:00,  7.51it/s]


                 SubmissionID  SubmissionYear  \
0             /10.1101/188524            2017   
1             /10.1101/188524            2017   
2             /10.1101/783175            2019   
3  /10.1101/2020.06.30.176537            2020   
4  /10.1101/2020.06.30.176537            2020   

                                     SubmissionTitle  \
0  characterizing highly dynamic conformational s...   
1  characterizing highly dynamic conformational s...   
2  dynamic reconfiguration fragmentation and inte...   
3  attenuated subcomponent vaccine design targeti...   
4  attenuated subcomponent vaccine design targeti...   

                                  SubmissionAbstract firstName middleName  \
0  biomacromolecules carry out complicated functi...     eitan        NaN   
1  biomacromolecules carry out complicated functi...  antonino        NaN   
2  general anesthetics are routinely used to indu...    corson         N.   
3  the novel coronavirus disease covid19 caused b...    onye

In [17]:
# Stage 4 - evaluation.
# Compare the predicted DOI with the row's own 'doi' column, treating each DOI
# as a class label; precision, recall and F1 are macro-averaged.
y_true, y_pred = dataset['doi'], dataset['predicted_published_work']

macro_average = {'average': 'macro'}
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    scorer(y_true, y_pred, **macro_average)
    for scorer in (precision_score, recall_score, f1_score)
)

print("Evaluation Metrics for Our Implementation")
for label, value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{label}: {value}')

Evaluation Metrics for Our Implementation
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
