# Calculating Metrics from Our Linking

todo:
calculating simple metrics
    - how many papers got linked to a technology
    - how many papers per technology on average
    - how often one paper gets linked on average

fixing abstract generation:
    create a list of the papers that haven't been fetched yet, count them, and then only fetch those
    

In [5]:
import os 
import pandas as pd
import requests
import json

In [7]:
# import linking data
links_df = pd.read_csv('../data/linking-data/paper_technology_links.csv')
# Strip the OpenAlex URL prefix from the ids. regex=False makes the pattern a
# literal string; interpreted as a regex, each "." would match any character.
links_df['paper_id'] = links_df['paper_id'].str.replace(
    'https://openalex.org/', '', regex=False
)
# One entry per link row, so an id appears once for every technology it is linked to.
paper_ids = links_df['paper_id'].tolist()

In [None]:



# Download the abstract (as an inverted index) of every linked paper from the
# OpenAlex works endpoint and store each one as its own JSON file.
url = "https://api.openalex.org/works"
mail = os.getenv("MAIL")  # sent as the `mailto` query parameter; None if MAIL is unset
abstracts_dir = "../data/linking-data/abstracts"
os.makedirs(abstracts_dir, exist_ok=True)

# The same paper id can occur several times in paper_ids, so de-duplicate
# (dict.fromkeys keeps first-seen order) and skip every paper that already has
# a file on disk, so that re-running the cell only fetches what is missing.
pending_ids = [
    paper
    for paper in dict.fromkeys(paper_ids)
    if not os.path.exists(os.path.join(abstracts_dir, f"{paper}.json"))
]
print(f"Papers still to fetch: {len(pending_ids)}")

# Identical for every request, so build it once outside the loop.
params = {
    "mailto": mail,
    "select": "abstract_inverted_index"
}

for paper in pending_ids:
    try:
        # Without a timeout a stalled connection would block the loop forever.
        response = requests.get(f"{url}/{paper}", params=params, timeout=30)
    except requests.RequestException as exc:
        # Report and move on; the paper stays "pending" for the next run.
        print(f"Failed to retrieve data for paper {paper}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve data for paper {paper}: {response.status_code}")
        continue

    data = response.json()
    if 'abstract_inverted_index' in data:
        # The value may be None, in which case the file contains JSON `null`.
        with open(os.path.join(abstracts_dir, f"{paper}.json"), "w") as f:
            json.dump(data['abstract_inverted_index'], f)
        

In [10]:
# how many papers without abstract
import glob

# Parse every stored abstract file exactly once, counting the ones whose
# content is JSON null and collecting the paper ids of all the others.
null_abstracts = 0
papers_with_abstracts = []
for file in glob.glob("../data/linking-data/abstracts/*.json"):
    with open(file) as f:
        content = json.load(f)
    if content is None:
        null_abstracts += 1
    else:
        # The file name (minus the .json suffix) is the paper id.
        papers_with_abstracts.append(os.path.basename(file).replace('.json', ''))

# NOTE(review): len(links_df) counts rows of the linking table, whereas
# null_abstracts counts abstract files. If a paper is linked to more than one
# technology these are different units -- confirm before comparing the numbers.
print(f"Total number of papers: {len(links_df)}")
print(f"Number of papers without abstract: {null_abstracts}")

# Remove papers with null abstracts from links_df (rows whose paper has no
# abstract file at all are dropped as well, since they are not in the list).
links_df = links_df[links_df['paper_id'].isin(papers_with_abstracts)]
print(f"Number of papers after removing null abstracts: {len(links_df)}")



Total number of papers: 5150
Number of papers without abstract: 1649
Number of papers after removing null abstracts: 3491


There are 5150 papers that got linked. 1649 of them do not have an abstract, which we need in order to calculate the metrics for clustering. This will be a limitation of our metric. We will still calculate the Silhouette Score and see what we get.

In [8]:
# convert abstracts to text
from tqdm import tqdm

def inv_index(abstraced_index):
    """Rebuild plain text from an inverted index.

    ``abstraced_index`` maps each word to the list of positions at which it
    occurs. The words are returned joined by single spaces in ascending
    position order. If a position is listed under more than one word, the
    word encountered last while iterating the mapping is the one kept.
    """
    word_at = {
        position: word
        for word, positions in abstraced_index.items()
        for position in positions
    }
    # Keys are unique, so sorting the items orders them purely by position.
    return " ".join(word for _, word in sorted(word_at.items()))

# convert all abstracts to text

# Turn every stored inverted index back into text and collect
# [paper_id, abstract] rows; files that contain JSON null are skipped.
abstract_files = glob.glob("../data/linking-data/abstracts/*.json")
abstracts_data = []
for json_path in tqdm(abstract_files):
    with open(json_path) as handle:
        inverted = json.load(handle)
    if inverted is None:
        continue
    # The file name without its .json suffix is the paper id.
    stem = os.path.basename(json_path).replace('.json', '')
    abstracts_data.append([stem, inv_index(inverted)])

# Persist the result so later cells can reload it from disk.
abstracts_df = pd.DataFrame(abstracts_data, columns=['paper_id', 'abstract'])
abstracts_df.to_csv('../data/linking-data/abstracts.csv', index=False)


  0%|          | 0/5096 [00:00<?, ?it/s]

100%|██████████| 5096/5096 [00:02<00:00, 1819.44it/s]


In [9]:
# Reload the abstracts from disk and attach the linking information to them.
# The inner join on paper_id keeps only ids present in both tables.
abstracts_df = pd.read_csv('../data/linking-data/abstracts.csv')
merged_df = abstracts_df.merge(links_df, on='paper_id', how='inner')


In [13]:
# generate embeddings for each abstract
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np
from sentence_transformers import SentenceTransformer

# Drop rows whose abstract is missing. The .copy() makes merged_df an
# independent frame, so the column assignments further down do not write into
# a slice of the previous frame (which raises SettingWithCopyWarning).
merged_df = merged_df[merged_df['abstract'].notnull()].copy()
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
embeddings = model.encode(merged_df['abstract'].tolist(), show_progress_bar=True, batch_size=16)

# Encode every technology name as an integer code; these codes are the
# cluster labels, in the same row order as `embeddings`.
merged_df['tech_code'] = merged_df['technology_name'].astype('category').cat.codes
labels = merged_df['tech_code'].tolist()


Batches:   0%|          | 0/216 [00:00<?, ?it/s]

Now that we have the embeddings, we can calculate some metrics.

In [15]:
# Per-sample silhouette coefficients, computed once and reused for both the
# overall score and the per-technology averages.
sample_sil_vals = silhouette_samples(embeddings, labels)
merged_df['silhouette'] = sample_sil_vals

# silhouette_score() is the mean of the per-sample coefficients, so averaging
# them here yields the same number without repeating the pairwise-distance
# computation.
overall_score = sample_sil_vals.mean()
print(f"Overall silhouette score: {overall_score:.4f}")

# Mean silhouette per technology, sorted from worst to best separated.
avg_sil_by_cluster = merged_df.groupby('technology_name')['silhouette'].mean().sort_values()
print("\nAverage silhouette by technology:")
print(avg_sil_by_cluster)




Overall silhouette score: -0.0612

Average silhouette by technology:
technology_name
3D-Printed Houses Using Local Materials   -0.131299
Immersive-Reality Technologies            -0.129428
Humanoid Working Robots                   -0.128235
The IoT                                   -0.127495
Large Action Models                       -0.124081
                                             ...   
Cybersecurity Mesh Architecture            0.068670
Directed Energy                            0.119186
Advanced Batteries                         0.127111
Metaverse                                  0.127115
6G                                         0.205682
Name: silhouette, Length: 70, dtype: float32
