# Structuring and Cleaning data

### Import required libraries

In [61]:
import pandas as pd
import re
from datetime import datetime
import string
import json
import nltk

In [62]:
try:
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
except:
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))

### Load data

In [63]:
with open('data/abstracts_raw.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)
print(df.shape)
df.head

(30, 12)


<bound method NDFrame.head of                         rel_doi  \
0     10.1101/2025.10.16.682669   
1     10.1101/2025.10.16.682991   
2     10.1101/2025.10.15.682635   
3     10.1101/2025.10.17.682775   
4     10.1101/2025.10.17.683077   
5     10.1101/2025.10.15.682549   
6     10.1101/2025.10.12.681938   
7     10.1101/2025.10.11.681833   
8     10.1101/2025.10.13.682008   
9     10.1101/2025.10.13.682101   
10    10.1101/2025.10.13.682000   
11    10.1101/2025.10.13.681985   
12    10.1101/2025.10.09.681538   
13    10.1101/2025.10.09.680815   
14    10.1101/2025.10.09.681330   
15    10.1101/2025.10.09.681417   
16  10.1101/2025.10.07.25337492   
17  10.1101/2025.10.07.25337497   
18  10.1101/2025.10.07.25337449   
19    10.1101/2025.09.30.679586   
20  10.1101/2025.10.05.25337392   
21  10.1101/2025.10.02.25337223   
22    10.1101/2025.10.03.679858   
23    10.1101/2025.10.03.680419   
24    10.1101/2025.10.03.680220   
25  10.1101/2025.09.27.25336412   
26  10.1101/2025.10.02.25

### Build nodes and edges for knowledge graph

In [64]:
nodes = []

for _, row in df.iterrows():
    doi = row['rel_doi']
    title = row['rel_title']
    site = row.get('rel_site', '')
    date = row.get('rel_date', '')

    nodes.append({
        'id': doi,
        'label': 'Article',
        'title': title,
        'site': site,
        'date': date
    })


#### Dropping duplicates(if any)

In [65]:
nodes_df = pd.DataFrame(nodes).drop_duplicates(subset=['id'])

print(f"Nodes: {len(nodes_df)}")

Nodes: 30


### Export to csv

In [66]:
nodes_df.to_csv('data/entities_extracted.csv', index=False)