In [21]:
import datetime as dt
import io
import json
import os
import warnings
from time import sleep

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bertopic import BERTopic
from tqdm import tqdm
from ydata_profiling import ProfileReport

## Retrieve articles from the Springer Nature Meta API

In [8]:
def get_articles(start):
    """Fetch one page of article metadata from the Springer Nature Meta API.

    The query selects open-access, English-language journal content with the
    keyword "COVID-19" and an online date between 2022-07-01 and 2022-12-31.

    The API key is read from the ``SPRINGER_API_KEY`` environment variable so
    that the credential is not stored in the notebook.

    Parameters
    ----------
    start : int
        Value sent as the ``s`` query parameter (index of the first record of
        the requested page; pages hold up to ``p`` = 100 records).

    Returns
    -------
    list
        The ``records`` list of the JSON response, or an empty list when the
        response body is not JSON or has no ``records`` key.

    Raises
    ------
    RuntimeError
        If ``SPRINGER_API_KEY`` is not set.
    requests.exceptions.RequestException
        Network errors and timeouts are not caught here.
    """
    # HTTPS so the API key is not sent in clear text.
    base_url = "https://api.springernature.com/meta/v2/json"

    api_key = os.environ.get("SPRINGER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the SPRINGER_API_KEY environment variable to your Springer Nature API key."
        )

    parameters = {
        "q": 'keyword:COVID-19 onlinedatefrom:2022-07-01 onlinedateto:2022-12-31 openaccess:true language:en type:Journal',
        "p": 100,
        "s": start,
        "api_key": api_key,
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(base_url, params=parameters, timeout=30)
    try:
        content = json.loads(response.content)
        records = content["records"]
    except (ValueError, KeyError, TypeError) as error:
        # Keep the best-effort behaviour (empty page), but make the gap visible
        # instead of silently dropping up to 100 records.
        warnings.warn(
            f"No records parsed for start={start} "
            f"(HTTP status {response.status_code}): {error!r}"
        )
        records = []
    return records

In [39]:
# Page through the result set: range(1, 2023, 100) yields the 21 start
# offsets 1, 101, ..., 2001, each passed to get_articles().
records = []
for start in tqdm(range(1, 2023, 100)):
    page = get_articles(start)
    records.extend(page)
    # Short pause between consecutive requests (presumably to respect the
    # API rate limit — TODO confirm the actual limit).
    sleep(0.3)

100%|██████████| 21/21 [00:51<00:00,  2.43s/it]


In [40]:
# Number of article records collected by the paging loop.
len(records)

2023

In [41]:
# Persist the fetched records unchanged as JSON so later steps can be
# re-run without calling the API again.
with open("../data/raw/cortoviz_evaluation/articles.json", "w") as raw_file:
    json.dump(records, fp=raw_file)

In [42]:
# Build the DataFrame by round-tripping the records through JSON, so that
# pandas' read_json type inference is applied to the columns.
# The JSON text is wrapped in StringIO because passing a literal JSON string
# to read_json is deprecated in recent pandas versions.
df = pd.read_json(io.StringIO(json.dumps(records)))
df.head()

Unnamed: 0,contentType,identifier,language,url,title,creators,publicationName,openaccess,doi,publisher,...,journalId,printDate,onlineDate,coverDate,copyright,abstract,conferenceInfo,keyword,subjects,disciplines
0,Article,doi:10.1007/s11116-022-10329-1,en,"[{'format': 'html', 'platform': 'web', 'value'...",A bifurcation of the peak: new patterns of tra...,"[{'ORCID': '0000-0002-7247-2563', 'creator': '...",Transportation,True,10.1007/s11116-022-10329-1,Springer,...,11116,2024-03-06,2022-09-09,2024-04,©2022 The Author(s),This paper analyzes the emergence of two well-...,[],"[COVID-19, Traffic flow, Diurnal curve, Mornin...","[Economics, Regional/Spatial Science, Economic...","[{'id': '4667', 'term': 'Regional and Spatial ..."
1,Article,doi:10.1007/s00508-022-02127-7,en,"[{'format': 'html', 'platform': 'web', 'value'...",Ethics teaching in medical school: the percept...,"[{'ORCID': '0000-0002-1824-0024', 'creator': '...",Wiener klinische Wochenschrift,True,10.1007/s00508-022-02127-7,Springer,...,508,2024-03-13,2022-12-22,2024-03,©2022 The Author(s),"Background In times of a pandemic, morals and ...",[],"[Medical ethics, Healthcare ethics, Ethics edu...","[Medicine & Public Health, Medicine/Public Hea...","[{'id': '2977', 'term': 'Public Health'}, {'id..."
2,Article,doi:10.1007/s40653-022-00509-7,en,"[{'format': 'html', 'platform': 'web', 'value'...",The Effect of School Bullying on Pupils’ Perce...,"[{'creator': 'Kirkham, Elizabeth J.'}, {'creat...",Journal of Child & Adolescent Trauma,True,10.1007/s40653-022-00509-7,Springer,...,40653,2024-03-13,2022-12-22,2024-03,©2022 The Author(s),Purpose: Establishing how the Covid-19 pandemi...,[],"[Bullying, Covid-19, Adolescents, School, Lock...","[Psychology, Child and School Psychology, Soci...","[{'id': '7646', 'term': 'School Psychology'}, ..."
3,Article,doi:10.1007/s11109-022-09842-x,en,"[{'format': 'html', 'platform': 'web', 'value'...",The Unequal Effects of the COVID-19 Pandemic o...,"[{'ORCID': '0000-0003-0184-0055', 'creator': '...",Political Behavior,True,10.1007/s11109-022-09842-x,Springer,...,11109,2024-02-05,2022-12-31,2024-03,©2022 The Author(s),The COVID-19 pandemic is viewed by many as the...,[],"[Political representation, Political behavior,...",[Political Science and International Relations...,"[{'id': '3115', 'term': 'Political Science'}, ..."
4,Article,doi:10.1007/s10940-022-09564-7,en,"[{'format': 'html', 'platform': 'web', 'value'...",The Effect of COVID-19 Restrictions on Routine...,"[{'ORCID': '0000-0002-0184-9896', 'creator': '...",Journal of Quantitative Criminology,True,10.1007/s10940-022-09564-7,Springer,...,10940,2024-02-28,2022-12-08,2024-03,©2022 The Author(s),Objectives Routine activity theory suggests th...,[],"[Online fraud, Online hacking, Doorstep fraud,...","[Criminology and Criminal Justice, Criminology...","[{'id': '2945', 'term': 'Criminology'}, {'id':..."


In [43]:
# Write the freshly built DataFrame to the raw data folder as CSV, with a header row.
df.to_csv("../data/raw/cortoviz_evaluation/articles.csv",header=True)

## Topic inference

In [23]:
# Load the saved BERTopic model from disk, passing the name of the sentence
# embedding model to use for new documents.
topic_model = BERTopic.load("../models/BERTopic_full_2023-04-18", embedding_model="pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")

In [44]:
# Text passed to the topic model: "<title>. <abstract>".
# Missing titles/abstracts are replaced by empty strings; otherwise a single
# NaN would turn the whole concatenated value into NaN.
df["full"] = df["title"].fillna("") + ". " + df["abstract"].fillna("")

In [45]:
# Run the loaded model on the combined title + abstract text. The result is
# indexed below as [0] (stored as topic_id) and [1] (stored as topic_prob).
predicted_topics = topic_model.transform(df.full)

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

In [47]:
# First element of the transform() result: one topic id per document.
df["topic_id"] = predicted_topics[0]

In [48]:
# Second element of the transform() result, stored as the topic probability.
df["topic_prob"] = predicted_topics[1]

In [51]:
# Size of the subset that received a real topic (-1 is BERTopic's outlier
# topic). Only the shape is shown rather than rendering the whole frame.
df[df.topic_id != -1].shape

(555, 35)

In [None]:
# NOTE(review): this cell was never executed (In [None]) and no visible cell
# assigns a "topic_info" column, so this lookup raises KeyError on a fresh
# Restart & Run All. Complete the intended assignment or remove the cell.
df["topic_info"] 

In [76]:
def print_info(position):
    """Print a document's text together with the details of its predicted topic.

    Output, one item per line: a "----" separator, the document's combined
    title + abstract text, the topic name, the topic's document count, and
    the value returned by ``topic_model.get_topic`` for that topic.

    Parameters
    ----------
    position : int
        Zero-based row position of the document, i.e. its position in
        ``predicted_topics[0]`` and in ``df``.
    """
    predicted = predicted_topics[0][position]
    # Query the topic table once and reuse it for both name and count.
    info = topic_model.get_topic_info(predicted)
    print("----")
    # predicted_topics is positional, so read the text positionally as well;
    # label-based .loc only matches while df keeps its default 0..n-1 index.
    print(df.full.iloc[position])
    # .iloc[0] takes the first returned row regardless of its index label.
    print(info.Name.iloc[0])
    print(info.Count.iloc[0])
    print(topic_model.get_topic(predicted))

In [77]:
# Example: show the text and predicted-topic details for the document at position 11.
print_info(11)

----
Link between structural risk factors for adverse impacts of COVID-19 and food insecurity in developed and developing countries. COVID-19 has had serious consequences for world food security; lockdowns and social distancing have led to changes in global food value chains, primarily affecting the poorest of the planet. The aim of this research is to analyse the relationship between food insecurity and the structural risk factors for adverse impacts of COVID-19. To that end, 12 contingency tables are constructed to identify the association between the pillars of the food insecurity index and the INFORM COVID-19 Risk Index. We use the Gamma coefficient as a measure of association. In addition, this paper proposes a synthetic index produced by applying the TOPSIS method, using the pillars of the two aforementioned indices (criteria) to establish a ranking of 112 countries (alternatives) ordered from highest to lowest risk faced in the key year of the pandemic, 2020. The results show th

In [78]:
# Build the topic table once and use it as a lookup keyed by topic id,
# instead of calling get_topic_info() twice for every row.
topic_info = topic_model.get_topic_info().set_index("Topic")
df["topic_name"] = df["topic_id"].map(topic_info["Name"])
df["topic_count"] = df["topic_id"].map(topic_info["Count"])
# Terms of each row's topic, as returned by get_topic().
df["topic_terms"] = df["topic_id"].map(topic_model.get_topic)

In [79]:
# Preview the first rows, now including the topic columns added above.
df.head()

Unnamed: 0,contentType,identifier,language,url,title,creators,publicationName,openaccess,doi,publisher,...,conferenceInfo,keyword,subjects,disciplines,full,topic_id,topic_prob,topic_name,topic_count,topic_terms
0,Article,doi:10.1007/s11116-022-10329-1,en,"[{'format': 'html', 'platform': 'web', 'value'...",A bifurcation of the peak: new patterns of tra...,"[{'ORCID': '0000-0002-7247-2563', 'creator': '...",Transportation,True,10.1007/s11116-022-10329-1,Springer,...,[],"[COVID-19, Traffic flow, Diurnal curve, Mornin...","[Economics, Regional/Spatial Science, Economic...","[{'id': '4667', 'term': 'Regional and Spatial ...",A bifurcation of the peak: new patterns of tra...,-1,0.0,-1_cohort_vaccines_epidemic_pneumonia,247546,"[(cohort, 0.3572387), (vaccines, 0.3052731), (..."
1,Article,doi:10.1007/s00508-022-02127-7,en,"[{'format': 'html', 'platform': 'web', 'value'...",Ethics teaching in medical school: the percept...,"[{'ORCID': '0000-0002-1824-0024', 'creator': '...",Wiener klinische Wochenschrift,True,10.1007/s00508-022-02127-7,Springer,...,[],"[Medical ethics, Healthcare ethics, Ethics edu...","[Medicine & Public Health, Medicine/Public Hea...","[{'id': '2977', 'term': 'Public Health'}, {'id...",Ethics teaching in medical school: the percept...,-1,0.0,-1_cohort_vaccines_epidemic_pneumonia,247546,"[(cohort, 0.3572387), (vaccines, 0.3052731), (..."
2,Article,doi:10.1007/s40653-022-00509-7,en,"[{'format': 'html', 'platform': 'web', 'value'...",The Effect of School Bullying on Pupils’ Perce...,"[{'creator': 'Kirkham, Elizabeth J.'}, {'creat...",Journal of Child & Adolescent Trauma,True,10.1007/s40653-022-00509-7,Springer,...,[],"[Bullying, Covid-19, Adolescents, School, Lock...","[Psychology, Child and School Psychology, Soci...","[{'id': '7646', 'term': 'School Psychology'}, ...",The Effect of School Bullying on Pupils’ Perce...,-1,0.0,-1_cohort_vaccines_epidemic_pneumonia,247546,"[(cohort, 0.3572387), (vaccines, 0.3052731), (..."
3,Article,doi:10.1007/s11109-022-09842-x,en,"[{'format': 'html', 'platform': 'web', 'value'...",The Unequal Effects of the COVID-19 Pandemic o...,"[{'ORCID': '0000-0003-0184-0055', 'creator': '...",Political Behavior,True,10.1007/s11109-022-09842-x,Springer,...,[],"[Political representation, Political behavior,...",[Political Science and International Relations...,"[{'id': '3115', 'term': 'Political Science'}, ...",The Unequal Effects of the COVID-19 Pandemic o...,-1,0.0,-1_cohort_vaccines_epidemic_pneumonia,247546,"[(cohort, 0.3572387), (vaccines, 0.3052731), (..."
4,Article,doi:10.1007/s10940-022-09564-7,en,"[{'format': 'html', 'platform': 'web', 'value'...",The Effect of COVID-19 Restrictions on Routine...,"[{'ORCID': '0000-0002-0184-9896', 'creator': '...",Journal of Quantitative Criminology,True,10.1007/s10940-022-09564-7,Springer,...,[],"[Online fraud, Online hacking, Doorstep fraud,...","[Criminology and Criminal Justice, Criminology...","[{'id': '2945', 'term': 'Criminology'}, {'id':...",The Effect of COVID-19 Restrictions on Routine...,-1,0.0,-1_cohort_vaccines_epidemic_pneumonia,247546,"[(cohort, 0.3572387), (vaccines, 0.3052731), (..."


In [85]:
# "value" of the first entry in each record's URL list. Records whose URL
# list is empty or missing get None instead of raising an exception.
df["url_web"] = df.url.map(
    lambda urls: urls[0]["value"] if isinstance(urls, list) and urls else None
)

In [90]:
# Write the DataFrame, including the columns added above, to the interim data folder as CSV.
df.to_csv("../data/interim/cortoviz_evaluation/articles.csv",header=True)

In [82]:
# List the column names currently present in df.
df.columns

Index(['contentType', 'identifier', 'language', 'url', 'title', 'creators',
       'publicationName', 'openaccess', 'doi', 'publisher', 'publisherName',
       'publicationDate', 'publicationType', 'issn', 'eIssn', 'volume',
       'number', 'issueType', 'topicalCollection', 'genre', 'startingPage',
       'endingPage', 'journalId', 'printDate', 'onlineDate', 'coverDate',
       'copyright', 'abstract', 'conferenceInfo', 'keyword', 'subjects',
       'disciplines', 'full', 'topic_id', 'topic_prob', 'topic_name',
       'topic_count', 'topic_terms'],
      dtype='object')

In [89]:
# Columns included in the evaluation spreadsheet.
evaluation_columns = [
    "doi", "url_web", "title", "abstract", "creators", "onlineDate",
    "publicationName", "keyword", "subjects",
    "topic_id", "topic_prob", "topic_name", "topic_terms",
]
# Draw 50 articles that received a real topic (topic_id != -1). The fixed
# random_state makes the exported sample identical on every re-run.
evaluation_sample = df.loc[df.topic_id != -1, evaluation_columns].sample(50, random_state=42)
evaluation_sample.to_excel(
    "../data/interim/cortoviz_evaluation/articles_test.xlsx",
    header=True,
    index=False,
)

In [100]:
# Turn the mapping returned by get_topics() into a table: with orient="index"
# each dict key becomes a row label (presumably the topic id — confirm
# against the BERTopic version in use).
df_topics = pd.DataFrame.from_dict(topic_model.get_topics(), orient="index")

In [103]:
# Export every real topic. Only the outlier row (-1) is excluded, using the
# same "!= -1" convention as the article filters in this notebook; a label
# slice such as .loc[1:] would drop topic 0 as well.
df_topics[df_topics.index != -1].to_excel("../data/interim/cortoviz_evaluation/topics.xlsx")