In [1]:
# Install libraries
#!pip install rdflib
#!pip install owlrl
# !pip install Faker

In [2]:
import pandas as pd
import numpy as np
import os, gc
import random

#Import Faker
from faker import Faker

#Create faker object
fake = Faker()

# Folder to save all output
savefolder='../output'

# Folder containing all data (csv)
datafolder='../data'

os.chdir(datafolder)
random.seed(123)

# Prep dataframes

In [3]:
def convert_to_dt(df):
    'Converts date columns (inferred from column names) into pandas datetime types'
    for col in [i for i in df.columns if 'date' in i.lower() or i.endswith('dt')]:
        print(f'To date: {col}')
        df[col]=pd.to_datetime(df[col])

def to_camel_case(text):
    """
    Convert string to camel case (no spaces)
    """
    x=[i for i in text]
    return ''.join(sum([],[x[0].upper()]+x[1:]))

In [4]:
# Read authors and merge institution
author=pd.read_csv('authors.csv').merge(pd.read_csv('institutions.csv').rename(columns={'intitutionid':'institutionid',
                                                                                       'name':'institution'}), 
                                        on='institutionid').drop(columns='institutionid').drop(columns=['affiliations','homepage','fake'])
# Add aditional information
author['sex']=pd.Series([random.randint(1, 2) for i in range(len(author))]).map({1:'Female',2:'Male'})
author['birthdate']=[fake.date() for i in range(len(author))]
author['originCountry']=[fake.country() for i in range(len(author))]
# author['paperCount']=author.paperCount.fillna(pd.Series([random.randint(1,100) for i in range(len(author))]))
# author['citationCount']=author.citationCount.fillna(pd.Series([random.randint(1,100) for i in range(len(author))]))
author['hIndex']=author.hIndex.fillna(pd.Series([random.randint(1,10) for i in range(len(author))]))

# # Make sure paper count is less than citation count, otherwise exchange values
# m=author.paperCount<author.citationCount
# author.loc[m, ['paperCount', 'citationCount']] = (
#     author.loc[m, ['citationCount', 'paperCount']].values)

author=author.dropna().reset_index(drop=True).rename(columns={'authorId':'author'})
author.drop(columns=['paperCount','citationCount'], inplace=True)

convert_to_dt(author)

author

To date: birthdate


Unnamed: 0,author,url,name,hIndex,institution,sex,birthdate,originCountry
0,7.265495e+06,https://www.semanticscholar.org/author/7265495,James C. Petrovich,9.0,National Taiwan University,Female,1980-04-05,Jordan
1,3.885358e+06,https://www.semanticscholar.org/author/3885358,H. Tsai,22.0,National Taiwan University,Male,2011-01-20,Puerto Rico
2,4.086158e+07,https://www.semanticscholar.org/author/40861575,Mary K. Twis,6.0,Nisho Gakusha University,Female,1975-01-01,Turks and Caicos Islands
3,2.073785e+09,https://www.semanticscholar.org/author/2073784912,S. Evans,3.0,Hatyai University,Male,2022-02-12,Philippines
4,4.666378e+07,https://www.semanticscholar.org/author/46663785,Jae-Ho Lee,5.0,Hatyai University,Male,1990-08-15,Swaziland
...,...,...,...,...,...,...,...,...
3729,1.860603e+06,https://www.semanticscholar.org/author/1860603,S. Ficici,17.0,Girne American University,Male,2020-12-14,Russian Federation
3730,1.446952e+08,https://www.semanticscholar.org/author/144695234,C. King,23.0,Hochschule fÃ¼r Technik,Female,1999-07-12,Kazakhstan
3731,1.820146e+06,https://www.semanticscholar.org/author/1820146,Tina Tallon,1.0,Winona State University,Female,2019-02-08,Uganda
3732,2.818707e+06,https://www.semanticscholar.org/author/2818707,M. Lagomarsino,22.0,Universidade Estadual Paulista,Male,1989-03-15,Botswana


In [5]:
# Read paper
paper=pd.read_csv('paper.csv').drop(columns=['sha','fake']).rename(columns={'id':'paper'})

# Synthesize new fields
paper['wordcount']=[random.randint(4000,7000) for i in range(len(paper))]
paper['abstract']=paper.abstract.fillna(pd.Series([fake.paragraph() for i in range(len(paper))]))
paper['type']=[random.sample(['short','demo','full','poster'], 1)[0] for i in range(len(paper))]
paper['doi']=[f'http://doi.org/{fake.iana_id()}/{fake.ipv4()}' for i in range(len(paper))]
paper.drop(columns=['url'], inplace=True)

# Merge paper information with conference and journal publication match
paper=pd.concat(
    [
        (pd.read_csv('submitted_to_conference.csv').merge(pd.read_csv('holds.csv').drop(columns=['fake']), on='edition')
         .drop(columns=['fake'])),
        (pd.read_csv('submitted_to_journal.csv').merge(pd.read_csv('volume_of.csv').drop(columns=['fake']), on='volume')
         .drop(columns=['fake']))
    ]
).drop_duplicates().merge(paper, on='paper')

# Fill in null dates
dts=pd.Series([fake.date() for i in range(len(paper))])
paper['published_date']=paper['published_date'].fillna(dts)
paper['submitted_date']=paper['submitted_date'].fillna(dts)

# Unify columns
paper['venue_type']=np.where(paper.conference.notna(), 'Conference', 'Journal')
paper['venue']=paper.conference.fillna(paper.journal)
paper['publication']=paper.edition.fillna(paper.volume)
paper.drop(columns=['edition','conference','volume','journal'], inplace=True)

# Since editions will now be conference conepts, all papers submitted to a conference will use the id of its proceeding as id for the conference
m=paper.venue_type=='Conference'
paper.loc[m, ['venue']] = (
    paper.loc[m, ['publication']].values)

convert_to_dt(paper)

#### PAPER CONSTRAINTS

# Submission date is less then published date
m=paper.submitted_date>paper.published_date
paper.loc[m, ['published_date', 'submitted_date']] = (
    paper.loc[m, ['submitted_date', 'published_date']].values)

# Poster can only be in conference. if not conference, change type
paper.loc[(paper.type=='poster')&(paper.venue_type=='Journal'),
          'type']=pd.Series([random.sample(['short','demo','full'], 1)[0] for i in range(len(paper))])

# Infer publication date from paper published dates
published=paper.groupby(['venue_type','venue','publication']).agg({'published_date':max,'submitted_date':min}).reset_index()
published['published_date']=published[['published_date','submitted_date']].max(axis=1)
published.drop(columns=['submitted_date'], inplace=True)
submitted=paper[['paper','submitted_date','venue_type','venue','publication']].copy()

# Get decision per paper
decision=pd.read_csv('reviews.csv').groupby('paper').agg({'decision':['sum','count']})
decision=((decision.iloc[:,0]/decision.iloc[:,1])>0.5).to_dict()
paper['decision']=paper.paper.map(decision)

# Delete values for non-approved papers based on review decisions
for col in ['published_date','publication','doi']:
    print(col)
    paper.loc[(paper.decision==False)&(paper[col].notna()),[col]]=np.nan

paper.drop(columns=['published_date'], inplace=True)

# Create submission id -- note: submission and paper has a one to one relationship, as stated in the assumptions
paper['submission']='sub-'+paper.paper.astype(int).astype(str)

paper

To date: published_date
To date: submitted_date
published_date
publication
doi


Unnamed: 0,paper,submitted_date,title,abstract,wordcount,type,doi,venue_type,venue,publication,decision,submission
0,2178047,2002-10-07,Applying External Solutions to Organizational ...,Exactly in say this special thank politics.,4170,full,http://doi.org/11736/194.136.172.193,Conference,976927f9-0db0-4946-925d-d113880b67d9-2002,976927f9-0db0-4946-925d-d113880b67d9-2002,True,sub-2178047
1,46711191,2013-09-01,A loop based approach to analytical multi-core...,This paper presents a loop based formulation f...,6931,full,http://doi.org/2511816/104.242.39.30,Conference,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,True,sub-46711191
2,14154659,2012-03-25,Relay selection in multi-user amplify-forward ...,For multi-user (MU) amplify-and-forward (AF) c...,4562,short,http://doi.org/1438288/138.213.79.54,Conference,0d6f7fba-7092-46b3-8039-93458dba736b-2012,0d6f7fba-7092-46b3-8039-93458dba736b-2012,True,sub-14154659
3,14119063,2011-06-10,ID-based proxy re-signcryption scheme,"Combining the idea of signcryption, a proxy re...",6494,full,http://doi.org/8845231/186.177.184.2,Conference,047958df-6384-459e-9864-63f946419551-2011,047958df-6384-459e-9864-63f946419551-2011,True,sub-14119063
4,45069334,2011-07-23,Autonomous control of running takeoff and land...,Arm southern school indicate. Ever if son draw.,6213,demo,http://doi.org/3569879/20.142.47.241,Conference,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,True,sub-45069334
...,...,...,...,...,...,...,...,...,...,...,...,...
4325,257395327,2002-01-01,Institutions. The important forecasts because ...,"Calls this a shower, being heavier than hydrog...",6743,full,http://doi.org/5494054/101.0.101.219,Journal,ffaa4409-29fb-4245-a496-c51b151f9f5f,ffaa4409-29fb-4245-a496-c51b151f9f5f-2002,True,sub-257395327
4326,257395328,2002-01-01,Parent plant and prompted the revolution that ...,Naval dockyards röntgen discovered. With prey ...,6227,short,http://doi.org/2228443/153.187.19.214,Journal,ffaa4409-29fb-4245-a496-c51b151f9f5f,ffaa4409-29fb-4245-a496-c51b151f9f5f-2002,True,sub-257395328
4327,257395329,2020-01-01,Many intersections appellation eclipsed and ev...,Mariners and limited-convective patches. Parts...,4495,demo,,Journal,fff3549c-df24-4aef-accb-a33ae442a828,,False,sub-257395329
4328,257395330,2020-01-01,"And downtown, to Aswan and is now landfilled, ...","Don; williams, hills, mountains.. Learning dis...",4911,short,,Journal,fff3549c-df24-4aef-accb-a33ae442a828,,False,sub-257395330


In [6]:
# Read reviews
review=pd.read_csv('reviews.csv').rename(columns={'reviewerid':'reviewer'})

# Get dates
review=(submitted
 .merge(published, on=['venue_type','venue','publication'], how='outer')
 .merge(review, on=['paper'], how='right')
)
review['reviewDate']=[fake.date_between_dates(j['submitted_date'], j['published_date']) for i,j in review.iterrows()]
review.drop(columns=['venue_type','venue','publication','submitted_date','published_date'], inplace=True)
review['review']=review['paper'].astype(str)+'-'+review['reviewer'].astype(str)

#create submission id instead of paper id 
review['submission']='sub-'+review.paper.astype(int).astype(str)
review.drop(columns=['paper'], inplace=True)

review

Unnamed: 0,reviewer,decision,content,reviewDate,review,submission
0,52331035,1,"Ed. 2009), entire landmass of some kind of mat...",2020-07-01,225065627-52331035,sub-225065627
1,73771487,0,"Fir trees contracting party), traveling in any...",2020-07-01,225065627-73771487,sub-225065627
2,34202459,1,"Louis xiv, russia also concluded alliances tha...",2020-07-01,225065627-34202459,sub-225065627
3,15729050,0,Subsets r. journals request. Mainland north an...,2021-03-27,232355224-15729050,sub-232355224
4,15674973,1,Eastern egyptian are heard. Polar origin monso...,2021-03-28,232355224-15674973,sub-232355224
...,...,...,...,...,...,...
12985,49184152,1,"Masaryk, memorials in elevation.. First countr...",2020-11-19,257395330-49184152,sub-257395330
12986,120634484,0,"City"", the cloud. youtube. Predictions (reason...",2020-12-10,257395330-120634484,sub-257395330
12987,1799398854,1,Commonwealth since government freed. Rail netw...,2020-02-28,257395331-1799398854,sub-257395331
12988,1398510823,0,And sidewalks. mestizo (mixed) of south centra...,2020-01-21,257395331-1398510823,sub-257395331


In [7]:
# Note: using edition as conference title
conference=(pd.read_csv('conference.csv').rename(columns={'id':'conference'})
            .merge(pd.read_csv('holds.csv').drop(columns=['fake']), on=['conference'])
           .merge(pd.read_csv('edition.csv').rename(columns={'id':'edition'}).drop(columns=['fake','conference']), on='edition')
           .rename(columns={'venue':'location'})
            .drop(columns=['url'])
            .rename(columns={'edition':'title','name':'conferenceSeries'})
           .drop_duplicates()
           )
conference['conference']=conference['proceeding'].copy()
conference['title']=conference['year'].astype(str) + ' ' + conference['conferenceSeries']
conference['type']=[random.sample(['workshop', 'symposium', 'expert group','regular'], 1)[0] for i in range(len(conference))]

# SENSE CHECK: Check for conference series with more than one conference -- there is one series with 2 conferences
conference[conference.duplicated(subset=['conferenceSeries'], keep=False)]

# generate more fake fields
conference['issn']=conference.issn.fillna(pd.Series([fake.ssn() for i in range(len(conference))]))
conference['publisher']=[fake.company() for i in range(len(conference))]

# Get published date
conference=(conference.merge(published[published.venue_type=='Conference']
                  .rename(columns={'venue':'conference','publication':'proceeding'}))
            .drop(columns=['venue_type'])
)

# If organizer is not an author, make null then order
conference.loc[~conference.chairperson.isin(author.author),'chairperson']=np.nan
conference.sort_values(['conference','title','chairperson'], inplace=True)

# Separate conference and proceeding: note that there is a one to one correspondence for them
cols=['title','chairperson','location','Start','End','year','conferenceSeries','type']
proceeding=conference.copy()
conference=conference[['conference']+cols].drop_duplicates().reset_index(drop=True).rename(columns={'chairperson':'organizer'})
proceeding.drop(columns=cols, inplace=True)

display(conference, proceeding)

Unnamed: 0,conference,title,organizer,location,Start,End,year,conferenceSeries,type
0,047958df-6384-459e-9864-63f946419551-2011,2011 International Conference on Computer Scie...,,"Federal, Entre Rios, Argentina",2011-06-10,2011-06-10,2011,International Conference on Computer Science a...,regular
1,090b9c90-a659-4e83-90ee-b8d3a161b54c-2020,2020 Research in Equitable and Sustained Parti...,,"Bouna, Zanzan, Ivory Coast",2020-03-10,2020-03-10,2020,Research in Equitable and Sustained Participat...,workshop
2,0abc814b-4b33-4afd-bb15-61351136028f-2014,2014 Mobile und Ubiquitäre Informationssysteme,2.160108e+09,"Kėdainiai, Kauno apskritis, Lithuania",2014-12-01,2014-12-01,2014,Mobile und Ubiquitäre Informationssysteme,symposium
3,0cb0dcea-43f4-4a34-a1c3-ac211434f660-2018,2018 International Conference Computing Method...,4.794920e+07,"Tabaco, Bicol, Philippines",2018-02-01,2018-02-01,2018,International Conference Computing Methodologi...,symposium
4,0d6f7fba-7092-46b3-8039-93458dba736b-2012,2012 IEEE International Conference on Acoustic...,1.410752e+09,"Goianésia, Goiás, Brazil",2012-03-25,2012-03-25,2012,"IEEE International Conference on Acoustics, Sp...",regular
...,...,...,...,...,...,...,...,...,...
61,df9f7819-abf5-46f8-b6a8-6bd3261a21a5-2011,2011 Australasian Telecommunication Networks a...,2.072520e+09,"Zavolzh’ye, Nizjnij Novgorod, Russia",2011-12-08,2011-12-08,2011,Australasian Telecommunication Networks and Ap...,expert group
62,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,2013 Australasian Universities Power Engineeri...,2.132914e+06,"Goiânia, Goiás, Brazil",2013-09-01,2013-09-01,2013,Australasian Universities Power Engineering Co...,workshop
63,e3b1672f-7753-4ca1-8a8b-586d2b4392ba-2020,2020 IEEE International Conference on High Vol...,,"Tandahimba, Mtwara, Tanzania",2020-09-06,2020-09-06,2020,IEEE International Conference on High Voltage ...,symposium
64,e84bb5a1-8f79-42cc-8eb1-3a52f7c73d63-2022,"2022 IEEE International Conference on Systems,...",,"Somerset East, Eastern Cape, South Africa",2022-10-09,2022-10-09,2022,"IEEE International Conference on Systems, Man ...",regular


Unnamed: 0,conference,proceeding,issn,publisher,published_date
3,047958df-6384-459e-9864-63f946419551-2011,047958df-6384-459e-9864-63f946419551-2011,439-34-0669,Washington-Jacobson,2011-06-10
22,090b9c90-a659-4e83-90ee-b8d3a161b54c-2020,090b9c90-a659-4e83-90ee-b8d3a161b54c-2020,808-03-4528,Mack Inc,2020-03-10
52,0abc814b-4b33-4afd-bb15-61351136028f-2014,0abc814b-4b33-4afd-bb15-61351136028f-2014,,"Bennett, Harrell and Wright",2014-12-01
21,0cb0dcea-43f4-4a34-a1c3-ac211434f660-2018,0cb0dcea-43f4-4a34-a1c3-ac211434f660-2018,791-75-6509,"Hamilton, Fox and Porter",2018-02-01
2,0d6f7fba-7092-46b3-8039-93458dba736b-2012,0d6f7fba-7092-46b3-8039-93458dba736b-2012,063-18-7846,Hall-Lewis,2012-03-25
...,...,...,...,...,...
61,df9f7819-abf5-46f8-b6a8-6bd3261a21a5-2011,df9f7819-abf5-46f8-b6a8-6bd3261a21a5-2011,,"Carrillo, Cohen and Miller",2011-12-08
1,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,183-24-1634,Mathis LLC,2013-09-01
37,e3b1672f-7753-4ca1-8a8b-586d2b4392ba-2020,e3b1672f-7753-4ca1-8a8b-586d2b4392ba-2020,,"Schneider, Crawford and Zamora",2020-09-06
56,e84bb5a1-8f79-42cc-8eb1-3a52f7c73d63-2022,e84bb5a1-8f79-42cc-8eb1-3a52f7c73d63-2022,,"Fernandez, Myers and Johnson",2022-10-09


In [8]:
# Note: using volume id as volume as proceeding name
journal=(pd.read_csv('journal.csv').rename(columns={'id':'journal'})
            .merge(pd.read_csv('volume_of.csv').drop(columns=['fake']), on=['journal'])
           .merge(pd.read_csv('volume.csv').drop(columns=['volume']).rename(columns={'id':'volume'}).drop(columns=['fake']), on='volume')
           .drop_duplicates()
         .drop(columns=['url'])
         .rename(columns={'name':'title'})
        )
# Fill in null titles
titles=journal[['journal','title']].drop_duplicates().reset_index(drop=True)
titles.loc[titles.title.isna(), 'title']=pd.Series([i for i in 'Journal of '+fake.word() for i in range(len(titles))])
journal['title']=journal.journal.map(titles.set_index('journal').title.to_dict())
del titles

# Get published date
journal=(journal.merge(published[published.venue_type=='Journal']
                  .rename(columns={'venue':'journal','publication':'volume'}))
            .drop(columns=['venue_type'])
)

# generate more fake fields
journal['issn']=journal.issn.fillna(pd.Series([fake.ssn() for i in range(len(journal))]))
journal['publisher']=[fake.company() for i in range(len(journal))]
journal.drop(columns=['year'], inplace=True)

# If organizer is not an author, make null then order
journal.loc[~journal.editor.isin(author.author),'editor']=np.nan
journal.sort_values(['journal','title','editor'], inplace=True)

# Separate journal and volume
cols=['title','editor']
volume=journal.copy()
journal=journal[['journal']+cols].groupby(['journal','title']).head(1).reset_index(drop=True).rename(columns={'editor':'organizer'})
volume.drop(columns=cols, inplace=True)

display(journal, volume)

Unnamed: 0,journal,title,organizer
0,011a90c7-7490-45ee-ae3e-63c72d01a408,"Auris, nasus, larynx",2.879026e+06
1,011cf95e-1c33-4d8e-84b9-1dd5c95629f6,"Biofactors (Oxford, England)",8.108943e+06
2,0199040d-80b8-461d-b4d3-66056764c7bc,Die Pharmazie,2.770825e+06
3,01c7a86f-47d6-4459-8052-37d94f99a653,Biomedical and Environmental Sciences,5.641527e+06
4,01c93b25-c79d-47c7-8eb6-79ac1e1d685f,Intelligent Automation and Soft Computing,1.451339e+08
...,...,...,...
601,fcf68436-815b-41ff-9912-c435ac687663,Journal of Clinical Psychology in Medical Sett...,4.014742e+07
602,fdeb07d1-a046-4a1a-98aa-167e7f8e667a,Frontiers in Oncology,2.191659e+09
603,ff4eb8fa-170e-4d36-99eb-f00d794ec6e4,Social Service Review,3.155759e+06
604,ffaa4409-29fb-4245-a496-c51b151f9f5f,Journal of pediatric gastroenterology and nutr...,2.926423e+07


Unnamed: 0,journal,issn,volume,published_date,publisher
1207,011a90c7-7490-45ee-ae3e-63c72d01a408,0385-8146,011a90c7-7490-45ee-ae3e-63c72d01a408-2009,2009-12-31,Poole LLC
1206,011a90c7-7490-45ee-ae3e-63c72d01a408,0385-8146,011a90c7-7490-45ee-ae3e-63c72d01a408-36 3,2009-06-01,Lin Ltd
1396,011cf95e-1c33-4d8e-84b9-1dd5c95629f6,0951-6433,011cf95e-1c33-4d8e-84b9-1dd5c95629f6-2022,2022-12-31,Jackson Group
1395,011cf95e-1c33-4d8e-84b9-1dd5c95629f6,0951-6433,011cf95e-1c33-4d8e-84b9-1dd5c95629f6-48,2022-06-20,Petty LLC
456,0199040d-80b8-461d-b4d3-66056764c7bc,0031-7144,0199040d-80b8-461d-b4d3-66056764c7bc-1972,1972-12-31,Church-Garcia
...,...,...,...,...,...
658,ff4eb8fa-170e-4d36-99eb-f00d794ec6e4,0037-7961,ff4eb8fa-170e-4d36-99eb-f00d794ec6e4-39,1965-12-01,"Kelly, Hale and Johnson"
1022,ffaa4409-29fb-4245-a496-c51b151f9f5f,0277-2116,ffaa4409-29fb-4245-a496-c51b151f9f5f-35 5,2002-11-01,Cox LLC
1023,ffaa4409-29fb-4245-a496-c51b151f9f5f,0277-2116,ffaa4409-29fb-4245-a496-c51b151f9f5f-2002,2002-12-31,Schmitt-Meadows
607,fff3549c-df24-4aef-accb-a33ae442a828,2077-1312,fff3549c-df24-4aef-accb-a33ae442a828-2020,2020-12-31,Bender Group


In [9]:
hasAuthor=pd.read_csv('writes.csv').drop(columns=['fake'])
hasAuthor=hasAuthor.dropna().reset_index(drop=True)
hasAuthor=hasAuthor[hasAuthor.paper.isin(paper.paper)]
hasAuthor

Unnamed: 0,author,paper
3,6880566.0,225065627
7,15286113.0,232355224
8,6539599.0,24987797
9,11389934.0,24987797
10,14448741.0,24987797
...,...,...
13568,2151717.0,257395330
13569,49902943.0,257395331
13570,2470346.0,257395331
13571,46271051.0,257395331


In [10]:
# Fill null organizers in journal and conference - make sure they are not authors in the same conference edition/journal
a_list=set(author.author.unique())
for df in ['conference','journal']:
    print(df)
    nulls=globals()[df][globals()[df].organizer.isna()]
    for i in nulls[df]:
        a=set(paper.loc[(paper.venue_type==to_camel_case(df))&(paper.venue==i),
                      ['paper']].merge(hasAuthor, on='paper').author.unique())
        globals()[df].loc[globals()[df][df]==i,'organizer']=np.random.choice(list(a_list-a))
print(conference.organizer.isna().sum(), journal.organizer.isna().sum())

del nulls, a_list

conference
journal
0 0


In [11]:
# Edit Paper/submission
# Get information about the chair/editor that assigned reviewers for that submission
org=pd.concat([conference[['conference','organizer']].assign(venue_type='Conference').rename(columns={'conference':'venue'}),
           journal[['journal','organizer']].assign(venue_type='Journal').rename(columns={'journal':'venue'})]).drop_duplicates()

paper=paper.merge(org, on=['venue_type','venue'], how='left')
paper

Unnamed: 0,paper,submitted_date,title,abstract,wordcount,type,doi,venue_type,venue,publication,decision,submission,organizer
0,2178047,2002-10-07,Applying External Solutions to Organizational ...,Exactly in say this special thank politics.,4170,full,http://doi.org/11736/194.136.172.193,Conference,976927f9-0db0-4946-925d-d113880b67d9-2002,976927f9-0db0-4946-925d-d113880b67d9-2002,True,sub-2178047,3.546367e+07
1,46711191,2013-09-01,A loop based approach to analytical multi-core...,This paper presents a loop based formulation f...,6931,full,http://doi.org/2511816/104.242.39.30,Conference,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,True,sub-46711191,2.132914e+06
2,14154659,2012-03-25,Relay selection in multi-user amplify-forward ...,For multi-user (MU) amplify-and-forward (AF) c...,4562,short,http://doi.org/1438288/138.213.79.54,Conference,0d6f7fba-7092-46b3-8039-93458dba736b-2012,0d6f7fba-7092-46b3-8039-93458dba736b-2012,True,sub-14154659,1.410752e+09
3,14119063,2011-06-10,ID-based proxy re-signcryption scheme,"Combining the idea of signcryption, a proxy re...",6494,full,http://doi.org/8845231/186.177.184.2,Conference,047958df-6384-459e-9864-63f946419551-2011,047958df-6384-459e-9864-63f946419551-2011,True,sub-14119063,1.010559e+07
4,45069334,2011-07-23,Autonomous control of running takeoff and land...,Arm southern school indicate. Ever if son draw.,6213,demo,http://doi.org/3569879/20.142.47.241,Conference,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,True,sub-45069334,2.108384e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4325,257395327,2002-01-01,Institutions. The important forecasts because ...,"Calls this a shower, being heavier than hydrog...",6743,full,http://doi.org/5494054/101.0.101.219,Journal,ffaa4409-29fb-4245-a496-c51b151f9f5f,ffaa4409-29fb-4245-a496-c51b151f9f5f-2002,True,sub-257395327,2.926423e+07
4326,257395328,2002-01-01,Parent plant and prompted the revolution that ...,Naval dockyards röntgen discovered. With prey ...,6227,short,http://doi.org/2228443/153.187.19.214,Journal,ffaa4409-29fb-4245-a496-c51b151f9f5f,ffaa4409-29fb-4245-a496-c51b151f9f5f-2002,True,sub-257395328,2.926423e+07
4327,257395329,2020-01-01,Many intersections appellation eclipsed and ev...,Mariners and limited-convective patches. Parts...,4495,demo,,Journal,fff3549c-df24-4aef-accb-a33ae442a828,,False,sub-257395329,1.402452e+09
4328,257395330,2020-01-01,"And downtown, to Aswan and is now landfilled, ...","Don; williams, hills, mountains.. Learning dis...",4911,short,,Journal,fff3549c-df24-4aef-accb-a33ae442a828,,False,sub-257395330,1.402452e+09


In [12]:
# Convert selected id columns to int64
df_list= %who_ls DataFrame
for df in df_list:
    cols=[i for i in globals()[df].columns if i in ['author', 'organizer', 'paper', 'reviewer']]
    if len(cols)>0:
        print('\n\n=======',df,'=======')
        for col in cols:
            print(col)
            globals()[df][col]=(globals()[df][col]).astype('int64')



author


organizer


author
paper


organizer


organizer


paper
organizer


reviewer


paper


In [13]:
area=pd.read_csv('topic.csv', usecols=['community']).rename(columns={'community':'topicName'}).drop_duplicates().reset_index(drop=True)
area['area']='area-'+area.index.astype(str)
area

Unnamed: 0,topicName,area
0,Pure Science,area-0
1,Applied Science,area-1
2,Social Science,area-2
3,Database,area-3


In [14]:
hasTopic=[]
for df in ['paper','journal','volume','conference','proceeding']:
    hasTopic.append(globals()[df][[df]].rename(columns={df:'id'}).assign(typ=df))
hasTopic=pd.concat(hasTopic, ignore_index=True)
hasTopic['area']=[random.sample(list(area.area.unique()), 1)[0] for i in range(len(hasTopic))]
hasTopic=hasTopic.merge(area, on=['area'])
hasTopic

Unnamed: 0,id,typ,area,topicName
0,2178047,paper,area-2,Social Science
1,14154659,paper,area-2,Social Science
2,227122767,paper,area-2,Social Science
3,8909945,paper,area-2,Social Science
4,30959469,paper,area-2,Social Science
...,...,...,...,...
6517,63d1595c-87bf-4fd8-be2e-327f299cb9ea-2012,proceeding,area-0,Pure Science
6518,66c268f7-ba60-443f-919d-e2c297910401-2016,proceeding,area-0,Pure Science
6519,72a6d50c-86ae-47c7-9a0e-54e5746aacee-2020,proceeding,area-0,Pure Science
6520,7431ff67-91dc-41fa-b322-1b1ca657025f-2022,proceeding,area-0,Pure Science


In [15]:
del org, published, submitted
gc.collect()

0

# Define Graph

In [16]:
# Set up graph
from rdflib import Graph, Namespace, URIRef, Literal, RDF, XSD, FOAF, RDFS

g = Graph()
sdm = Namespace('http://example.org/sdm#')

g.bind("sdm", sdm)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)
g.bind("rdf", RDF)

NS = {
    'sdm': sdm,
    'rdf': RDF,
    'rdfs': RDFS,
    'xsd':XSD,
}

# Define ABOX

In [17]:
import re
from datetime import datetime

# Helper functions
def prepareValue(row, uri=sdm):
    """
    Function that prepares the values to be added to the graph as a URI or Literal
    source: https://wiki.uib.no/info216/index.php/Python_Examples#RDF_programming_with_RDFlib_.28Lab_2.29
    Input: row value 
    Output: Converted URI or literal
    """
    if row == None:  # none type
        value = Literal(row)
    elif (isinstance(row, str) and re.match(r'\d{4}-\d{2}-\d{2}', row)) or isinstance(row, datetime):  # date
        value = Literal(row, datatype=XSD.date)
    elif isinstance(row, bool):  # boolean value (true / false)
        value = Literal(row, datatype=XSD.boolean)
    elif isinstance(row, int):  # integer
        value = Literal(row, datatype=XSD.integer)
    elif isinstance(row, str):  # string
#         # Use this if saving as URI
#         pattern='^((http|https)://)[-a-zA-Z0-9@:%._\\+~#?&//=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%._\\+~#?&//=]*)$'
#         value=URIRef(uri+re.sub(pattern,'_',row.replace('\n','_').replace('\t','_').replace(",", "").replace("-", "_").replace('"', '').replace('\\', '')).replace(" ",'_') )
        # if normal string
        value = Literal(row, datatype=XSD.string)
    elif isinstance(row, float):  # float
        value = Literal(row, datatype=XSD.float)
    return value

In [18]:
# General cLean up of all dfs
df_list= %who_ls DataFrame
for df in df_list:
    print(df)
    # replace nulls with None
    globals()[df]=globals()[df].replace(np.nan, None)
    
    # Make all date columns into datetime
    dcols=[i for i in globals()[df].columns if 'date' in i.lower() or i.endswith('_dt')]
    for col in dcols:
        globals()[df][col]=pd.to_datetime(globals()[df][col])

area
author
conference
hasAuthor
hasTopic
journal
paper
proceeding
review
volume


In [19]:
# Convert the non-semantic CSV dataset into a semantic RDF
def area_to_rdf(df):
    """
    Concepts: Area
    """
    for index, row in df.iterrows():
        id = URIRef(sdm + "Area_" + str(row['area']))
        name = prepareValue(row["topicName"])
        
        # Adds the triples
#         g.add((id, RDF.type, sdm.Area))
        g.add((id, sdm.hasTopicName, name))
        
    print('Done: Area')
        
def author_to_rdf(df):
    """
    Concepts: Person, Author
    """
    for index, row in df.iterrows():
        # define values
        #id = URIRef(sdm + "Person_" + str(row['author']))
        id = URIRef(sdm + "Author_" + str(row['author']))
        name = prepareValue(row["name"])
        birthdate = prepareValue(row["birthdate"])
        sex = prepareValue(row["sex"])
        country = prepareValue(row["originCountry"])
        
        # Adds the triples
#         g.add((id, RDF.type, sdm.Author))
        g.add((id, sdm.hasPersonName, name))
        g.add((id, sdm.hasBirthDate, birthdate))
        g.add((id, sdm.hasSex, sex))
        g.add((id, sdm.originCountry, country))
        
        # Author
        #id = URIRef(sdm + "Author_" + str(row['author']))
        url = prepareValue(row["url"])
        hindex = prepareValue(row["hIndex"])
        institution = prepareValue(row["institution"])
                
        # Adds the triples
        g.add((id, sdm.url, url))
        g.add((id, sdm.hasHIndex, hindex))
        g.add((id, sdm.affiliatedWithInstitution, institution))
    print('Done: Author')

def conference_to_rdf(df):
    """
    Concepts: Conference
    Relationships: hasOrganizer
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Conference_" + str(row['conference']))
        conf_type={'expert group':sdm.ExpertGroup, 'symposium':sdm.Symposium, 
                   'workshop':sdm.Workshop, 'regular':sdm.RegularConference}[row['type']]
        title = prepareValue(row["title"])
        location = prepareValue(row["location"])
        start = prepareValue(row["Start"])
        end = prepareValue(row["End"])
        year = prepareValue(row["year"])
        conferenceSeries = prepareValue(row["conferenceSeries"])
        
        # Adds the triples 
        g.add((id, RDF.type, conf_type))
        g.add((id, sdm.hasVenueTitle, title))
        g.add((id, sdm.heldIn, location))
        g.add((id, sdm.startDate, start))
        g.add((id, sdm.endDate, end))
        g.add((id, sdm.heldInYear, year))
        g.add((id, sdm.conferenceSeries, conferenceSeries))
        
        # Relationships
        author_org=URIRef(sdm + "Author_" + str(row['organizer']))
        #author_org=URIRef(sdm + str(row['organizer']))
        
        # Adds the triples
        g.add((author_org, RDF.type, sdm.Chair))
        g.add((id, sdm.hasOrganizer, author_org))
    print('Done: Conference')
        
def journal_to_rdf(df):
    """
    Concepts: Journal
    Relationships: hasOrganizer
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Journal_" + str(row['journal']))
        title = prepareValue(row["title"])
        
        # Adds the triples 
        g.add((id, RDF.type, sdm.Journal))
        g.add((id, sdm.hasVenueTitle, title))
        
        # Relationships
        author_org=URIRef(sdm + "Author_" + str(row['organizer']))
        #author_org=URIRef(sdm + str(row['organizer']))

        
        # Adds the triples
        g.add((author_org, RDF.type, sdm.Editor))
        g.add((id, sdm.hasOrganizer, author_org))
    print('Done: Journal')

def volume_to_rdf(df):
    """
    Concepts: Volume
    Relationships: hasPublished
    NOTE: Used Venue > Publication relationship. note that Volume URIs replace spaces with _
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Volume_" + str(row['volume']).replace(' ','_'))
        jid= URIRef(sdm + "Journal_" + str(row['journal']))
        issn = prepareValue(row["issn"])
        published_date = prepareValue(row["published_date"])
        publisher = prepareValue(row["publisher"])
        
        # Adds the triples
        g.add((id, RDF.type, sdm.Volume))
        g.add((id, sdm.publicationIssn, issn))
        g.add((id, sdm.publishedDate, published_date))
        g.add((id, sdm.publisher, publisher))
        
        # Relationship
        g.add((jid, sdm.hasPublished, id))
    print('Done: Volume')

def proceeding_to_rdf(df):
    """
    Concepts: proceeding
    Relationships: hasPublished
    NOTE: Used Venue > Publication relationship
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Proceeding_" + str(row['proceeding']))
        cid= URIRef(sdm + "Conference_" + str(row['conference']))
        issn = prepareValue(row["issn"])
        published_date = prepareValue(row["published_date"])
        publisher = prepareValue(row["publisher"])
        
        # Adds the triples
        g.add((id, RDF.type, sdm.Proceeding)) 
        g.add((id, sdm.publicationIssn, issn))
        g.add((id, sdm.publishedDate, published_date))
        g.add((id, sdm.publisher, publisher))
        
        # Relationship
        g.add((cid, sdm.hasPublished, id))
    print('Done: Proceeding')

def paper_to_rdf(df):
    """
    Concepts: Paper, Submission
    Relationships: includedIn, publishedIn, assignedBy, submittedTo
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Paper_" + str(row['paper']))
        sid = URIRef(sdm + "Submission_" + str(row['submission']))
        oid=URIRef(sdm + 'Author_' + str(row['organizer']))
        vid=URIRef(sdm + row['venue_type']+'_' + str(row['venue']))
        paper_type={'demo':sdm.DemoPaper, 'full':sdm.FullPaper, 'short':sdm.ShortPaper, 'poster':sdm.Poster}[row['type']]
        
        for col in df.columns:
            locals()[col]=prepareValue(row[col])
            
        # Paper properties
        g.add((id,RDF.type, paper_type))
        g.add((id,sdm.paperAbstract, locals()['abstract']))
        g.add((id,sdm.paperTitle, locals()['title']))
        g.add((id,sdm.paperWordCount, locals()['wordcount']))

        # Submission properties
        g.add((sid,sdm.submissionDate, locals()['submitted_date']))

        # Relationships
        g.add((id,sdm.includedIn,sid))
        g.add((sid, sdm.assignedBy, oid))
        g.add((sid, sdm.submittedTo, vid))
        
        # Conditional property and relationship, only add if paper decision is true (published)
        if row['decision']:
            pid=URIRef(sdm + {'Conference':'Proceeding_','Journal':'Volume_'}[row['venue_type']] + str(row['publication']).replace(' ','_'))
            g.add((id,sdm.paperDOI, locals()['doi']))
            if row['type']!='poster':
                g.add((id,sdm.publishedIn,pid))
            else:
                g.add((id,sdm.posterPublishedIn,pid))
    print('Done: Paper')

def review_to_rdf(df):
    """
    Concepts: Review
    Relationships: hasReviewer, hasReview
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Review_" + str(row['review']))
        sid = URIRef(sdm + "Submission_" + str(row['submission']))
        rid=URIRef(sdm + 'Author_' + str(row['reviewer']))
        #rid=URIRef(sdm + str(row['reviewer']))
        
        for col in df.columns:
            locals()[col]=prepareValue(row[col])
            
        # Paper properties
        g.add((id,sdm.decision, locals()['decision']))
        g.add((id,sdm.content, locals()['content']))
        g.add((id,sdm.reviewDate, locals()['reviewDate']))

        # Relationships
        g.add((id,sdm.hasReviewer,rid))
        g.add((sid, sdm.hasReview, id))
    print('Done: Review')
        
def hasauthor_to_rdf(df):
    """
    Relationships: hasAuthor
    """
    for index, row in df.iterrows():
        # define values
        pid = URIRef(sdm + "Paper_" + str(row['paper']))
        aid = URIRef(sdm + "Author_" + str(row['author']))
        #aid = URIRef(sdm + str(row['author']))

        # Relationships
        g.add((pid,sdm.hasAuthor,aid))
    print('Done: hasAuthor')
        
def hastopic_to_rdf(df):
    """
    Relationships: paperRelatedTo, venueRelatedTo, publicationRelatedTo
    """
    for index, row in df.iterrows():
        # define values
        pid = URIRef(sdm + to_camel_case(row['typ'])+'_' + str(row['id']).replace(' ','_'))
        aid = URIRef(sdm + "Area_" + str(row['area']))
        rel={'paper':sdm.paperRelatedTo, 'journal':sdm.venueRelatedTo, 'volume':sdm.publicationRelatedTo, 
             'conference':sdm.venueRelatedTo, 'proceeding':sdm.publicationRelatedTo}[row['typ']]

        # Relationships
        g.add((pid,rel,aid))
        
    print('Done: hasTopic')

In [20]:
area_to_rdf(area)
author_to_rdf(author)
conference_to_rdf(conference)
journal_to_rdf(journal)
volume_to_rdf(volume)
proceeding_to_rdf(proceeding)
paper_to_rdf(paper) 
review_to_rdf(review) 
hasauthor_to_rdf(hasAuthor)
hastopic_to_rdf(hasTopic)

Done: Area
Done: Author
Done: Conference
Done: Journal
Done: Volume
Done: Proceeding
Done: Paper
Done: Review
Done: hasAuthor
Done: hasTopic


In [21]:
# Get total id counts
for df in df_list:
    cols=globals()[df].columns
    if df in cols:
        print(f'{df}: {globals()[df][df].nunique()}')
    if 'type' in cols:
        print(f'{df} types:\n', globals()[df]['type'].value_counts().to_frame(),'\n')
    if 'organizer' in cols:
        print(f'{df} organizer: {globals()[df]["organizer"].nunique()}')
print('Total unique organizers:',pd.concat([conference['organizer'], journal['organizer']]).nunique())

area: 4
author: 3734
conference: 66
conference types:
               type
workshop        20
symposium       18
regular         17
expert group    11 

conference organizer: 61
journal: 606
journal organizer: 547
paper: 4330
paper types:
         type
full    1452
demo    1435
short   1421
poster    22 

paper organizer: 599
proceeding: 66
review: 12990
volume: 1454
Total unique organizers: 599


# Export

In [26]:
os.chdir(savefolder)
g.serialize(destination='output_abox.rdf',format="xml")

<Graph identifier=Nb22aeae41675490f97b71ca2b02ee9f9 (<class 'rdflib.graph.Graph'>)>