In [None]:
# Install libraries
#!pip install rdflib
#!pip install owlrl
# !pip install Faker

In [46]:
import pandas as pd
import numpy as np
import os, gc
import random

#Import Faker
from faker import Faker

#Create faker object
fake = Faker()

# Folder to save all output
savefolder='../output'

# Folder containing all data (csv)
datafolder='../data'

os.chdir(datafolder)

# Prep dataframes

In [47]:
def convert_to_dt(df):
    'Converts date columns (inferred from column names) into pandas datetime types'
    for col in [i for i in df.columns if 'date' in i.lower() or i.endswith('dt')]:
        print(f'To date: {col}')
        df[col]=pd.to_datetime(df[col])

In [48]:
# Read authors and merge institution
author=pd.read_csv('authors.csv').merge(pd.read_csv('institutions.csv').rename(columns={'intitutionid':'institutionid',
                                                                                       'name':'institution'}), 
                                        on='institutionid').drop(columns='institutionid').drop(columns=['affiliations','homepage','fake'])
# Add aditional information
author['sex']=pd.Series([random.randint(1, 2) for i in range(len(author))]).map({1:'Female',2:'Male'})
author['birthdate']=[fake.date() for i in range(len(author))]
author['originCountry']=[fake.country() for i in range(len(author))]
author['paperCount']=author.paperCount.fillna(pd.Series([random.randint(1,100) for i in range(len(author))]))
author['citationCount']=author.citationCount.fillna(pd.Series([random.randint(1,100) for i in range(len(author))]))
author['hIndex']=author.hIndex.fillna(pd.Series([random.randint(1,10) for i in range(len(author))]))

# Make sure paper count is less than citation count, otherwise exchange values
m=author.paperCount<author.citationCount
author.loc[m, ['paperCount', 'citationCount']] = (
    author.loc[m, ['citationCount', 'paperCount']].values)

author=author.dropna().reset_index(drop=True).rename(columns={'authorId':'author'})
author.drop(columns=['paperCount','citationCount'], inplace=True)

convert_to_dt(author)

author

To date: birthdate


Unnamed: 0,author,url,name,hIndex,institution,sex,birthdate,originCountry
0,7.265495e+06,https://www.semanticscholar.org/author/7265495,James C. Petrovich,9.0,National Taiwan University,Male,2006-08-01,Qatar
1,3.885358e+06,https://www.semanticscholar.org/author/3885358,H. Tsai,22.0,National Taiwan University,Male,2018-10-09,Myanmar
2,4.086158e+07,https://www.semanticscholar.org/author/40861575,Mary K. Twis,6.0,Nisho Gakusha University,Male,1992-02-24,Vietnam
3,2.073785e+09,https://www.semanticscholar.org/author/2073784912,S. Evans,3.0,Hatyai University,Female,2023-03-24,Saint Barthelemy
4,4.666378e+07,https://www.semanticscholar.org/author/46663785,Jae-Ho Lee,5.0,Hatyai University,Male,2018-08-30,Saint Kitts and Nevis
...,...,...,...,...,...,...,...,...
4032,8.076094e+08,http://to.us.png,Canymb Duand,25.0,Universidad Privada del Este,Male,2001-07-15,American Samoa
4033,8.229014e+08,http://inch.com/peral/The/In/itentr.by-Brache4...,For Applas,28.0,Universidade Ibirapuera,Male,1994-03-12,United States of America
4034,8.472713e+08,https://alic.win.us/the/and/to/as.asinit-orted...,Tent Se,5.0,Cankaya University,Male,1987-12-19,Portugal
4035,8.560731e+08,http://in.biz/no.Nover18-lanati-coll,Ind Anathe,63.0,Electronics and Automation (Technical Univers...,Female,2017-06-25,Vanuatu


In [49]:
# Read paper
paper=pd.read_csv('paper.csv').drop(columns=['sha','fake']).rename(columns={'id':'paper'})

# Synthesize new fields
paper['wordcount']=[random.randint(4000,7000) for i in range(len(paper))]
paper['abstract']=paper.abstract.fillna(pd.Series([fake.paragraph() for i in range(len(paper))]))
paper['type']=[random.sample(['short','demo','full','poster'], 1)[0] for i in range(len(paper))]
paper['doi']=[f'http://doi.org/{fake.iana_id()}/{fake.ipv4()}' for i in range(len(paper))]
paper.drop(columns=['url'], inplace=True)

# Merge paper information with conference and journal publication match
paper=pd.concat(
    [
        (pd.read_csv('submitted_to_conference.csv').merge(pd.read_csv('holds.csv').drop(columns=['fake']), on='edition')
         .drop(columns=['fake'])),
        (pd.read_csv('submitted_to_journal.csv').merge(pd.read_csv('volume_of.csv').drop(columns=['fake']), on='volume')
         .drop(columns=['fake']))
    ]
).drop_duplicates().merge(paper, on='paper')

# Fill in null dates
dts=pd.Series([fake.date() for i in range(len(paper))])
paper['published_date']=paper['published_date'].fillna(dts)
paper['submitted_date']=paper['submitted_date'].fillna(dts)

# Unify columns
paper['venue_type']=np.where(paper.conference.notna(), 'Conference', 'Journal')
paper['venue']=paper.conference.fillna(paper.journal)
paper['publication']=paper.edition.fillna(paper.volume)
paper.drop(columns=['edition','conference','volume','journal'], inplace=True)

# Since editions will now be conference conepts, all papers submitted to a conference will use the id of its proceeding as id for the conference
m=paper.venue_type=='Conference'
paper.loc[m, ['venue']] = (
    paper.loc[m, ['publication']].values)

convert_to_dt(paper)

#### PAPER CONSTRAINTS

# Submission date is less then published date
m=paper.submitted_date>paper.published_date
paper.loc[m, ['published_date', 'submitted_date']] = (
    paper.loc[m, ['submitted_date', 'published_date']].values)

# Poster can only be in conference. if not conference, change type
paper.loc[(paper.type=='poster')&(paper.venue_type=='Journal'),
          'type']=pd.Series([random.sample(['short','demo','full'], 1)[0] for i in range(len(paper))])

# Infer publication date from paper published dates
published=paper.groupby(['venue_type','venue','publication']).agg({'published_date':max,'submitted_date':min}).reset_index()
published['published_date']=published[['published_date','submitted_date']].max(axis=1)
published.drop(columns=['submitted_date'], inplace=True)
submitted=paper[['paper','submitted_date','venue_type','venue','publication']].copy()

# Get decision per paper
decision=pd.read_csv('reviews.csv').groupby('paper').agg({'decision':['sum','count']})
decision=((decision.iloc[:,0]/decision.iloc[:,1])>0.5).to_dict()
paper['decision']=paper.paper.map(decision)

# Delete values for non-approved papers based on review decisions
for col in ['published_date','publication','doi']:
    print(col)
    paper.loc[(paper.decision==False)&(paper[col].notna()),[col]]=np.nan

paper.drop(columns=['published_date'], inplace=True)

# Create submission id -- note: submission and paper has a one to one relationship, as stated in the assumptions
paper['submission']='sub-'+paper.paper.astype(int).astype(str)

paper

To date: published_date
To date: submitted_date
published_date
publication
doi


Unnamed: 0,paper,submitted_date,title,abstract,wordcount,type,doi,venue_type,venue,publication,decision,submission
0,2178047,2002-10-07,Applying External Solutions to Organizational ...,Inside establish imagine question there garden...,6190,poster,http://doi.org/7482469/94.13.49.12,Conference,976927f9-0db0-4946-925d-d113880b67d9-2002,976927f9-0db0-4946-925d-d113880b67d9-2002,True,sub-2178047
1,46711191,2013-09-01,A loop based approach to analytical multi-core...,This paper presents a loop based formulation f...,4204,short,http://doi.org/3086481/179.8.95.171,Conference,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,True,sub-46711191
2,14154659,2012-03-25,Relay selection in multi-user amplify-forward ...,For multi-user (MU) amplify-and-forward (AF) c...,4089,full,http://doi.org/5227455/205.156.174.3,Conference,0d6f7fba-7092-46b3-8039-93458dba736b-2012,0d6f7fba-7092-46b3-8039-93458dba736b-2012,True,sub-14154659
3,14119063,2011-06-10,ID-based proxy re-signcryption scheme,"Combining the idea of signcryption, a proxy re...",4997,full,http://doi.org/563719/173.16.247.75,Conference,047958df-6384-459e-9864-63f946419551-2011,047958df-6384-459e-9864-63f946419551-2011,True,sub-14119063
4,45069334,1979-04-06,Autonomous control of running takeoff and land...,Behind air development professor. Manager know...,4899,short,http://doi.org/8340312/78.156.213.164,Conference,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,True,sub-45069334
...,...,...,...,...,...,...,...,...,...,...,...,...
4325,257395327,2002-01-01,Institutions. The important forecasts because ...,"Calls this a shower, being heavier than hydrog...",6649,demo,http://doi.org/1616189/169.203.92.2,Journal,ffaa4409-29fb-4245-a496-c51b151f9f5f,ffaa4409-29fb-4245-a496-c51b151f9f5f-2002,True,sub-257395327
4326,257395328,2002-01-01,Parent plant and prompted the revolution that ...,Naval dockyards röntgen discovered. With prey ...,5210,demo,http://doi.org/6586262/93.30.52.186,Journal,ffaa4409-29fb-4245-a496-c51b151f9f5f,ffaa4409-29fb-4245-a496-c51b151f9f5f-2002,True,sub-257395328
4327,257395329,2020-01-01,Many intersections appellation eclipsed and ev...,Mariners and limited-convective patches. Parts...,6951,short,,Journal,fff3549c-df24-4aef-accb-a33ae442a828,,False,sub-257395329
4328,257395330,2020-01-01,"And downtown, to Aswan and is now landfilled, ...","Don; williams, hills, mountains.. Learning dis...",4206,demo,,Journal,fff3549c-df24-4aef-accb-a33ae442a828,,False,sub-257395330


In [50]:
# Read reviews
review=pd.read_csv('reviews.csv').rename(columns={'reviewerid':'reviewer'})

# Get dates
review=(submitted
 .merge(published, on=['venue_type','venue','publication'], how='outer')
 .merge(review, on=['paper'], how='right')
)
review['reviewDate']=[fake.date_between_dates(j['submitted_date'], j['published_date']) for i,j in review.iterrows()]
review.drop(columns=['venue_type','venue','publication','submitted_date','published_date'], inplace=True)
review['review']=review['paper'].astype(str)+'-'+review['reviewer'].astype(str)

#create submission id instead of paper id 
review['submission']='sub-'+review.paper.astype(int).astype(str)
review.drop(columns=['paper'], inplace=True)

review

Unnamed: 0,reviewer,decision,content,reviewDate,review,submission
0,52331035,1,"Ed. 2009), entire landmass of some kind of mat...",2020-07-01,225065627-52331035,sub-225065627
1,73771487,0,"Fir trees contracting party), traveling in any...",2020-07-01,225065627-73771487,sub-225065627
2,34202459,1,"Louis xiv, russia also concluded alliances tha...",2020-07-01,225065627-34202459,sub-225065627
3,15729050,0,Subsets r. journals request. Mainland north an...,2021-03-29,232355224-15729050,sub-232355224
4,15674973,1,Eastern egyptian are heard. Polar origin monso...,2021-03-26,232355224-15674973,sub-232355224
...,...,...,...,...,...,...
12985,49184152,1,"Masaryk, memorials in elevation.. First countr...",2020-08-21,257395330-49184152,sub-257395330
12986,120634484,0,"City"", the cloud. youtube. Predictions (reason...",2020-12-21,257395330-120634484,sub-257395330
12987,1799398854,1,Commonwealth since government freed. Rail netw...,2020-10-26,257395331-1799398854,sub-257395331
12988,1398510823,0,And sidewalks. mestizo (mixed) of south centra...,2020-12-14,257395331-1398510823,sub-257395331


In [51]:
# Note: using edition as conference title
conference=(pd.read_csv('conference.csv').rename(columns={'id':'conference'})
            .merge(pd.read_csv('holds.csv').drop(columns=['fake']), on=['conference'])
           .merge(pd.read_csv('edition.csv').rename(columns={'id':'edition'}).drop(columns=['fake','conference']), on='edition')
           .rename(columns={'venue':'location'})
            .drop(columns=['url'])
            .rename(columns={'edition':'title','name':'conferenceSeries'})
           .drop_duplicates()
           )
conference['conference']=conference['proceeding'].copy()
conference['title']=conference['year'].astype(str) + ' ' + conference['conferenceSeries']
conference['type']=[random.sample(['workshop', 'symposium', 'expert group','regular'], 1)[0] for i in range(len(conference))]

# SENSE CHECK: Check for conference series with more than one conference -- there is one series with 2 conferences
conference[conference.duplicated(subset=['conferenceSeries'], keep=False)]

# generate more fake fields
conference['issn']=conference.issn.fillna(pd.Series([fake.ssn() for i in range(len(conference))]))
conference['publisher']=[fake.company() for i in range(len(conference))]

# Get published date
conference=(conference.merge(published[published.venue_type=='Conference']
                  .rename(columns={'venue':'conference','publication':'proceeding'}))
            .drop(columns=['venue_type'])
)

# Separate conference and proceeding: note that there is a one to one correspondence for them
cols=['title','chairperson','location','Start','End','year','conferenceSeries','type']
proceeding=conference.copy()
conference=conference[['conference']+cols].drop_duplicates().reset_index(drop=True).rename(columns={'chairperson':'organizer'})
proceeding.drop(columns=cols, inplace=True)

display(conference, proceeding)

Unnamed: 0,conference,title,organizer,location,Start,End,year,conferenceSeries,type
0,976927f9-0db0-4946-925d-d113880b67d9-2002,2002 IFIP International Conference on e-Busine...,35463674,"Dayton, Ohio, United States",2002-10-07,2002-10-07,2002,"IFIP International Conference on e-Business, e...",expert group
1,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,2013 Australasian Universities Power Engineeri...,2132914,"Goiânia, Goiás, Brazil",2013-09-01,2013-09-01,2013,Australasian Universities Power Engineering Co...,workshop
2,0d6f7fba-7092-46b3-8039-93458dba736b-2012,2012 IEEE International Conference on Acoustic...,1410752027,"Goianésia, Goiás, Brazil",2012-03-25,2012-03-25,2012,"IEEE International Conference on Acoustics, Sp...",regular
3,047958df-6384-459e-9864-63f946419551-2011,2011 International Conference on Computer Scie...,119585726,"Federal, Entre Rios, Argentina",2011-06-10,2011-06-10,2011,International Conference on Computer Science a...,workshop
4,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,"2012 International Conference on Control, Auto...",2108384213,"Avignon, Provence-Alpes-Côte d'Azur, France",2012-01-01,2012-12-31,2012,"International Conference on Control, Automatio...",symposium
...,...,...,...,...,...,...,...,...,...
61,df9f7819-abf5-46f8-b6a8-6bd3261a21a5-2011,2011 Australasian Telecommunication Networks a...,2072520307,"Zavolzh’ye, Nizjnij Novgorod, Russia",2011-12-08,2011-12-08,2011,Australasian Telecommunication Networks and Ap...,symposium
62,0e129215-7c25-46c9-b04b-a0e9faabf021-2011,2011 International Conference on Information N...,99921433,"Mekla, Tizi Ouzou, Algeria",2011-03-03,2011-03-03,2011,International Conference on Information Networ...,symposium
63,b189dec0-41d0-4cea-a906-7c5186895904-2022,2022 Global Communications Conference,144009212,"Letňany, Praha, Czech Republic",2022-12-04,2022-12-04,2022,Global Communications Conference,regular
64,7431ff67-91dc-41fa-b322-1b1ca657025f-2022,2022 International Conference on Information a...,2107796763,"Montijo, Setúbal, Portugal",2022-01-27,2022-01-27,2022,International Conference on Information and Kn...,regular


Unnamed: 0,conference,proceeding,issn,publisher,published_date
0,976927f9-0db0-4946-925d-d113880b67d9-2002,976927f9-0db0-4946-925d-d113880b67d9-2002,095-37-7052,"Williams, Ramirez and Macias",2002-10-07
1,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,310-38-8465,Lewis PLC,2013-09-01
2,0d6f7fba-7092-46b3-8039-93458dba736b-2012,0d6f7fba-7092-46b3-8039-93458dba736b-2012,368-17-1336,Solomon Group,2012-03-25
3,047958df-6384-459e-9864-63f946419551-2011,047958df-6384-459e-9864-63f946419551-2011,227-23-4412,"Brown, Marsh and Holden",2011-06-10
4,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,118-90-2380,Hanna-Garcia,1979-04-06
...,...,...,...,...,...
61,df9f7819-abf5-46f8-b6a8-6bd3261a21a5-2011,df9f7819-abf5-46f8-b6a8-6bd3261a21a5-2011,,"Bryant, Harrison and Lee",2011-12-08
62,0e129215-7c25-46c9-b04b-a0e9faabf021-2011,0e129215-7c25-46c9-b04b-a0e9faabf021-2011,,Austin and Sons,2011-03-03
63,b189dec0-41d0-4cea-a906-7c5186895904-2022,b189dec0-41d0-4cea-a906-7c5186895904-2022,,"Frederick, Hayes and Stewart",2022-12-04
64,7431ff67-91dc-41fa-b322-1b1ca657025f-2022,7431ff67-91dc-41fa-b322-1b1ca657025f-2022,,Ford-Mcbride,2022-01-27


In [52]:
# Note: using volume id as volume as proceeding name
journal=(pd.read_csv('journal.csv').rename(columns={'id':'journal'})
            .merge(pd.read_csv('volume_of.csv').drop(columns=['fake']), on=['journal'])
           .merge(pd.read_csv('volume.csv').drop(columns=['volume']).rename(columns={'id':'volume'}).drop(columns=['fake']), on='volume')
           .drop_duplicates()
         .drop(columns=['url'])
         .rename(columns={'name':'title'})
        )
# Get published date
journal=(journal.merge(published[published.venue_type=='Journal']
                  .rename(columns={'venue':'journal','publication':'volume'}))
            .drop(columns=['venue_type'])
)

# generate more fake fields
journal['issn']=journal.issn.fillna(pd.Series([fake.ssn() for i in range(len(journal))]))
journal['publisher']=[fake.company() for i in range(len(journal))]
journal.drop(columns=['year'], inplace=True)

# Separate journal and volume
cols=['title','editor']
volume=journal.copy()
journal=journal[['journal']+cols].groupby(['journal','title']).head(1).reset_index(drop=True).rename(columns={'editor':'organizer'})
volume.drop(columns=cols, inplace=True)

display(journal, volume)

Unnamed: 0,journal,title,organizer
0,52df9a54-6cc3-4685-9826-f6ba927def1a,Nepalese journal of ophthalmology : a biannual...,6.748076e+06
1,9f840236-aa46-478e-98fe-68a1fe8b823e,Regenerative medicine,4.983612e+07
2,b6b26b43-2fef-41ad-98b9-af7ba33afa6b,Journal of the Medical Association of Thailand...,2.556517e+07
3,910f05b9-f423-44fc-9fc1-c6b3d2481fe0,Journal of Foot and Ankle Research,2.176073e+09
4,137df871-0be4-4ea4-9f85-52b2b36070a3,Journal of the American Dental Association,1.457783e+08
...,...,...,...
599,a00fde74-d8df-4613-b825-0fff9f531d3f,ArXiv,2.542754e+06
600,cbbe61ca-ba31-4f0a-b64c-49da7a69f20c,Hematology,2.911675e+08
601,b07ad337-ba9f-48ca-a0ff-59d1643601ac,Life sciences,7.456097e+08
602,e693dc83-bbed-4844-bfa8-a14861fdf715,Gene,4.225051e+08


Unnamed: 0,journal,issn,volume,published_date,publisher
0,52df9a54-6cc3-4685-9826-f6ba927def1a,2072-6805,52df9a54-6cc3-4685-9826-f6ba927def1a-12 24,2020-07-01,Walker-Smith
1,52df9a54-6cc3-4685-9826-f6ba927def1a,2072-6805,52df9a54-6cc3-4685-9826-f6ba927def1a-2020,2020-12-31,Anthony PLC
2,9f840236-aa46-478e-98fe-68a1fe8b823e,1524-0142,9f840236-aa46-478e-98fe-68a1fe8b823e-nan,2021-03-30,Kemp Inc
3,9f840236-aa46-478e-98fe-68a1fe8b823e,1524-0142,9f840236-aa46-478e-98fe-68a1fe8b823e-2021,2021-12-31,"Nixon, Young and Blevins"
4,b6b26b43-2fef-41ad-98b9-af7ba33afa6b,0125-2208,b6b26b43-2fef-41ad-98b9-af7ba33afa6b-97 Suppl 6,2014-06-01,Sosa-Duffy
...,...,...,...,...,...
1449,b07ad337-ba9f-48ca-a0ff-59d1643601ac,2252-6277,b07ad337-ba9f-48ca-a0ff-59d1643601ac-2022,2022-12-31,"Mckenzie, Robinson and Harvey"
1450,e693dc83-bbed-4844-bfa8-a14861fdf715,0378-1119,e693dc83-bbed-4844-bfa8-a14861fdf715-574 2,2015-12-15,Burns PLC
1451,e693dc83-bbed-4844-bfa8-a14861fdf715,0378-1119,e693dc83-bbed-4844-bfa8-a14861fdf715-2015,2015-12-31,Morse PLC
1452,21b36238-c30e-4cf2-aaff-afc89e023d0e,1742-2051,21b36238-c30e-4cf2-aaff-afc89e023d0e-5 4,2009-02-12,Hall-Martin


In [53]:
# Edit Paper/submission
# Get information about the chair/editor that assigned reviewers for that submission
org=pd.concat([conference[['conference','organizer']].assign(venue_type='Conference').rename(columns={'conference':'venue'}),
           journal[['journal','organizer']].assign(venue_type='Journal').rename(columns={'journal':'venue'})]).drop_duplicates()

paper=paper.merge(org, on=['venue_type','venue'], how='left')
paper

Unnamed: 0,paper,submitted_date,title,abstract,wordcount,type,doi,venue_type,venue,publication,decision,submission,organizer
0,2178047,2002-10-07,Applying External Solutions to Organizational ...,Inside establish imagine question there garden...,6190,poster,http://doi.org/7482469/94.13.49.12,Conference,976927f9-0db0-4946-925d-d113880b67d9-2002,976927f9-0db0-4946-925d-d113880b67d9-2002,True,sub-2178047,3.546367e+07
1,46711191,2013-09-01,A loop based approach to analytical multi-core...,This paper presents a loop based formulation f...,4204,short,http://doi.org/3086481/179.8.95.171,Conference,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,e2716c6a-50f4-4a85-be07-a76de507f09a-2013,True,sub-46711191,2.132914e+06
2,14154659,2012-03-25,Relay selection in multi-user amplify-forward ...,For multi-user (MU) amplify-and-forward (AF) c...,4089,full,http://doi.org/5227455/205.156.174.3,Conference,0d6f7fba-7092-46b3-8039-93458dba736b-2012,0d6f7fba-7092-46b3-8039-93458dba736b-2012,True,sub-14154659,1.410752e+09
3,14119063,2011-06-10,ID-based proxy re-signcryption scheme,"Combining the idea of signcryption, a proxy re...",4997,full,http://doi.org/563719/173.16.247.75,Conference,047958df-6384-459e-9864-63f946419551-2011,047958df-6384-459e-9864-63f946419551-2011,True,sub-14119063,1.195857e+08
4,45069334,1979-04-06,Autonomous control of running takeoff and land...,Behind air development professor. Manager know...,4899,short,http://doi.org/8340312/78.156.213.164,Conference,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,413493e7-4bb6-4c68-a57a-e21b1b3ca448-2012,True,sub-45069334,2.108384e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4325,257395327,2002-01-01,Institutions. The important forecasts because ...,"Calls this a shower, being heavier than hydrog...",6649,demo,http://doi.org/1616189/169.203.92.2,Journal,ffaa4409-29fb-4245-a496-c51b151f9f5f,ffaa4409-29fb-4245-a496-c51b151f9f5f-2002,True,sub-257395327,2.926423e+07
4326,257395328,2002-01-01,Parent plant and prompted the revolution that ...,Naval dockyards röntgen discovered. With prey ...,5210,demo,http://doi.org/6586262/93.30.52.186,Journal,ffaa4409-29fb-4245-a496-c51b151f9f5f,ffaa4409-29fb-4245-a496-c51b151f9f5f-2002,True,sub-257395328,2.926423e+07
4327,257395329,2020-01-01,Many intersections appellation eclipsed and ev...,Mariners and limited-convective patches. Parts...,6951,short,,Journal,fff3549c-df24-4aef-accb-a33ae442a828,,False,sub-257395329,6.520401e+08
4328,257395330,2020-01-01,"And downtown, to Aswan and is now landfilled, ...","Don; williams, hills, mountains.. Learning dis...",4206,demo,,Journal,fff3549c-df24-4aef-accb-a33ae442a828,,False,sub-257395330,6.520401e+08


In [54]:
hasAuthor=pd.read_csv('writes.csv').drop(columns=['fake'])
hasAuthor

Unnamed: 0,author,paper
0,7.265495e+06,219410769
1,4.086158e+07,219410769
2,2.073785e+09,219410769
3,6.880566e+06,225065627
4,5.113959e+07,209854486
...,...,...
13584,2.151717e+06,257395330
13585,4.990294e+07,257395331
13586,2.470346e+06,257395331
13587,4.627105e+07,257395331


In [55]:
area=pd.read_csv('topic.csv', usecols=['community']).rename(columns={'community':'topicName'}).drop_duplicates().reset_index(drop=True)
area['area']='area-'+area.index.astype(str)
area

Unnamed: 0,topicName,area
0,Pure Science,area-0
1,Applied Science,area-1
2,Social Science,area-2
3,Database,area-3


In [56]:
hasTopic=[]
for df in ['paper','journal','volume','conference','proceeding']:
    hasTopic.append(globals()[df][[df]].rename(columns={df:'id'}).assign(typ=df))
hasTopic=pd.concat(hasTopic, ignore_index=True)
hasTopic['area']=[random.sample(list(area.area.unique()), 1)[0] for i in range(len(hasTopic))]
hasTopic=hasTopic.merge(area, on=['area'])
hasTopic

Unnamed: 0,id,typ,area,topicName
0,2178047,paper,area-3,Database
1,46711191,paper,area-3,Database
2,43103677,paper,area-3,Database
3,33450665,paper,area-3,Database
4,24366958,paper,area-3,Database
...,...,...,...,...
6515,0abc814b-4b33-4afd-bb15-61351136028f-2014,proceeding,area-1,Applied Science
6516,9039088a-b047-49bc-b12a-3334017ad93d-2013,proceeding,area-1,Applied Science
6517,8ae277fb-3346-4eab-9e31-c719d1d8bb3f-2016,proceeding,area-1,Applied Science
6518,61d8580e-d1fe-40b8-91de-be06508ce120-2017,proceeding,area-1,Applied Science


In [57]:
del org, published, submitted
gc.collect()

1229

# Define Graph

In [58]:
# Set up graph
from rdflib import Graph, Namespace, URIRef, Literal, RDF, XSD, FOAF, RDFS

g = Graph()
sdm = Namespace('http://example.org/sdm#')

g.bind("sdm", sdm)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)
g.bind("rdf", RDF)

NS = {
    'sdm': sdm,
    'rdf': RDF,
    'rdfs': RDFS,
    'xsd':XSD,
}

## Define TBOX

In [59]:
# rules of the graph
# Concepts: 

# Author concept 
g.add((sdm.Author,RDFS.subClassOf, sdm.Person))
# Organizer concept
g.add((sdm.Organizer,RDFS.subClassOf, sdm.Author))
# Chair concept
g.add((sdm.Chair,RDFS.subClassOf, sdm.Organizer))
# Editor concept
g.add((sdm.Editor,RDFS.subClassOf, sdm.Organizer))

# Demo Paper concept 
g.add((sdm.DemoPaper,RDFS.subClassOf, sdm.Paper))

# Full Paper concept 
g.add((sdm.FullPaper,RDFS.subClassOf, sdm.Paper))

# Short Paper concept 
g.add((sdm.ShortPaper,RDFS.subClassOf, sdm.Paper))

# Poster concept 
g.add((sdm.Poster,RDFS.subClassOf, sdm.Paper))

# Proceeding concept 
g.add((sdm.Proceeding,RDFS.subClassOf, sdm.Publication))

# Volume concept 
g.add((sdm.Volume,RDFS.subClassOf, sdm.Publication))

# Journal concept 
g.add((sdm.Journal,RDFS.subClassOf, sdm.Venue))

# Conference concept 
g.add((sdm.Conference,RDFS.subClassOf, sdm.Venue))

# Workshop concept 
g.add((sdm.Workshop,RDFS.subClassOf, sdm.Conference))

# Regular Conference concept 
g.add((sdm.RegularConference,RDFS.subClassOf, sdm.Conference))

# Regular Conference concept 
g.add((sdm.RegularConference,RDFS.subClassOf, sdm.Conference))

# Synposium concept 
g.add((sdm.Symposium,RDFS.subClassOf, sdm.Conference))

# Expert Group concept 
g.add((sdm.ExpertGroup,RDFS.subClassOf, sdm.Conference))


<Graph identifier=N4237566f2c8644e298d0d4bfc15bb925 (<class 'rdflib.graph.Graph'>)>

In [60]:
# Properties 
# Range is Author 
# hasAuthor property
g.add((sdm.hasAuthor,RDFS.domain, sdm.Paper))
g.add((sdm.hasAuthor,RDFS.range, sdm.Author))

# hasReviewer
g.add((sdm.hasReviewer,RDFS.domain, sdm.Review))
g.add((sdm.hasReviewer,RDFS.range, sdm.Author))

# Range is Organizer 
# assignedBy property 
g.add((sdm.assignedBy,RDFS.domain, sdm.Submission))
g.add((sdm.assignedBy,RDFS.range, sdm.Organizer))

# hasOrganizer property 
g.add((sdm.hasOrganizer,RDFS.domain, sdm.Venue))
g.add((sdm.hasOrganizer,RDFS.range, sdm.Organizer))

# Range is Review 
# hasReview property
g.add((sdm.hasReview,RDFS.domain, sdm.Submission))
g.add((sdm.hasReview,RDFS.range, sdm.Review))

# Range is Submission
# includedIn property
g.add((sdm.includedIn,RDFS.domain, sdm.Paper))
g.add((sdm.includedIn,RDFS.range, sdm.Submission))

# Range is Area 
# paperRelatedTo property
g.add((sdm.paperRelatedTo,RDFS.subPropertyOf, sdm.relatedTo))

g.add((sdm.paperRelatedTo,RDFS.domain, sdm.Paper))
g.add((sdm.paperRelatedTo,RDFS.range, sdm.Area))

# venueRelatedTo property
g.add((sdm.venueRelatedTo,RDFS.subPropertyOf, sdm.relatedTo))

g.add((sdm.venueRelatedTo,RDFS.domain, sdm.Venue))
g.add((sdm.venueRelatedTo,RDFS.range, sdm.Area))

# publicationRelatedTo property
g.add((sdm.publicationRelatedTo,RDFS.subPropertyOf, sdm.relatedTo))

g.add((sdm.publicationRelatedTo,RDFS.domain, sdm.Publication))
g.add((sdm.publicationRelatedTo,RDFS.range, sdm.Area))

# Range is Publication 
# publishedIn property
g.add((sdm.publishedIn,RDFS.domain, sdm.Paper))
g.add((sdm.publishedIn,RDFS.range, sdm.Publication))

# hasPublished property 
g.add((sdm.hasPublished,RDFS.domain, sdm.Venue))
g.add((sdm.hasPublished,RDFS.range, sdm.Publication))

# Range is Proceeding 
# posterPublishedIn property
g.add((sdm.posterPublishedIn,RDFS.subPropertyOf, sdm.publishedIn))


g.add((sdm.posterPublishedIn,RDFS.domain, sdm.Poster))
g.add((sdm.posterPublishedIn,RDFS.range, sdm.Proceeding))

<Graph identifier=N4237566f2c8644e298d0d4bfc15bb925 (<class 'rdflib.graph.Graph'>)>

In [61]:
## Data properties

## Person concept data properties
## hasPersonName
g.add((sdm.hasPersonName,RDFS.domain, sdm.Person))
g.add((sdm.hasPersonName,RDFS.range, XSD.string))

## hasBirthDate 
g.add((sdm.hasBirthDate,RDFS.domain, sdm.Person))
g.add((sdm.hasBirthDate,RDFS.range, XSD.date))

## hasSex 
g.add((sdm.hasSex,RDFS.domain, sdm.Person))
g.add((sdm.hasSex,RDFS.range, XSD.string))

## originCountry 
g.add((sdm.originCountry,RDFS.domain, sdm.Person))
g.add((sdm.originCountry,RDFS.range, XSD.string))

## Author concept data properties 
## hasHIndex 
g.add((sdm.hasHIndex,RDFS.domain, sdm.Author))
g.add((sdm.hasHIndex,RDFS.range, XSD.float))

## url
g.add((sdm.url,RDFS.domain, sdm.Author))
g.add((sdm.url,RDFS.range, XSD.string))

## affiliatedWithInstitution
g.add((sdm.affiliatedWithInstitution,RDFS.domain, sdm.Author))
g.add((sdm.affiliatedWithInstitution,RDFS.range, XSD.string))


## Review concept data properties 
## decision
g.add((sdm.decision,RDFS.domain, sdm.Review))
g.add((sdm.decision,RDFS.range, XSD.integer))

## content 
g.add((sdm.content,RDFS.domain, sdm.Review))
g.add((sdm.content,RDFS.range, XSD.string))

## reviewDate
g.add((sdm.reviewDate,RDFS.domain, sdm.Review))
g.add((sdm.reviewDate,RDFS.range, XSD.date))

## Submission concept data properties 
## submissionDate
g.add((sdm.submissionDate,RDFS.domain, sdm.Submission))
g.add((sdm.submissionDate,RDFS.range, XSD.date))

## Venue concept data properties
## hasVenueTitle
g.add((sdm.hasVenueTitle,RDFS.domain, sdm.Venue))
g.add((sdm.hasVenueTitle,RDFS.range, XSD.string))

## Conference concept data properties 
## conferenceSeries
g.add((sdm.conferenceSeries,RDFS.domain, sdm.Conference))
g.add((sdm.conferenceSeries,RDFS.range, XSD.string))

## startDate
g.add((sdm.startDate,RDFS.domain, sdm.Conference))
g.add((sdm.startDate,RDFS.range, XSD.date))

## endDate
g.add((sdm.endDate,RDFS.domain, sdm.Conference))
g.add((sdm.endDate,RDFS.range, XSD.date))

## heldIn
g.add((sdm.heldIn,RDFS.domain, sdm.Conference))
g.add((sdm.heldIn,RDFS.range, XSD.string))

## heldInYear
g.add((sdm.heldInYear,RDFS.domain, sdm.Conference))
g.add((sdm.heldInYear,RDFS.range, XSD.integer))

## Publication concept data properties
## publicationIssn
g.add((sdm.publicationIssn,RDFS.domain, sdm.Publication))
g.add((sdm.publicationIssn,RDFS.range, XSD.string))

## publisher 
g.add((sdm.publisher,RDFS.domain, sdm.Publication))
g.add((sdm.publisher,RDFS.range, XSD.string))

## publishedDate
g.add((sdm.publishedDate,RDFS.domain, sdm.Publication))
g.add((sdm.publishedDate,RDFS.range, XSD.date))

## Paper concept data properties 
## paperTitle
g.add((sdm.paperTitle,RDFS.domain, sdm.Paper))
g.add((sdm.paperTitle,RDFS.range, XSD.string))

## paperWordCount
g.add((sdm.paperWordCount,RDFS.domain, sdm.Paper))
g.add((sdm.paperWordCount,RDFS.range, XSD.integer))

## paperAbstract 
g.add((sdm.paperAbstract,RDFS.domain, sdm.Paper))
g.add((sdm.paperAbstract,RDFS.range, XSD.string))

## paperDOI
g.add((sdm.paperDOI,RDFS.domain, sdm.Paper))
g.add((sdm.paperDOI,RDFS.range, XSD.string))

## Area concept data properties 
## hasTopicName
g.add((sdm.hasTopicName,RDFS.domain, sdm.Area))
g.add((sdm.hasTopicName,RDFS.range, XSD.string))

<Graph identifier=N4237566f2c8644e298d0d4bfc15bb925 (<class 'rdflib.graph.Graph'>)>

## Define ABOX

In [62]:
import re
from datetime import datetime

# Helper functions
def prepareValue(row, uri=sdm):
    """
    Function that prepares the values to be added to the graph as a URI or Literal
    source: https://wiki.uib.no/info216/index.php/Python_Examples#RDF_programming_with_RDFlib_.28Lab_2.29
    Input: row value 
    Output: Converted URI or literal
    """
    if row == None:  # none type
        value = Literal(row)
    elif (isinstance(row, str) and re.match(r'\d{4}-\d{2}-\d{2}', row)) or isinstance(row, datetime):  # date
        value = Literal(row, datatype=XSD.date)
    elif isinstance(row, bool):  # boolean value (true / false)
        value = Literal(row, datatype=XSD.boolean)
    elif isinstance(row, int):  # integer
        value = Literal(row, datatype=XSD.integer)
    elif isinstance(row, str):  # string
#         # Use this if saving as URI
#         pattern='^((http|https)://)[-a-zA-Z0-9@:%._\\+~#?&//=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%._\\+~#?&//=]*)$'
#         value=URIRef(uri+re.sub(pattern,'_',row.replace('\n','_').replace('\t','_').replace(",", "").replace("-", "_").replace('"', '').replace('\\', '')).replace(" ",'_') )
        # if normal string
        value = Literal(row, datatype=XSD.string)
    elif isinstance(row, float):  # float
        value = Literal(row, datatype=XSD.float)
    return value

def to_camel_case(text):
    """
    Convert string to camel case (no spaces)
    """
    x=[i for i in text]
    return ''.join(sum([],[x[0].upper()]+x[1:]))

In [63]:
# General cLean up of all dfs
df_list= %who_ls DataFrame
for df in df_list:
    print(df)
    # replace nulls with None
    globals()[df]=globals()[df].replace(np.nan, None)
    
    # Make all date columns into datetime
    dcols=[i for i in globals()[df].columns if 'date' in i.lower() or i.endswith('_dt')]
    for col in dcols:
        globals()[df][col]=pd.to_datetime(globals()[df][col])

area
author
conference
hasAuthor
hasTopic
journal
paper
proceeding
review
volume


In [64]:
# Convert the non-semantic CSV dataset into a semantic RDF
def area_to_rdf(df):
    """
    Concepts: Area
    """
    for index, row in df.iterrows():
        id = URIRef(sdm + "Area_" + str(row['area']))
        name = prepareValue(row["topicName"])
        
        # Adds the triples
#         g.add((id, RDF.type, sdm.Area))
        g.add((id, sdm.hasTopicName, name))
        
    print('Done: Area')
        
def author_to_rdf(df):
    """
    Concepts: Person, Author
    """
    for index, row in df.iterrows():
        # define values
        #id = URIRef(sdm + "Person_" + str(row['author']))
        id = URIRef(sdm + "Author_" + str(row['author']))
        name = prepareValue(row["name"])
        birthdate = prepareValue(row["birthdate"])
        sex = prepareValue(row["sex"])
        country = prepareValue(row["originCountry"])
        
        # Adds the triples
#         g.add((id, RDF.type, sdm.Author))
        g.add((id, sdm.hasPersonName, name))
        g.add((id, sdm.hasBirthDate, birthdate))
        g.add((id, sdm.hasSex, sex))
        g.add((id, sdm.originCountry, country))
        
        # Author
        #id = URIRef(sdm + "Author_" + str(row['author']))
        url = prepareValue(row["url"])
        hindex = prepareValue(row["hIndex"])
        institution = prepareValue(row["institution"])
                
        # Adds the triples
        g.add((id, sdm.url, url))
        g.add((id, sdm.hasHIndex, hindex))
        g.add((id, sdm.affiliatedWithInstitution, institution))
    print('Done: Author')

def conference_to_rdf(df):
    """
    Concepts: Conference
    Relationships: hasOrganizer
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Conference_" + str(row['conference']))
        conf_type={'expert group':sdm.ExpertGroup, 'symposium':sdm.Symposium, 
                   'workshop':sdm.Workshop, 'regular':sdm.RegularConference}[row['type']]
        title = prepareValue(row["title"])
        location = prepareValue(row["location"])
        start = prepareValue(row["Start"])
        end = prepareValue(row["End"])
        year = prepareValue(row["year"])
        conferenceSeries = prepareValue(row["conferenceSeries"])
        
        # Adds the triples 
#         g.add((id, RDF.type, conf_type))
        g.add((id, sdm.hasVenueTitle, title))
        g.add((id, sdm.heldIn, location))
        g.add((id, sdm.startDate, start))
        g.add((id, sdm.endDate, end))
        g.add((id, sdm.heldInYear, year))
        g.add((id, sdm.conferenceSeries, conferenceSeries))
        
        # Relationships
        author_org=URIRef(sdm + "Author_" + str(row['organizer']))
        #author_org=URIRef(sdm + str(row['organizer']))
        
        # Adds the triples 
        g.add((author_org, RDF.type, sdm.Chair))
        g.add((id, sdm.hasOrganizer, author_org))
    print('Done: Conference')
        
def journal_to_rdf(df):
    """
    Concepts: Journal
    Relationships: hasOrganizer
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Journal_" + str(row['journal']))
        title = prepareValue(row["title"])
        
        # Adds the triples 
        g.add((id, RDF.type, sdm.Journal))
        g.add((id, sdm.hasVenueTitle, title))
        
        # Relationships
        author_org=URIRef(sdm + "Author_" + str(row['organizer']))
        #author_org=URIRef(sdm + str(row['organizer']))

        
        # Adds the triples 
        g.add((author_org, RDF.type, sdm.Editor))
        g.add((id, sdm.hasOrganizer, author_org))
    print('Done: Journal')

def volume_to_rdf(df):
    """
    Concepts: Volume
    Relationships: hasPublished
    NOTE: Used Venue > Publication relationship. note that Volume URIs replace spaces with _
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Volume_" + str(row['volume']).replace(' ','_'))
        jid= URIRef(sdm + "Journal_" + str(row['journal']))
        issn = prepareValue(row["issn"])
        published_date = prepareValue(row["published_date"])
        publisher = prepareValue(row["publisher"])
        
        # Adds the triples
        g.add((id, RDF.type, sdm.Volume))
        g.add((id, sdm.publicationIssn, issn))
        g.add((id, sdm.publishedDate, published_date))
        g.add((id, sdm.publisher, publisher))
        
        # Relationship
        g.add((jid, sdm.hasPublished, id))
    print('Done: Volume')

def proceeding_to_rdf(df):
    """
    Concepts: proceeding
    Relationships: hasPublished
    NOTE: Used Venue > Publication relationship
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Proceeding_" + str(row['proceeding']))
        cid= URIRef(sdm + "Conference_" + str(row['conference']))
        issn = prepareValue(row["issn"])
        published_date = prepareValue(row["published_date"])
        publisher = prepareValue(row["publisher"])
        
        # Adds the triples
        g.add((id, RDF.type, sdm.Proceeding)) 
        g.add((id, sdm.publicationIssn, issn))
        g.add((id, sdm.publishedDate, published_date))
        g.add((id, sdm.publisher, publisher))
        
        # Relationship
        g.add((cid, sdm.hasPublished, id))
    print('Done: Proceeding')

def paper_to_rdf(df):
    """
    Concepts: Paper, Submission
    Relationships: includedIn, publishedIn, assignedBy
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Paper_" + str(row['paper']))
        sid = URIRef(sdm + "Submission_" + str(row['submission']))
        oid=URIRef(sdm + 'Author_' + str(row['organizer']))
        #oid=URIRef(sdm + str(row['organizer']))
        paper_type={'demo':sdm.DemoPaper, 'full':sdm.FullPaper, 'short':sdm.ShortPaper, 'poster':sdm.Poster}[row['type']]
        
        for col in df.columns:
            locals()[col]=prepareValue(row[col])
            
        # Paper properties
        g.add((id,RDF.type, paper_type))
        g.add((id,sdm.paperAbstract, locals()['abstract']))
        g.add((id,sdm.paperTitle, locals()['title']))
        g.add((id,sdm.paperWordCount, locals()['wordcount']))

        # Submission properties
        g.add((sid,sdm.submissionDate, locals()['submitted_date']))

        # Relationships
        g.add((id,sdm.includedIn,sid))
        g.add((sid, sdm.assignedBy, oid))
        
        # Conditional property and relationship, only add if paper decision is true (published)
        if row['decision']:
            pid=URIRef(sdm + {'Conference':'Proceeding_','Journal':'Volume_'}[row['venue_type']] + str(row['publication']).replace(' ','_'))
            g.add((id,sdm.paperDOI, locals()['doi']))
            if [row['type']]!='poster':
                g.add((id,sdm.publishedIn,pid))
            else:
                g.add((id,sdm.posterPublishedIn,pid))
    print('Done: Paper')

def review_to_rdf(df):
    """
    Concepts: Review
    Relationships: hasReviewer, hasReview
    """
    for index, row in df.iterrows():
        # define values
        id = URIRef(sdm + "Review_" + str(row['review']))
        sid = URIRef(sdm + "Submission_" + str(row['submission']))
        rid=URIRef(sdm + 'Author_' + str(row['reviewer']))
        #rid=URIRef(sdm + str(row['reviewer']))
        
        for col in df.columns:
            locals()[col]=prepareValue(row[col])
            
        # Paper properties
        g.add((id,sdm.decision, locals()['decision']))
        g.add((id,sdm.content, locals()['content']))
        g.add((id,sdm.reviewDate, locals()['reviewDate']))

        # Relationships
        g.add((id,sdm.hasReviewer,rid))
        g.add((sid, sdm.hasReview, id))
    print('Done: Review')
        
def hasauthor_to_rdf(df):
    """
    Relationships: hasAuthor
    """
    for index, row in df.iterrows():
        # define values
        pid = URIRef(sdm + "Paper_" + str(row['paper']))
        aid = URIRef(sdm + "Author_" + str(row['author']))
        #aid = URIRef(sdm + str(row['author']))

        # Relationships
        g.add((pid,sdm.hasAuthor,aid))
    print('Done: hasAuthor')
        
def hastopic_to_rdf(df):
    """
    Relationships: paperRelatedTo, venueRelatedTo, publicationRelatedTo
    """
    for index, row in df.iterrows():
        # define values
        pid = URIRef(sdm + to_camel_case(row['typ'])+'_' + str(row['id']).replace(' ','_'))
        aid = URIRef(sdm + "Area_" + str(row['area']))
        rel={'paper':sdm.paperRelatedTo, 'journal':sdm.venueRelatedTo, 'volume':sdm.publicationRelatedTo, 
             'conference':sdm.venueRelatedTo, 'proceeding':sdm.publicationRelatedTo}[row['typ']]

        # Relationships
        g.add((pid,rel,aid))
        
    print('Done: hasTopic')

In [65]:
area_to_rdf(area)
author_to_rdf(author)
conference_to_rdf(conference)
journal_to_rdf(journal)
volume_to_rdf(volume)
proceeding_to_rdf(proceeding)
paper_to_rdf(paper) 
review_to_rdf(review) 
hasauthor_to_rdf(hasAuthor)
hastopic_to_rdf(hasTopic)

Done: Area
Done: Author
Done: Conference
Done: Journal
Done: Volume
Done: Proceeding
Done: Paper
Done: Review
Done: hasAuthor
Done: hasTopic


# Export

In [66]:
os.chdir(savefolder)

# # serialize without inference
# g.serialize(destination='output_graph.rdf',format="xml")

<Graph identifier=N4237566f2c8644e298d0d4bfc15bb925 (<class 'rdflib.graph.Graph'>)>

In [67]:
import owlrl

# serialize with inference
engine = owlrl.RDFSClosure.RDFS_Semantics(g,False,False,False)
engine.closure()
engine.flush_stored_triples()
g.serialize(destination='output_graph_inference.rdf',format="xml")
#print(g.serialize())

  if lt1.value == lt2.value


<Graph identifier=N4237566f2c8644e298d0d4bfc15bb925 (<class 'rdflib.graph.Graph'>)>