In [None]:
import pandas as pd
import requests
import rdflib
from rdflib import URIRef
from rdflib.namespace import RDF
import xml.etree.ElementTree as ET
import networkx as nx
import regex as re
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import ast

In [None]:
# Fields are separated by a comma with optional surrounding whitespace.
# A regex separator requires the python parsing engine; the pattern is a raw
# string so that "\s" reaches the regex engine instead of being treated as an
# (invalid) Python string escape.
df = pd.read_csv('data/person_review_name.csv', sep=r'\s*,\s*', engine='python')

In [None]:
def rdf_to_xml(url):
    """Load an RDF resource and return it as an ElementTree root element.

    The resource is parsed with ``rdflib``, re-serialised as ``pretty-xml``
    and parsed again with ``xml.etree.ElementTree``.

    Parameters
    ----------
    url
        Location handed to ``rdflib.Graph().parse``.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        Root element of the serialised graph, or ``None`` when loading,
        serialising or XML parsing fails (a message is printed in that case).
    """
    try:
        graph = rdflib.Graph().parse(url)
        serialized = graph.serialize(format='pretty-xml')
        return ET.fromstring(serialized)
    except Exception as exc:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop a long crawl.
        # The failing URL and the error are reported to make the cause visible.
        print(f'Fin du dataframe ? ({url!r}: {exc!r})')
        return None

def url_to_name(cites_urls):
    """Resolve cited document URLs to a display name and a publication id.

    For each URL, the publication id is the run of lowercase letters that
    follows ``persee.fr/doc/`` and precedes ``_<digits>``. The name is the
    beginning of the document's ``dcterms:bibliographicCitation`` text, up to
    (not including) the first ``.``.

    Parameters
    ----------
    cites_urls : iterable of str
        URLs of the cited documents.

    Returns
    -------
    tuple of (list, list)
        ``(cites_names, id_publishs)``. Both lists always contain exactly one
        entry per input URL, in input order. A name falls back to the URL
        itself when the RDF cannot be loaded or holds no usable citation; an
        id falls back to ``'unknown'`` when the URL does not match the pattern.
    """
    # A string (not None) so the value stays usable as a graph attribute.
    unknown_publisher = 'unknown'

    cites_names = []
    id_publishs = []
    for url in cites_urls:
        id_match = re.search(r'(?<=persee\.fr/doc/)[a-z]+(?=_[0-9]+)', url)
        id_publish = id_match.group(0) if id_match else unknown_publisher

        # Default to the URL; replaced below only if a citation is available.
        name = url
        root = rdf_to_xml(url)
        if root is not None:
            bib_cit = root.findtext('.//{http://purl.org/dc/terms/}bibliographicCitation')
            if bib_cit:
                name_match = re.match(r'[^.]+', bib_cit)
                if name_match:
                    name = name_match.group(0)

        # Exactly one append per list and per URL keeps both lists aligned,
        # which callers rely on when zipping them together.
        cites_names.append(name)
        id_publishs.append(id_publish)
    return cites_names, id_publishs

In [None]:
# Directed graph: an edge A -> B is added when a record labelled A cites a
# document labelled B; the edge "weight" counts how often that happens.
net_meta = nx.DiGraph()

nb_a_traiter = len(df.index)

# Fully-qualified (namespace + local name) tags looked up in each RDF record.
BIB_CITATION_TAG = '{http://purl.org/dc/terms/}bibliographicCitation'
CITES_TAG = '{http://purl.org/spar/cito/}cites'

for i in range(nb_a_traiter):
    url_df = df.loc[i, 'Review_18']
    root = rdf_to_xml(url_df)
    if root is None:
        # rdf_to_xml already reported the failure; skip this row instead of
        # crashing on root.find().
        continue

    # Node label: beginning of the bibliographic citation, up to the first '.'.
    bib_cit = root.findtext(f'.//{BIB_CITATION_TAG}')
    label_match = re.match(r'[^.]+', bib_cit) if bib_cit else None
    if label_match is None:
        print(f'No usable bibliographicCitation for {url_df!r}, row {i} skipped')
        continue
    name_rdf_bib = label_match.group(0)

    # Distinct cited URLs, in document order, excluding self-citations.
    cites_url = []
    for elem in root.iter(CITES_TAG):
        # The target URL is taken from the element's first attribute value.
        cite_url = next(iter(elem.attrib.values()), None)
        if cite_url is None or cite_url == url_df or cite_url in cites_url:
            continue
        cites_url.append(cite_url)

    # Resolve the citations once per row (each resolution fetches the cited
    # record), and start from empty lists so that a row without citations
    # never reuses the previous row's results.
    if cites_url:
        cites_names, id_publishs = url_to_name(cites_url)
    else:
        cites_names, id_publishs = [], []

    if name_rdf_bib not in net_meta:
        net_meta.add_node(name_rdf_bib, corpus=True, publishing='jds', nPaper=1)
    else:
        net_meta.nodes[name_rdf_bib]["nPaper"] += 1

    for name_or_url, id_publish in zip(cites_names, id_publishs):
        if name_or_url not in net_meta:
            net_meta.add_node(name_or_url, corpus=False, publishing=id_publish, nPaper=1)
        if net_meta.has_edge(name_rdf_bib, name_or_url):
            net_meta[name_rdf_bib][name_or_url]["weight"] += 1
        else:
            net_meta.add_edge(name_rdf_bib, name_or_url, weight=1)


In [None]:
# Report the number of nodes, draw a quick preview of the graph, then export
# it to a GEXF file whose name carries the number of processed rows.
node_count = len(net_meta)
print(node_count)
nx.draw(net_meta)
gexf_path = f'graph_meta_{nb_a_traiter}.gexf'
nx.write_gexf(net_meta, gexf_path)