In [None]:
%pip install rdflib
%pip install SPARQLWrapper

In [None]:
import json
import os
import re
from collections import Counter
from pprint import pprint

import rdflib
import requests
import wikipedia as wkp
from bs4 import BeautifulSoup
from rdflib.namespace import RDF, RDFS, OWL, FOAF
from SPARQLWrapper import SPARQLWrapper, JSON

In [None]:
def get_rdf_triples(name):
    """Query DBpedia for triples related to a person and return them as dicts.

    The name is converted to a DBpedia resource IRI (spaces become
    underscores) and the public DBpedia SPARQL endpoint is asked for at most
    100 distinct rows.

    Args:
        name: Display name of the person, e.g. "Ada Lovelace".

    Returns:
        A list of dicts with the keys 'subject', 'predicate' and 'object'.
        'subject' is always the person's resource IRI; 'predicate' and
        'object' are taken from the query bindings.

    Raises:
        Any exception raised by SPARQLWrapper while querying or converting
        the response; nothing is caught here.
    """
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    encoded_name = name.replace(' ', '_')

    # Reference the resource by its full <IRI> instead of a `dbr:` prefixed
    # name: prefixed names cannot carry characters such as ' , ( ) or a
    # trailing '.' unescaped, so names containing them produced a malformed
    # query. Only the characters that are illegal inside <...> are
    # percent-encoded; everything else (including non-ASCII letters) is kept.
    safe_name = re.sub(
        r'[\x00-\x20<>"{}|^`\\]',
        lambda m: f"%{ord(m.group()):02X}",
        encoded_name,
    )
    resource_iri = f"http://dbpedia.org/resource/{safe_name}"

    query = f"""
    SELECT DISTINCT ?subject ?predicate ?object
    WHERE {{
      {{
        ?subject ?predicate <{resource_iri}> .
        ?subject ?predicate ?object .
      }}
      UNION
      {{
        ?object ?predicate <{resource_iri}> .
        ?subject ?predicate ?object .
      }}
    }}
    LIMIT 100
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # NOTE(review): the ?subject binding is not read; every row is labelled
    # with the person's own IRI as subject. Confirm this is intended.
    return [
        {
            'subject': resource_iri,
            'predicate': row["predicate"]["value"],
            'object': row["object"]["value"],
        }
        for row in results["results"]["bindings"]
    ]

In [None]:
def save_rdf_triples(list_names, output_dir):
    """Fetch the triples of every person and save them as one JSON file each.

    Args:
        list_names: Iterable of person names; each one is passed to
            get_rdf_triples.
        output_dir: Directory that receives the files. It is created when it
            does not exist yet.

    Returns:
        None. For each name with at least one triple a file
        "<name>.json" is written into output_dir; names without triples and
        names whose processing fails are reported on stdout and skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    for name in list_names:
        try:
            triples = get_rdf_triples(name)

            if not triples:
                print(f"No RDF triples found for {name}")
                continue

            # Path separators and characters reserved on Windows would make
            # open() fail or write outside output_dir, so they are replaced
            # with '_' in the file name only (the stored triples are untouched).
            file_stem = re.sub(r'[<>:"/\\|?*]', '_', name)
            output_file = os.path.join(output_dir, f"{file_stem}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(triples, f, indent=4)

        except Exception as e:
            # Deliberately best effort: one failing name must not abort the
            # whole batch, so the error is reported and the loop continues.
            print(f"Error processing {name}: {e}")

In [None]:
# Load the JSON file holding the name lists, split it into the two categories
# and strip parenthesised qualifiers such as " (mathematician)" from the names.

# NOTE(review): absolute, machine-specific path - move to a configurable
# data directory before sharing the notebook.
# JSON is read as UTF-8, matching the other open() calls in this notebook.
with open(r'C:\Users\Elise\Downloads\names100_list.json', 'r', encoding='utf-8') as f:
    content = json.load(f)

# Presumably the first key holds the female mathematicians and the second the
# contemporary artists - this relies on the key order of the file; verify.
keys = list(content.keys())
fem_math = keys[0]
contem_art = keys[1]
mathematician_list_names = content[fem_math]
artist_list_names = content[contem_art]

# One pass per list is enough: the previous outer loops rebuilt the same list
# once per element and left the variables undefined for an empty list.
math_cleaned_names = [re.sub(r'\s*\(.*?\)', '', x) for x in mathematician_list_names]
art_cleaned_names = [re.sub(r'\s*\(.*?\)', '', x) for x in artist_list_names]

print(math_cleaned_names, '\n', art_cleaned_names)

In [None]:
# Fetch and store the triples for both cleaned name lists, one output
# directory per category.
save_rdf_triples(math_cleaned_names, 'Female_mathematicians_rdf')
save_rdf_triples(art_cleaned_names, 'Contemporary_artists_rdf')

In [None]:
#function to get all the facts from the objects of the graphs 

def get_facts(rdf_directory, name_list):
    """Collect, for every name, the facts stored in that person's triples file.

    A "fact" is the last '/'-separated segment of a triple's 'object' value,
    e.g. "http://dbpedia.org/resource/London" yields "London".

    Args:
        rdf_directory: Directory holding one "<name>.json" file per person,
            each containing a list of dicts with an 'object' key. A relative
            path is resolved against the current working directory.
        name_list: Names whose files should be read.

    Returns:
        A dict mapping every name of name_list to the list of its facts, in
        file order. A name without a file maps to an empty list.

    Raises:
        json.JSONDecodeError: if a file does not contain valid JSON.
        KeyError: if a stored triple has no 'object' key.
    """
    directory_path = os.path.join(os.getcwd(), rdf_directory)
    facts = {}

    for name in name_list:
        # Look for "<name>.json" first, then for the same name with
        # filesystem-reserved characters replaced by '_'.
        candidates = (
            f"{name}.json",
            re.sub(r'[<>:"/\\|?*]', '_', name) + ".json",
        )
        file_path = next(
            (
                path
                for path in (os.path.join(directory_path, c) for c in candidates)
                if os.path.isfile(path)
            ),
            None,
        )
        if file_path is None:
            facts[name] = []
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            triples = json.load(file)

        # Read the object by key (not by position) and exactly once per triple.
        facts[name] = [triple['object'].split('/')[-1] for triple in triples]

    print(facts)
    return facts

In [None]:
# Collect the facts for each category with get_facts (documented to return a
# name -> facts dictionary) from the directories written earlier.
math_dict = get_facts("Female_mathematicians_rdf", math_cleaned_names)
art_dict = get_facts("Contemporary_artists_rdf", art_cleaned_names)

In [None]:


##add graph parsers
# NOTE(review): `g` is not defined in the visible part of the notebook; it is
# presumably an rdflib.Graph that still has to be built/parsed before this
# cell can run on a fresh kernel - TODO confirm and add the parsing step.

properties = set()
counter = Counter()

# Count each predicate once per triple. Counting the de-duplicated set (as
# before) gave every predicate a count of 1, which made the ranking arbitrary.
for s, p, o in g:
    properties.add(p)
    counter[p] += 1

properties_list = list(properties)
#print(properties_list)

most_common_elements = counter.most_common(30)

for element, count in most_common_elements:
    print(f'Element: {element}, Count: {count}')

most_common_values = [element for element, count in most_common_elements]
print(most_common_values)