# Part 1.1

## Libraries to import

In [2]:
import random
import pandas as pd
import re
import json
import requests
from bs4 import BeautifulSoup
ua = {'User-agent': 'Mozilla/5.0'}

In [3]:
import wikipedia as wkp
import os
import rdflib
from rdflib.namespace import RDF, RDFS, OWL, FOAF
from pprint import pprint
from SPARQLWrapper import SPARQLWrapper, JSON

## Data collection function

### Auxiliary functions for text

In [4]:
def get_page_names(url, se_index) :
  """Function that gets all page names for pages about a category of people"""
  page = requests.get(url, headers=ua)
  soup = BeautifulSoup(page.content, 'html.parser')

  #Getting all the information on other wikipedia pages refered to on this page
  links = soup.find_all(href=re.compile("/wiki/"))

  #Getting the names of these pages
  names = []
  for link in links :
    names.append(link.get('title'))

  #Selecting only the ones about the people of interest (already grouped together and ordered alphabetically on the page)
  s_index = names.index(se_index[0])
  e_index = names.index(se_index[1])
  people = names[s_index:e_index+1]

  #final cleaning of the list(deleting links attached to figure captions that are in the middle of the list)
  figcaption = soup.find_all('figcaption')
  unwanted = []
  title = re.compile(r'(?<=title=").+?(?=")')
  for caption in figcaption :
    unwanted = unwanted + title.findall(str(caption))
  unwanted = list(set(unwanted))
  for i in range(len(people)) :
    if people[i] in unwanted :
      people[i] = None
  while None in people :
    people.remove(None)

  return people


def hundred(names) :
  random.shuffle(names)
  names100 = names[:100]
  return names100


def get_text(names) :
  texts = []
  for name in names :
    wpage = wkp.WikipediaPage(title=name)
    texts.append(wpage.content)
  return texts


#Creating the file names
def create_text_files(names100, data, category, output_dir) :
  file_names = []
  for name in names100 :
    file_names.append(name.replace(' ', '_'))

  category_dir = os.path.join(output_dir)
  os.makedirs(category_dir, exist_ok=True)

  for j in range(len(file_names)) :
    output_file = os.path.join(category_dir, f"{file_names[j]}.txt")
    with open(output_file, 'w') as f:
      f.write(data[category][j])

### Auxiliary functions for graphs

In [5]:
def get_rdf_triples(name):
    """Function that takes the name of a person and returns its triples, to be able to save them later"""
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    encoded_name = name.replace(' ', '_')

    query = f"""
    SELECT DISTINCT ?subject ?predicate ?object
    WHERE {{
      {{
        ?subject ?predicate dbr:{encoded_name} .
        ?subject ?predicate ?object .
      }}
      UNION
      {{
        ?object ?predicate dbr:{encoded_name} .
        ?subject ?predicate ?object .
      }}
    }}
    LIMIT 100
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    triples = []
    for result in results["results"]["bindings"]:
        predicate = result["predicate"]["value"]
        obj = result["object"]["value"]
        triples.append({'subject': f"http://dbpedia.org/resource/{encoded_name}", 'predicate': predicate, 'object': obj})
    return triples

In [6]:
def save_rdf_triples(list_names, output_dir):
    """Function that saves the triples in one file per person using get_rdf_triples"""
    category_dir = os.path.join(output_dir)
    os.makedirs(category_dir, exist_ok=True)

    for name in list_names:
        try:
            triples = get_rdf_triples(name)

            if triples:
                output_file = os.path.join(category_dir, f"{name}.json")
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(triples, f, indent=4)
            else:
                print(f"No RDF triples found for {name}")

        except Exception as e:
            print(f"Error processing {name}: {e}")

In [7]:
def get_facts(rdf_directory, name_list):
    """Function that takes the directory created in save_rdf_triples and a list of names
    to return the facts (part of the objects of the rdf graph) as a dictionary containing
    as keys the names and as values the facts associated with it"""

    directory_path = fr"{os.getcwd()}/{rdf_directory}"

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = json.load(file)
        list_obj = []
        for dicts in content:
            list_obj.append(dicts['object'])

        list_facts = [o.split('/')[-1] for o in list_obj]

    facts = {}
    for name in name_list:
        facts[f"{name}"] = list_facts

    return facts

### General main function

In [8]:
text_extraction = {"wikilink" : ["https://en.wikipedia.org/wiki/List_of_women_in_mathematics", "https://en.wikipedia.org/wiki/List_of_contemporary_artists"],
                   "category" : ["Female_mathematicians", "Contemporary_artists"],
                   "index" : [('Karen Aardal', 'Magdolna Zimányi'), ('Martine Aballéa', 'Ricardo Estanislao Zulueta')],
                   "dir_name" : ['Female_mathematicians_txt', 'Contemporary_artists_txt']}

graph_extraction = {"dbpedia_link" : [fr'https://dbpedia.org/describe/?url=http%3A%2F%2Fdbpedia.org%2Fresource%2FList_of_women_in_mathematics&sid=74483', fr'https://dbpedia.org/describe/?url=http%3A%2F%2Fdbpedia.org%2Fresource%2FList_of_contemporary_artists&go=Go&distinct=1&invfp=IFP_OFF&sas=SAME_AS_OFF&p=18&sid=74483&lp=19&op=16&gp=1'],
                    "dir_name" : ['Female_mathematicians_rdf', 'Contemporary_artists_rdf']}

In [9]:
#FINAL FUNCTION REGROUPING EVERYTHING

def text_graph_collection(text_extraction, graph_extraction) :

  #Text extraction

  data = {}
  names100 = []

  for i in range(len(text_extraction["wikilink"])) :
    page_names = get_page_names(text_extraction["wikilink"][i], text_extraction["index"][i])

    #random selecting of 100 links (to help with execution time and storage)
    names100.append(hundred(page_names))

    #Getting the text content of the pages
    data[text_extraction["category"][i]] = get_text(names100[i])

    #Storing the text files
    create_text_files(names100[i], data, text_extraction["category"][i], text_extraction["dir_name"][i])

  #Storing all the raw texts into one json file
  with open("./text_data.json", "w") as f:
    json.dump(data, f)

  #Creating the dataframe
  df = pd.DataFrame(data)


  #Graph extraction

  g = rdflib.Graph()
  facts = {}

  for i in range(len(graph_extraction["dbpedia_link"])) :

    #Building the graphs
    g.parse(graph_extraction["dbpedia_link"][i])

    #Storing the triples
    cleaned_names = [re.sub(r'\s*\(.*?\)', '', x) for x in names100[i]]
    save_rdf_triples(cleaned_names, graph_extraction["dir_name"][i])

    #Saving the facts into a dictionary
    name = text_extraction["category"][i]
    facts[f"{name}"] = get_facts(graph_extraction["dir_name"][i], cleaned_names)

  #Storing the facts into one json file
  with open("./facts.json", "w") as f:
    json.dump(facts, f)


  return data, df, facts

In [10]:
data, df, facts = text_graph_collection(text_extraction, graph_extraction)

No RDF triples found for Evelyn Prescott Wiggin
Error processing Elizabeth B. Dussan V.: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b"Virtuoso 37000 Error SP030: SPARQL compiler, line 6: syntax error at '.' before '?subject'\n\nSPARQL query:\n#output-format:application/sparql-results+json\n\n    SELECT DISTINCT ?subject ?predicate ?object\n    WHERE {\n      {\n        ?subject ?predicate dbr:Elizabeth_B._Dussan_V. .\n        ?subject ?predicate ?object .\n      }\n      UNION\n      {\n        ?object ?predicate dbr:Elizabeth_B._Dussan_V. .\n        ?subject ?predicate ?object .\n      }\n    }\n    LIMIT 100\n    \n"
No RDF triples found for Bettina Richmond
No RDF triples found for Ingeborg Seynsche
No RDF triples found for Élisabeth Gassiat
No RDF triples found for Cynthia Y. Young
No RDF triples found for Almut Burchard
No RDF triples found for Maria Laura Moura Mouzinho Leite Lopes
No RDF triples found for R

In [11]:
df

Unnamed: 0,Female_mathematicians,Contemporary_artists
0,Éva Tardos (born 1 October 1957) is a Hungaria...,Seb Toussaint is a Franco-British artist and m...
1,Deborah Loewenberg Ball is an educational rese...,Mark Wallinger (born 25 May 1959) is an Englis...
2,Olivia Caramello is an Italian mathematician. ...,"Jevel Demikovski (March 27, 1922 – February 4,..."
3,Nicole Spillane (born 2 January 1988) is a Fre...,Wangechi Mutu (born 1972) is a Kenyan American...
4,Mary Catherine Silber is a professor in the De...,Kenneth Feingold (born 1952 in USA) is a conte...
...,...,...
95,Marilyn E. Strutchens (born 1962) is an Africa...,INO is a visual artist from Greece who studied...
96,Daniela Kühn (born 1973) is a German mathemati...,"Patrick Brill (born 1963), better known by hi..."
97,Marie Lynn Miranda (born 1962/1963) is an Amer...,"Peter Ford Young (born January 2, 1940) is an ..."
98,Jo Boaler (born 18 February 1964) is a British...,"Nancy Goldin (born September 12, 1953) is an A..."


In [12]:
data

{'Female_mathematicians': ["Éva Tardos (born 1 October 1957) is a Hungarian mathematician and the Jacob Gould Schurman Professor of Computer Science at Cornell University.\nTardos's research interest is algorithms. Her work focuses on the design and analysis of efficient methods for combinatorial optimization problems on graphs or networks. She has done some work on network flow algorithms like approximation algorithms for network flows, cut, and clustering problems. Her recent work focuses on algorithmic game theory and simple auctions.\n\n\n== Education and career ==\nTardos received her Dipl. Math in 1981 and her Ph.D. 1984 from the Faculty of Sciences of the Eötvös Loránd University under her advisor András Frank. She was the Chair of the Department of Computer Science at Cornell from 2006-2010, and she is currently serving as the Associate Dean of the College of Computing and Information Science.\nShe was editor-in-Chief of SIAM Journal on Computing from 2004–2009, and is currentl

In [13]:
facts

{'Female_mathematicians': {'Éva Tardos': ['Crawlspace_(2004_film)',
   'El_Dorado_Airport_(Venezuela)',
   'Peter_Sved',
   'Márta_Svéd',
   'Pál_Turán',
   'Category:Mathematicians_from_Budapest',
   'Hongkou_District',
   'Paul_Erdős',
   'Category:1910_births',
   'Category:2005_deaths',
   'Category:Hungarian_emigrants_to_Australia',
   'George_Szekeres',
   'Macquarie_University',
   'Combinatorics',
   'Mathematician',
   'Australia',
   'Australians',
   'Category:Macquarie_University_faculty',
   'Budapest',
   'Category:Australian_mathematicians',
   'Adelaide',
   'Happy_ending_problem',
   'Happy_Ending_problem',
   'Category:Hungarian_Jews',
   'Category:20th-century_Hungarian_mathematicians',
   'Category:20th-century_women_mathematicians',
   'Category:Scientists_from_Sydney',
   'China',
   'Sydney',
   'Kingdom_of_Hungary',
   'New_Zealand',
   'Shanghai',
   'World_War_II',
   'Márta_Svéd',
   'Hungarian_people',
   'Projective_geometry',
   'Inversive_geometry',
   'L