# Part 1.1

## Libraries to import/install

In [3]:
!pip install wikipedia
!pip install rdflib
!pip install SPARQLWrapper

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=6b331ba10b2397de137b9ac6a714f665003426b945281e93d610512971e43b48
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [4]:
import random
import pandas as pd
import re
import json
import requests
from bs4 import BeautifulSoup
ua = {'User-agent': 'Mozilla/5.0'}

In [7]:
import wikipedia as wkp
import os
import rdflib
from rdflib.namespace import RDF, RDFS, OWL, FOAF
from pprint import pprint
from SPARQLWrapper import SPARQLWrapper, JSON

## Data collection function

### Auxiliary functions for text

In [8]:
def get_page_names(url, se_index) :
  """Gets all page names for pages about a category of people"""
  page = requests.get(url, headers=ua)
  soup = BeautifulSoup(page.content, 'html.parser')

  #Getting all the information on other wikipedia pages refered to on this page
  links = soup.find_all(href=re.compile("/wiki/"))

  #Getting the names of these pages
  names = []
  for link in links :
    names.append(link.get('title'))

  #Selecting only the ones about the people of interest (already grouped together and ordered alphabetically on the page)
  s_index = names.index(se_index[0])
  e_index = names.index(se_index[1])
  people = names[s_index:e_index+1]

  #final cleaning of the list(deleting links attached to figure captions that are in the middle of the list)
  figcaption = soup.find_all('figcaption')
  unwanted = []
  title = re.compile(r'(?<=title=").+?(?=")')
  for caption in figcaption :
    unwanted = unwanted + title.findall(str(caption))
  unwanted = list(set(unwanted))
  for i in range(len(people)) :
    if people[i] in unwanted :
      people[i] = None
  while None in people :
    people.remove(None)

  return people


def hundred(names) :
  random.shuffle(names)
  names100 = names[:100]
  return names100


def get_text(names) :
  texts = []
  for name in names :
    wpage = wkp.WikipediaPage(title=name)
    texts.append(wpage.content)
  return texts


#Creating the file names
def create_text_files(names100, data, category, output_dir) :
  file_names = []
  for name in names100 :
    file_names.append(name.replace(' ', '_'))

  category_dir = os.path.join(output_dir)
  os.makedirs(category_dir, exist_ok=True)

  for j in range(len(file_names)) :
    output_file = os.path.join(category_dir, f"{file_names[j]}.txt")
    with open(output_file, 'w') as f:
      f.write(data[category][j])

### Auxiliary functions for graphs

In [9]:
def get_rdf_triples(name):
    sparql = SPARQLWrapper("https://dbpedia.org/sparql")
    encoded_name = name.replace(' ', '_')

    query = f"""
    SELECT DISTINCT ?subject ?predicate ?object
    WHERE {{
      {{
        ?subject ?predicate dbr:{encoded_name} .
        ?subject ?predicate ?object .
      }}
      UNION
      {{
        ?object ?predicate dbr:{encoded_name} .
        ?subject ?predicate ?object .
      }}
    }}
    LIMIT 100
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    triples = []
    for result in results["results"]["bindings"]:
        predicate = result["predicate"]["value"]
        obj = result["object"]["value"]
        triples.append({'subject': f"http://dbpedia.org/resource/{encoded_name}", 'predicate': predicate, 'object': obj})
    return triples

In [10]:
#Function using get_rdf_triples
def save_rdf_triples(list_names, output_dir):
    category_dir = os.path.join(output_dir)
    os.makedirs(category_dir, exist_ok=True)

    for name in list_names:
        try:
            triples = get_rdf_triples(name)

            if triples:
                output_file = os.path.join(category_dir, f"{name}.json")
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(triples, f, indent=4)
            else:
                print(f"No RDF triples found for {name}")

        except Exception as e:
            print(f"Error processing {name}: {e}")

In [11]:
#Function to get all the facts from the objects of the graphs

def get_facts(rdf_directory, name_list):
    directory_path = fr"{os.getcwd()}\{rdf_directory}"

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            # Load the content of the JSON file
            content = json.load(file)              #iterate inside json files and load content

        list_obj = []
        for dicts in content:    # iterate inside the file (list of dicts)
            for obj in dicts:    # iterate inside the dicts of one file
                keys = list(dicts.keys())    #obtain list of the keys of a dict
                object_url = dicts[keys[2]]  #get the third value of a dict (the object)
                list_obj.append(object_url)  #put all objets in a list

        list_facts = [o.split('/')[-1] for o in list_obj]  #iterate inside the list of object to put the facts in a new list

    for name in name_list:
        facts[f"{name}"] = list_facts

    return facts

### General main function

In [12]:
text_extraction = {"wikilink" : ["https://en.wikipedia.org/wiki/List_of_women_in_mathematics", "https://en.wikipedia.org/wiki/List_of_contemporary_artists"],
                   "category" : ["Female_mathematicians", "Contemporary_artists"],
                   "index" : [('Karen Aardal', 'Magdolna Zimányi'), ('Martine Aballéa', 'Ricardo Estanislao Zulueta')],
                   "dir_name" : ['Female_mathematicians_txt', 'Contemporary_artists_txt']}

graph_extraction = {"dbpedia_link" : ['https://dbpedia.org/describe/?url=http%3A%2F%2Fdbpedia.org%2Fresource%2FList_of_women_in_mathematics&sid=74483', 'https://dbpedia.org/page/List_of_contemporary_artists'],
                    "dir_name" : ['Female_mathematicians_rdf', 'Contemporary_artists_rdf']}

In [16]:
#FINAL FUNCTION REGROUPING EVERYTHING

def text_graph_collection(text_extraction, graph_extraction) :

  #Text extraction

  data = {}
  names100 = []

  for i in range(len(text_extraction["wikilink"])) :
    page_names = get_page_names(text_extraction["wikilink"][i], text_extraction["index"][i])

    #random selecting of 100 links (to help with execution time and storage)
    names100.append(hundred(page_names))

    #Getting the text content of the pages
    data[text_extraction["category"][i]] = get_text(names100[i])

    #Storing the text files
    create_text_files(names100[i], data, text_extraction["category"][i], text_extraction["dir_name"][i])

    #Storing all the raw texts into one json file
    with open("./text_data.json", "w") as f:
      json.dump(data, f)

  #Creating the dataframe
  df = pd.DataFrame(data)


  #Graph extraction

  g = rdflib.Graph()
  facts = {}

  for i in range(len(graph_extraction["dbpedia_link"])) :

    #Building the graphs
    g.parse(graph_extraction["dbpedia_link"][i])

    #Storing the triples
    cleaned_names = [re.sub(r'\s*\(.*?\)', '', x) for x in names100[i]]
    save_rdf_triples(cleaned_names, graph_extraction["dir_name"][i])

    #Saving the facts into a dictionary
    name = text_extraction["category"][i]
    facts[f"{name}"] = get_facts(graph_extraction["dir_name"][i], cleaned_names)


  return data, df, facts

In [17]:
data, df, facts = text_graph_collection(text_extraction, graph_extraction)

No RDF triples found for Lauren Lynn Rose
No RDF triples found for Malena Español
No RDF triples found for Tatiana Roque
No RDF triples found for Anne Lemaître
No RDF triples found for Heisook Lee


FileNotFoundError: [Errno 2] No such file or directory: '/content\\Female_mathematicians_rdf'

In [None]:
df

Unnamed: 0,Female_mathematicians,Contemporary_artists
0,Concha Maria Gómez is an American mathematicia...,"Norman K. Carlberg (November 6, 1928 – Novembe..."
1,Darinka Dentcheva (Bulgarian: Даринка Денчева...,Samantha Louise Taylor-Johnson (née Taylor-Wo...
2,Dorothy Ann Eckels Bailie (born 1935) is an Am...,Nam June Paik (Korean: 백남준; RR: Baek Nam-jun; ...
3,Tamara Ann Broderick is an American computer s...,"Lawrence ""Larry"" Zox (May 31, 1937 – December ..."
4,Anna Catherine Gilbert (born 1972) is an Ameri...,"Don Gummer (born December 12, 1946) is an Amer..."
...,...,...
95,Johanna Weber (8 August 1910 – 24 October 2014...,John Duncan is an American multi-platform arti...
96,Julia Elisenda (Eli) Grigsby is an American ma...,Adrian Margaret Smith Piper (born September 20...
97,Venkatramani Lakshmibai (1944/1945 – 2 Decembe...,Sir Isaac Julien (born 21 February 1960) is a...
98,Karen I. Aardal (born 1961) is a Norwegian and...,"XCOPY is an anonymous, London-based digital ar..."


In [None]:
data

{'Female_mathematicians': ["Carolyn A. Maher is the Distinguished Professor of Mathematics Education and Director of the Robert B. Davis Institute for Learning. She received the 2022 National Council of Teachers of Mathematics (NCTM) Lifetime Achievement Award.\n\n\n== Early life and education ==\nMaher received an Ed.D. in Mathematics Education (1972), M.Ed. in Education (1965) and B.A. (1962) from Rutgers University with a major in Mathematics Education and a minor in Statistics. \n\n\n== Career and research ==\nMaher worked as an elementary school teacher from 1962-1967 in the Matawan Regional School District, Augusta, and Scotch Plains. In 1992 she became a professor of mathematics education at Rutgers University and became the Distinguished Professor of Mathematics Education in 2007. Her work focuses on different studies of student’s mathematical reasoning and argumentation. Her work was inspired by her prior experience as a teacher and she wanted to understand how students learn 

In [None]:
facts