In [1]:
import rdflib
from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
import networkx as nx
import matplotlib.pyplot as plt

import requests
import json

In [27]:
import pandas as pd 
import networkx as nx
import nx_altair as nxa
import altair as alt
from tqdm.notebook import tqdm

In [21]:
ask_query = """ASK {
      wd:Q34201 p:P31 ?statement0.
      ?statement0 (ps:P31/(wdt:P279*)) wd:Q215627.
 }"""


endpoint_url = "https://query.wikidata.org/sparql"
headers = {'User-Agent': 'MyBot'}
payload = {
        'query': ask_query,
        'format': 'json'
    }
r = requests.get(endpoint_url, params=payload, headers=headers)

In [24]:
r.status_code #if status_code is 400 then error in formatting the request

200

In [26]:
r.text

'{\n  "head" : { },\n  "boolean" : true\n}'

In [31]:
r.json()['boolean']

True

# defining the classifyer
technique is different for human and non human 

In [1]:
def is_person(wiki_code, with_wd=False):
    """from the wikipedia code returns whether the entity is a person or not"""
    if with_wd:
        code = wiki_code
    else : code = "wd:"+wiki_code
    ask_query = "ASK {"+ code +" wdt:P31 wd:Q5.}"
    endpoint_url = "https://query.wikidata.org/sparql"
    headers = {'User-Agent': 'MyBot'}
    payload = {
            'query': ask_query,
            'format': 'json'
        }
    r = requests.get(endpoint_url, params=payload, headers=headers)
    return r.json()['boolean']

In [3]:
is_person("Q5383") #test sur david bowie

True

# identifying the interesting elements
has been done by finding the right terms
- person wd:Q215627
- geographic entity wd:Q27096213
- string wd:Q184754
- temporal entity wd:Q26907166
- group of humans wd:Q16334295
- wikipedia related
- other

## classifying the types of people

In [20]:
def is_instanceof_subclass(wiki_code_data, wiki_code_class):
    ask_query = "ASK {"+ wiki_code_data +" p:P31 ?statement0. ?statement0 (ps:P31/(wdt:P279*)) "+ wiki_code_class +".}"
    endpoint_url = "https://query.wikidata.org/sparql"
    headers = {'User-Agent': 'MyBot'}
    payload = {
            'query': ask_query,
            'format': 'json'
        }
    r = requests.get(endpoint_url, params=payload, headers=headers)
    return r.json()['boolean']

def is_geographic_entity(wiki_code_data, with_wd=False):
    if with_wd:
        code = wiki_code
    else : code = "wd:"+wiki_code
    return is_instanceof_subclass(code, "wd:Q27096213")


def is_temporal_entity(wiki_code_data, with_wd=False):
    if with_wd:
        code = wiki_code
    else : code = "wd:"+wiki_code
    return is_instanceof_subclass(code, "wd:Q26907166")

def is_group_of_human(wiki_code_data, with_wd=False):
    if with_wd:
        code = wiki_code
    else : code = "wd:"+wiki_code
    return is_instanceof_subclass(code, "wd:Q16334295")


### putting away the wikipedia and other links

In [25]:
def is_wikidata(iri):
    caract = "http://www.wikidata.org/"
    taille = len(caract)
    return iri[:taille] == caract

In [24]:
is_wikidata("https://www.wikipedia.org/wiki/Q16334295")

False

## Getting the wikipedia code
how to deal with iri like : https://www.wikidata.org/wiki/Q65052335#Q65052335$ABF63D2E-C066-4DE7-94C2-F846B66BCB37

In [None]:
def get_wiki_code(iri):
    #TODO
    # split to check if there is entity in the name 
    # get the part "Q..."
    return None 

## Applying it on several elements

In [29]:
class_value_to_type = ["not wikidata", "person", "geographic entity", "temporal entity", "group of humans", "other"]
def get_type_from_value(n):
    return class_value_to_type[n]

def create_classified_nodes(list_of_nodes):
    """ relations 
        0: not wikidata
        1: person 
        2 : geographic entity
        3 : temporal entity
        4 : group of human
        5 : other"""
    node_class = []
    for node in tqdm(list_of_nodes):
        value = 0
        if is_wikidata(node):
            #get the wikicode
            node_code = get_wiki_code(iri)
            found = False
            for num, funct in enumerate([is_person, is_geographic_entity, is_temporal_entity, is_group_of_human]):
                if funct(code_node):
                    value = num + 1
                    found = True
                    break
            if not found: 
                value = 5
        node_class.append(value)
    return node_class

# Testing on a smaller graph

In [39]:
PATH = "collecting_data_with_SPARQL/graph_to_queer_people_dist1.csv"
df = pd.read_csv(PATH)
df2 = df[:100]

### testing with only the human

In [40]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   subject    100 non-null    object 
 1   predicate  100 non-null    object 
 2   object     100 non-null    object 
 3   context    0 non-null      float64
dtypes: float64(1), object(3)
memory usage: 3.2+ KB


# Testing 

In [4]:
file_path = "collecting_data_with_SPARQL/star_graph_gay_men.csv"
df = pd.read_csv(file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58443 entries, 0 to 58442
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   subject    58443 non-null  object 
 1   predicate  58443 non-null  object 
 2   object     58443 non-null  object 
 3   context    0 non-null      float64
dtypes: float64(1), object(3)
memory usage: 1.8+ MB


In [10]:
nxgraph = nx.from_pandas_edgelist(df, "subject", "object")

In [11]:
people_list = df["subject"].unique()
len(people_list)

475

In [12]:
object_list = df["object"].unique()
len(object_list)

37999

In [30]:
object_types = create_classified_nodes(object_list)

  0%|          | 0/37999 [00:00<?, ?it/s]

NameError: name 'get_wiki_code' is not defined

In [22]:
len(object_types)

37999

### easier version with only wiki data and person

In [31]:
object_types_simple = []
for o in tqdm(object_list):
    if not is_wikidata(o):
        object_types_simple.append("not wikidata")
    else :
        object_types_simple.append("wikidata entity")

  0%|          | 0/37999 [00:00<?, ?it/s]

In [32]:
people_list_type = ["human"] * len(people_list)

In [37]:
nodes_list = [*people_list, *object_list]

In [38]:
df_nodes = pd.DataFrame(data=nodes_list, columns=["node"])
df_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38474 entries, 0 to 38473
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node    38474 non-null  object
dtypes: object(1)
memory usage: 300.7+ KB


In [39]:
df_nodes["type"] = [*people_list_type, *object_types_simple]
df_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38474 entries, 0 to 38473
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node    38474 non-null  object
 1   type    38474 non-null  object
dtypes: object(2)
memory usage: 601.3+ KB


In [41]:
df_nodes.to_csv("collecting_data_with_SPARQL/graph_gay_men_nodes_simple.csv")

In [45]:
df_nodes["type"].value_counts()

wikidata entity    20093
not wikidata       17906
human                475
Name: type, dtype: int64