In [None]:
# Data handling and plotting libraries used throughout the notebook.
import pandas as pd
import numpy as np
import altair as alt
# Select Altair's "mimetype" renderer for chart output.
alt.renderers.enable("mimetype")
# max_rows=None removes the row cap of the default data transformer.
alt.data_transformers.enable('default', max_rows=None)
from tqdm.notebook import tqdm
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from pandas/altair; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the cleaned JSTOR titles table (with inferred wiki matches, per the
# file name) from the project's data directory.
inferred_wiki_global_txt_file = pd.read_csv("../data/original_journal_datasets/jstor/cleaned_jstor_titles_inferred_wiki.csv")

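Planned workflow:

- Query Wikidata for the item matching each JSTOR journal ID (P1230), together with all of its properties
- Resolve the Wikidata values of properties whose values are URIs
- Get the sitelinks for each item from Wikidata
- Get the Wikipedia page behind each sitelink
- Scrape the Wikipedia page for the publication in multiple languages, using [Wikipedia-API](https://github.com/martin-majlis/Wikipedia-API)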

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import wikidata
import pandas as pd

In [None]:
# Function to create a SPARQL query for a batch of JSTOR IDs
def create_batch_query(jstor_ids):
    """Build a SPARQL query that looks up Wikidata items by JSTOR journal ID.

    Parameters
    ----------
    jstor_ids : iterable
        JSTOR journal identifiers; each one is matched against ``wdt:P1230``.

    Returns
    -------
    str
        A SPARQL SELECT query returning ``?jstorID``, ``?item``,
        ``?itemLabel``, ``?article`` and ``?publicationInterval``.
    """
    def _values_row(value):
        # Escape characters that would otherwise terminate or corrupt the
        # quoted SPARQL string literal (backslash first, so the escapes added
        # afterwards are not themselves doubled).
        text = (
            str(value)
            .replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
            .replace('\r', '\\r')
        )
        return f'("{text}")'

    # Create a VALUES clause with all the JSTOR IDs
    values_clause = ' '.join(_values_row(jstor_id) for jstor_id in jstor_ids)
    # The publication interval and the English Wikipedia article live in
    # separate OPTIONAL groups: inside a single group both patterns would have
    # to match, so an item lacking either one would lose the other as well.
    sparql_query = f"""
    SELECT ?jstorID ?item ?itemLabel ?article ?publicationInterval WHERE {{
      VALUES (?jstorID) {{ {values_clause} }}
      ?item wdt:P1230 ?jstorID.
      OPTIONAL {{ ?item wdt:P2896 ?publicationInterval. }}
      OPTIONAL {{
        ?article schema:about ?item;
                 schema:isPartOf <https://en.wikipedia.org/>.
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    return sparql_query

In [None]:
# Build a query for a single JSTOR ID as a small sample.
jstor_ids = ["amerhistrevi"]
query = create_batch_query(jstor_ids)

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Run `query` against the Wikidata SPARQL endpoint via SPARQLWrapper.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery(query)
# Request JSON so that convert() below decodes the response accordingly.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

In [None]:
from wikidata.client import Client

def fetch_wikidata(qid):
    """Return the Wikidata entity for ``qid``.

    A new ``Client`` is created on every call and the entity is requested
    with ``load=True``.
    """
    return Client().get(qid, load=True)

# Fetch one entity by its QID to use as a sample in the following cells.
test_entity = fetch_wikidata('Q389936')

In [None]:
# Inspect the sample entity's sitelinks, flattened with json_normalize.
pd.json_normalize(test_entity.data['sitelinks'])

In [None]:
def get_sitelinks(entity):
    """Return ``entity.data['sitelinks']`` as a transposed DataFrame.

    The sitelinks mapping is handed to ``pd.DataFrame`` and then transposed,
    so each top-level key of the mapping ends up as one row of the result.
    """
    by_site = pd.DataFrame(entity.data['sitelinks'])
    return by_site.transpose()

In [None]:
def create_batch_query(jstor_ids, property_ids):
    """Build a SPARQL query fetching selected properties for JSTOR journals.

    Parameters
    ----------
    jstor_ids : iterable
        JSTOR journal identifiers; each one is matched against ``wdt:P1230``.
    property_ids : iterable
        Wikidata property IDs such as ``"P407"``. Each becomes its own
        ``OPTIONAL`` pattern and a result variable named after the property
        (``?P407``). Surrounding whitespace is stripped.

    Returns
    -------
    str
        A SPARQL SELECT query returning ``?jstorID``, ``?item``,
        ``?itemLabel``, one variable per property, and ``?article``.

    Raises
    ------
    ValueError
        If a property ID is not ``P`` followed by ASCII digits. The ID is
        spliced into the query as both a predicate and a variable name, so
        anything else would produce a malformed query.
    """
    def _values_row(value):
        # Escape characters that would otherwise terminate or corrupt the
        # quoted SPARQL string literal (backslash first, so the escapes added
        # afterwards are not themselves doubled).
        text = (
            str(value)
            .replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
            .replace('\r', '\\r')
        )
        return f'("{text}")'

    checked_ids = []
    for property_id in property_ids:
        pid = str(property_id).strip()
        digits = pid[1:]
        if not (pid.startswith('P') and digits.isascii() and digits.isdigit()):
            raise ValueError(f"Invalid Wikidata property ID: {property_id!r}")
        checked_ids.append(pid)

    # Create a VALUES clause with all the JSTOR IDs
    values_clause = ' '.join(_values_row(jstor_id) for jstor_id in jstor_ids)
    # Create a part of the WHERE clause for each property ID
    where_clause = ' '.join(f'OPTIONAL {{ ?item wdt:{pid} ?{pid} . }}' for pid in checked_ids)
    select_vars = ' '.join(f'?{pid}' for pid in checked_ids)
    sparql_query = f"""
    SELECT ?jstorID ?item ?itemLabel {select_vars} ?article WHERE {{
      VALUES (?jstorID) {{ {values_clause} }}
      ?item wdt:P1230 ?jstorID.
      {where_clause}
      OPTIONAL {{
        ?article schema:about ?item;
                 schema:isPartOf <https://en.wikipedia.org/>.
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    return sparql_query

In [None]:
# Read the list of Wikidata property IDs to request from the fields CSV,
# then build the property query for a single sample JSTOR ID.
wiki_data_fields_df = pd.read_csv("../data/original_journal_datasets/wiki/wikidata_fields.csv")
properties = wiki_data_fields_df['property_id'].tolist()
sparql_query = create_batch_query(['amerhistrevi'], properties)

In [None]:
import requests
import time
def query_wikidata(sparql_query, max_retries=5, retry_delay=10, timeout=90):
    """Run a SPARQL query against the Wikidata endpoint and return the JSON.

    Parameters
    ----------
    sparql_query : str
        The SPARQL query text.
    max_retries : int, optional
        How many times to retry after the first failed attempt.
    retry_delay : float, optional
        Seconds to wait between attempts.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    dict or None
        The decoded JSON response, or ``None`` if the request failed with a
        non-retryable client error or every attempt failed.
    """
    url = 'https://query.wikidata.org/sparql'
    headers = {
        # NOTE(review): placeholder User-Agent; replace with this project's
        # tool name and contact details.
        'User-Agent': 'Mozilla/5.0 (compatible; YourTool/0.1; +http://YourWebSite.com/Bot)',
        'Accept': 'application/sparql-results+json'
    }
    attempts = max(1, max_retries + 1)
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(
                url,
                headers=headers,
                params={'query': sparql_query},
                timeout=timeout,
            )
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.json()
        except requests.exceptions.HTTPError as err:
            print(f"HTTP error: {err}")
            status = err.response.status_code if err.response is not None else None
            # A 4xx other than 429 (rate limiting) means the request itself is
            # at fault, e.g. a malformed query; resending it cannot succeed.
            if status is not None and 400 <= status < 500 and status != 429:
                return None
        except requests.exceptions.RequestException as err:
            print(f"Request error: {err}")
        if attempt < attempts:
            time.sleep(retry_delay)  # Back off before the next attempt
    print(f"Giving up after {attempts} attempts")
    return None

In [None]:
# Execute the property query; `results` is the decoded JSON, or None when
# query_wikidata gives up on a request error.
results = query_wikidata(sparql_query)

In [None]:
# Flatten the SPARQL result bindings into a DataFrame (one row per binding).
test_results = pd.json_normalize(results['results']['bindings'])
# Keep only the columns that take more than one distinct value across rows.
columns_with_more_than_one_unique_value = test_results.columns[test_results.nunique() > 1].tolist()
test_results[columns_with_more_than_one_unique_value]

In [None]:
# Columns carrying the type of each result variable.
type_columns = [name for name in test_results.columns.tolist() if '.type' in name]

# Of those, the ones where at least one row is typed "uri".
uri_columns = [
    name for name in type_columns
    if (test_results[name] == "uri").any()
]
# The matching value columns for the URI-typed variables.
value_columns = [name.replace('.type', '.value') for name in uri_columns]

In [None]:
# Show the distinct type/value combinations for the URI-typed variables.
test_results[uri_columns + value_columns].drop_duplicates()