# Import Libraries

In [48]:
from qwikidata.sparql import return_sparql_query_results
from SPARQLWrapper import SPARQLWrapper, JSON
from pandas.io.parsers import ParserError
import pandas as pd
import sys
import requests
from requests.exceptions import SSLError

# Wikidata SPARQL Query Retrieval Functions

In [10]:
def Create_sparql_engine():
    """Build the SPARQL client used to query Wikidata.

    The client is a ``SPARQLWrapper`` pointed at the Wikidata Query Service
    endpoint. Its User-Agent header is ``WDQS-example Python/<major>.<minor>``,
    filled in from the version of the running interpreter.

    Returns:
        The configured ``SPARQLWrapper`` instance.
    """
    major, minor = sys.version_info[0], sys.version_info[1]
    agent_header = "WDQS-example Python/%s.%s" % (major, minor)
    return SPARQLWrapper("https://query.wikidata.org/sparql", agent=agent_header)

def get_wikidata_query(engine, query_string):
    """Run a SPARQL query and return its result bindings as a DataFrame.

    Args:
        engine: SPARQL client exposing ``setQuery``, ``setReturnFormat`` and
            ``query`` (for example the object returned by
            ``Create_sparql_engine``).
        query_string: The SPARQL query text. Queries can be drafted in the
            Wikidata SPARQL interface and pasted here.

    Returns:
        A pandas DataFrame built from ``results -> bindings`` of the JSON
        response. Nested keys are flattened into dotted column names such as
        ``<variable>.value``.
    """
    engine.setQuery(query_string)
    # Ask the endpoint for JSON so the response converts to plain dicts/lists.
    engine.setReturnFormat(JSON)
    response = engine.query().convert()
    # pd.io.json.json_normalize has been deprecated since pandas 1.0 in favour
    # of the top-level pd.json_normalize, which does the same flattening.
    return pd.json_normalize(response['results']['bindings'])


# Execute Wikidata Functions for Results

In [37]:
# SPARQL query to run against Wikidata. Replace it with your own query as needed.
# It selects up to 100 items matching `wdt:P31 wd:Q15416` together with their
# labels (the variable names indicate these are television programs).
# The label SERVICE clause is what populates ?television_programLabel.
query_string = """ SELECT ?television_program ?television_programLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?television_program wdt:P31 wd:Q15416.
}
LIMIT 100"""

# Build the SPARQL client once; it is passed to the query helper below.
engine= Create_sparql_engine()
# Execute the query; `results` is a pandas DataFrame of the returned bindings.
results = get_wikidata_query(engine, query_string)

# Wikimedia Rest API For Page Views

### Some Data Prep Guidelines

In [38]:
"""Now that we have our query results, we need to push them into the Wikimedia Page views API in order to get 
their views information. The logic goes like this: 
1) iterate through the items we retrieved. The original case is TV Programs, so I will iterate over each name of TV shows 
to pull their views
2) we collect the API results for each show in a list of JSON responses
3) then we extract the items from each JSON response and concatenate them into a pandas dataframe

You will notice that there are try and except blocks. This is because sometimes the information might not be available in the
database, either because the time range is not available for it, or because the page views info isn't yet available for that show.
These instances are rare, but if you are pulling a lot of data, you might want to handle them when they occur.

One opportunity for the script to fail is if it ingests a result that is a code for a wikipedia item, like: Q224123

Briefly, Wikidata assigns a unique ID code to everything in its database. Sometimes a query won't return a human-readable item, 
but instead the ID code. When this happens, it's likely that that specific Wikidata page has not been sufficiently filled out, 
so no information is retrievable from it. You can read more about those codes here: 
        https://www.wikidata.org/wiki/Wikidata:Glossary
        
If you type results.columns you will notice that you have some additional columns, such as television_program.type.
These are columns that describe the type of data for each row. I found them relatively useless, so I drop them
by selecting only the columns I want.

After that, I am renaming the columns I retrieved since they are not cute.
"""

# Keep only the human-readable label column and give it a friendlier name.
# .copy() detaches the selection from the original query frame, so the column
# assignment further down cannot trigger pandas' SettingWithCopyWarning.
results = results[['television_programLabel.value']].copy()
results.columns = ['tv_show_name']

# The page views API expects page titles with underscores in place of the
# spaces between words, so convert them before building request URLs.
# regex=False makes this a plain literal replacement; no pattern is needed.
results['tv_show_name'] = results['tv_show_name'].str.replace(' ', '_', regex=False)

In [47]:
# Distinct page titles (in order of first appearance) as a plain Python list
list_of_pages = results['tv_show_name'].unique().tolist()

# function will accept the list of pages, and pull the pages views for them, resulting in a pandas dataframe
def pull_pageviews_from_wikimedia(list_of_pages, start="20200601", end="20200615",
                                  project="en.wikipedia.org", timeout=30,
                                  user_agent=None):
    """Fetch daily page views for each page title and combine them.

    One request per title is sent to the Wikimedia REST page views endpoint
    (``per-article``, ``all-access``, ``user`` traffic, ``daily`` granularity).
    Titles whose request fails, whose response is not JSON, or whose response
    has no ``items`` entry are skipped, so one bad title does not abort the run.

    Args:
        list_of_pages: Iterable of page titles, with underscores instead of
            spaces.
        start: First day of the range, formatted ``YYYYMMDD``.
        end: Last day of the range, formatted ``YYYYMMDD``.
        project: Project domain placed in the URL, e.g. ``en.wikipedia.org``.
        timeout: Seconds to wait for each HTTP response before skipping it.
        user_agent: Optional User-Agent header value. When omitted, the
            ``requests`` default is sent, as before.

    Returns:
        A pandas DataFrame concatenating the ``items`` of every successful
        response (index reset), or an empty DataFrame when nothing could be
        retrieved.
    """
    # Imported locally so this cell does not depend on the imports cell changing.
    from urllib.parse import quote

    url_template = (
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
        "{project}/all-access/user/{title}/daily/{start}/{end}"
    )
    headers = {"User-Agent": user_agent} if user_agent else None

    frames = []
    for page in list_of_pages:
        # Percent-encode the title: characters such as "?", "/" or "#" would
        # otherwise be read as URL structure and request the wrong path.
        url = url_template.format(
            project=project,
            title=quote(str(page), safe=""),
            start=start,
            end=end,
        )
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            payload = response.json()
        except (requests.exceptions.RequestException, ValueError):
            # Network/SSL failure, timeout, or a body that is not valid JSON:
            # skip this title and keep going.
            continue
        # Error responses (e.g. no data for the page or range) carry no "items".
        items = payload.get("items") if isinstance(payload, dict) else None
        if items:
            frames.append(pd.DataFrame(items))

    # pd.concat raises ValueError on an empty list, so handle "no data" here.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0, ignore_index=True)

In [51]:
# Fetch page views for every title collected above; the result is one
# pandas DataFrame combining the data returned for each page.
page_views_df = pull_pageviews_from_wikimedia(list_of_pages)

# Conclusion

Once you have the resulting page views data frame, you can use it as you wish. In my own project, since I had pulled a lot of other columns from Wikidata SPARQL, I had to join the page views back to my SPARQL dataframe so as to have all the information in one place.