# RTS Data Extraction

In [2]:
import os
import json
import http.client
import numpy as np

# Prefix of every output location; callers append a suffix such as
# '_per_year/<year>/<query>/' directly to this string.
BASE_PATH = 'data/RTS_dataset'

# Value sent as the 'Authorization' header when requesting an OAuth token.
# NOTE(review): this credential is committed in the source; consider loading
# it from the environment or a local, untracked config file instead.
AUTHORIZATION = 'WWNNR3l4Wmh0UnY3bDNIc3R2QkhyTVo3eFVHWUVGYzE6MmplUjNpVHRranNNM2ZsWA=='

# Endpoint of the broadcasts resource.
# NOTE(review): not referenced by the code visible in this notebook.
RESOURCE_URL = 'https://api.srgssr.ch/rts-archives/v3/broadcasts'

The RTS API uses OAuth so that a client does not have to disclose its id and secret with every request. The OAuth service acts as an intermediary between the client and the resource server: it grants permission in the form of a temporary access token, which must be used in the same session in which the authentication took place.

In [5]:

def get_data(payload):
    """Fetch one page of broadcasts from the RTS archives API.

    Every attempt first requests a fresh OAuth access token and then sends
    the search request on the same connection. Attempts are repeated until
    the decoded JSON response has no top-level ``'code'`` key, which this
    function treats as the error marker.

    Args:
        payload: dict of request fields. The entries ``query``,
            ``minPublicationDate``, ``maxPublicationDate`` and ``start``
            are placed in the URL query string when present. The whole
            dict is additionally sent as the HTTP headers of the search
            request. The dict is modified in place: its
            ``'Authorization'`` entry is set to the bearer token.

    Returns:
        A tuple ``(decode_data, payload)`` with the decoded JSON response
        and the (mutated) payload.
    """
    # Imported locally so the function does not rely on an extra
    # module-level import.
    from urllib.parse import urlencode

    headers = {
        'Authorization': AUTHORIZATION,
        'Cache-Control': 'no-cache',
        'Content-Length': '0',
        'Postman-Token': '24264e32-2de0-f1e3-f3f8-eab014bb6d76',
    }
    token_url = "https://api.srgssr.ch/oauth/v1/accesstoken?grant_type=client_credentials"

    # Payload entries that belong in the URL, in the order they are emitted.
    url_fields = ('query', 'minPublicationDate', 'maxPublicationDate', 'start')

    # NOTE(review): retries are immediate and unbounded; a persistent API
    # error keeps this loop running forever.
    while True:
        conn = http.client.HTTPSConnection("api.srgssr.ch")
        try:
            # Step 1: obtain a temporary access token.
            conn.request("POST", token_url, "", headers)
            token_data = json.loads(conn.getresponse().read().decode("utf-8"))
            payload['Authorization'] = "Bearer " + token_data['access_token']

            # Step 2: run the search. Each value is paired with its own
            # parameter name (min with min, max with max) and URL-encoded
            # so that spaces or accented characters cannot break the
            # request line.
            params = {key: payload[key] for key in url_fields if key in payload}
            url = "/rts-archives/v3/broadcasts/?" + urlencode(params)

            conn.request("GET", url, "", payload)
            decode_data = json.loads(conn.getresponse().read().decode("utf-8"))
        finally:
            # Release the socket on every attempt, including failed ones.
            conn.close()

        if 'code' not in decode_data:
            break

    return decode_data, payload


def build_dict(query='', minPublicationDate='', maxPublicationDate='', minDurationSec='',
               maxDurationSec='', mediaTypes='', enumeratedFacets='', publicationDateIntervalFacets='',
               durationSecIntervalFacets='', start='', rows=''):
    """Build the request payload for a broadcasts search.

    Each keyword argument is stored under a key of the same name, together
    with the fixed ``accept`` and ``Postman-Token`` entries. Entries whose
    value is falsy (for example the default ``''``) are left out of the
    result.

    Args:
        query: search term.
        minPublicationDate, maxPublicationDate: publication date bounds.
        minDurationSec, maxDurationSec: duration bounds.
        mediaTypes, enumeratedFacets, publicationDateIntervalFacets,
            durationSecIntervalFacets: passed through unchanged.
        start: offset of the first result; ``'0'`` is used when it is
            empty or zero.
        rows: page size; omitted when empty.

    Returns:
        dict containing only the non-empty entries.
    """
    fields = {
        'accept': "application/json",
        'query': query,
        'minPublicationDate': minPublicationDate,
        'maxPublicationDate': maxPublicationDate,
        'minDurationSec': minDurationSec,
        'maxDurationSec': maxDurationSec,
        'mediaTypes': mediaTypes,
        'enumeratedFacets': enumeratedFacets,
        'publicationDateIntervalFacets': publicationDateIntervalFacets,
        'durationSecIntervalFacets': durationSecIntervalFacets,
        # Honour the caller's offset instead of always overwriting it;
        # '0' remains the fallback so the key is never filtered out.
        'start': start or '0',
        'Postman-Token': '56128353-805e-4974-6689-5ef6d86e2d80',
        'rows': rows,
    }

    return {key: value for key, value in fields.items() if value}

def request_data_and_write(payload, path=''):
    """Download every result page for ``payload`` and store each as JSON.

    The first page is fetched to read the total document count from
    ``data['meta']['count']``. Nothing is written when that count is not
    positive. Otherwise page ``n`` is written to
    ``BASE_PATH + path + 'data_<n>.json'``, with ``payload['start']``
    advanced by one page size per request.

    Args:
        payload: request dict as produced by ``build_dict``; its
            ``'start'`` entry is updated in place while paging.
        path: suffix appended directly to ``BASE_PATH`` to form the output
            location (it should end with a path separator).
    """
    data, payload = get_data(payload)
    number_documents = data['meta']['count']

    if number_documents <= 0:
        return

    print("number of documents:{}".format(number_documents))

    # Page size: 25 unless the payload carries a positive 'rows' value.
    # int() lets a string such as '10' be compared and multiplied safely.
    # NOTE(review): get_data does not put 'rows' in the URL query string;
    # confirm the API honours it before relying on a custom page size.
    number_rows = 25
    requested_rows = int(payload.get('rows') or 0)
    if requested_rows > 0:
        number_rows = requested_rows

    # Ceiling division: a partially filled last page still needs a request.
    # Rounding to nearest dropped it whenever the remainder was small
    # (e.g. 210 documents at 25 per page gave 8 pages instead of 9).
    number_requests = -(-number_documents // number_rows)

    output_prefix = BASE_PATH + path
    os.makedirs(output_prefix, exist_ok=True)

    # The page fetched above is page 0.
    with open(output_prefix + 'data_0.json', 'w') as outfile:
        json.dump(data, outfile)

    for n in range(1, number_requests):
        # Move the offset to the beginning of page n.
        payload['start'] = n * number_rows

        data = get_data(payload)[0]

        with open(output_prefix + 'data_{}.json'.format(n), 'w') as outfile:
            json.dump(data, outfile)
            
def request_data_per_year_and_party(year, query):
    """Download all broadcasts matching ``query`` published in ``year``.

    The year is used as both the minimum and the maximum publication date,
    and the results are written below ``_per_year/<year>/<query>/``.
    """
    print('requesting for year:{} and party:{}'.format(year, query))

    year_text = '{}'.format(year)
    payload = build_dict(
        query='{}'.format(query),
        minPublicationDate=year_text,
        maxPublicationDate=year_text,
    )

    target = '_per_year/{}/{}/'.format(year, query)
    request_data_and_write(payload, target)
    

# Party abbreviations used as search terms.
queries = ['UDC', 'PDC', 'PS', 'PLR', 'PES', 'PVL']

# After the initial run it proved difficult to find PES and PVL data, because
# these parties are normally referred to as "les Verts" or "les Verts
# libéraux". Everything matching "Verts" is therefore gathered too and will be
# filtered later (irrelevant, PES or PVL). 'Politique' is queried as well.
queries.extend(['Verts', 'Politique'])

years = np.arange(2012, 2020)  # 2012 through 2019 inclusive
for year in years:
    for query in queries:
        request_data_per_year_and_party(year, query)

requesting for year:2012 and party:UDC
number of documents:325
requesting for year:2012 and party:PDC
number of documents:213
requesting for year:2012 and party:PS
number of documents:217
requesting for year:2012 and party:PLR
number of documents:280
requesting for year:2012 and party:PES
number of documents:5
requesting for year:2012 and party:PVL
requesting for year:2012 and party:Verts
number of documents:210
requesting for year:2012 and party:Politique
number of documents:1045
requesting for year:2013 and party:UDC
number of documents:287
requesting for year:2013 and party:PDC
number of documents:199
requesting for year:2013 and party:PS
number of documents:158
requesting for year:2013 and party:PLR
number of documents:257
requesting for year:2013 and party:PES
number of documents:6
requesting for year:2013 and party:PVL
number of documents:3
requesting for year:2013 and party:Verts
number of documents:179
requesting for year:2013 and party:Politique
number of documents:969
request

For each year from 2012 to 2019, and for each of the parties listed, the RTS archives API was queried for any items relevant to the party name. The generic term "Verts" was queried as well: RTS does not normally refer to PES and PVL by their abbreviations, so these results will be used later to supplement the lists for those two parties.

The results of these queries were returned in batches of 25 documents. Each batch was stored as a JSON file in the `RTS_dataset_per_year` folder, organised first by year and then by query.