In [1]:
# Goal: extract data using the RTS archive API

import os
import json
import http.client
import numpy as np

# Root path under which the downloaded JSON pages are written; callers append
# a sub-path to it by plain string concatenation.
BASE_PATH ='data/RTS_dataset'
# Value sent verbatim as the 'Authorization' header of the OAuth token request.
# NOTE(review): this looks like a base64-encoded client credential committed in
# clear text -- consider loading it from the environment and rotating the key.
AUTHORIZATION = 'WWNNR3l4Wmh0UnY3bDNIc3R2QkhyTVo3eFVHWUVGYzE6MmplUjNpVHRranNNM2ZsWA=='
# Full URL of the broadcasts endpoint.
# NOTE(review): not referenced by the code visible in this notebook -- confirm
# whether it is still needed.
RESOURCE_URL = "https://api.srgssr.ch/rts-archives/v3/broadcasts"

In [2]:

def get_data(payload):
    """Fetch one page of broadcasts from the RTS archives API.

    An OAuth access token is requested with the module-level ``AUTHORIZATION``
    value, stored in ``payload['Authorization']`` as a bearer token, and the
    broadcasts endpoint is then queried. The token + query exchange is repeated
    until the decoded answer has no top-level ``'code'`` key (presumably the
    marker of an error response -- TODO confirm against the API documentation).

    Parameters
    ----------
    payload : dict
        Search parameters. ``query``, ``minPublicationDate``,
        ``maxPublicationDate`` and ``start`` are placed in the query string
        when present; the complete dict is also sent as the request headers.
        The dict is mutated in place (``'Authorization'`` is added/refreshed).

    Returns
    -------
    tuple
        ``(decode_data, payload)``: the JSON-decoded response body and the
        (mutated) payload.

    Raises
    ------
    KeyError
        If the token response does not contain ``'access_token'``.
    """
    # Local import so the block stays self-contained in the notebook.
    from urllib.parse import quote

    headers = {
        'Authorization': AUTHORIZATION,
        'Cache-Control': 'no-cache',
        'Content-Length': '0',
        'Postman-Token': '24264e32-2de0-f1e3-f3f8-eab014bb6d76'
    }
    token_url = "https://api.srgssr.ch/oauth/v1/accesstoken?grant_type=client_credentials"

    # Each parameter is paired with its own value (min with min, max with max)
    # and percent-encoded so that queries containing spaces or reserved
    # characters still form a valid request target. Parameters absent from the
    # payload are simply left out of the query string.
    url_parameters = ('query', 'minPublicationDate', 'maxPublicationDate', 'start')
    query_string = "&".join(
        "{}={}".format(name, quote(str(payload[name]), safe=""))
        for name in url_parameters
        if name in payload
    )
    url = "/rts-archives/v3/broadcasts/?" + query_string

    while True:
        conn = http.client.HTTPSConnection("api.srgssr.ch")
        try:
            # Step 1: obtain a fresh access token.
            conn.request("POST", token_url, "", headers)
            token_data = json.loads(conn.getresponse().read().decode("utf-8"))
            payload['Authorization'] = "Bearer " + token_data['access_token']

            # Step 2: query the broadcasts endpoint; the payload doubles as
            # the header set of the request.
            conn.request("GET", url, "", payload)
            decode_data = json.loads(conn.getresponse().read().decode("utf-8"))
        finally:
            # A new connection is opened on every attempt, so release it
            # instead of leaking one socket per retry.
            conn.close()

        if 'code' not in decode_data:
            break

    return decode_data, payload


def build_dict(query = '', minPublicationDate = '', maxPublicationDate = '',minDurationSec = '',
               maxDurationSec = '', mediaTypes = '', enumeratedFacets = '', publicationDateIntervalFacets = '',
               durationSecIntervalFacets = '', start = '', rows=''):
    """Assemble the request payload, keeping only the fields that carry a value.

    Every argument is stored under the key of the same name, alongside the
    fixed ``'accept'`` and ``'Postman-Token'`` entries. Entries whose value is
    falsy (empty string, ``0``, ``None`` ...) are dropped from the result.

    Parameters
    ----------
    start : str or int, optional
        Offset of the first result. Falls back to ``'0'`` when not given (or
        falsy), so the returned dict always contains a ``'start'`` entry.
    rows : str or int, optional
        Page size; omitted from the result when falsy.
    (all other parameters)
        Search filters, copied as-is under their own name.

    Returns
    -------
    dict
        The payload restricted to truthy values.
    """
    dict_ = {
             'accept': "application/json",
             'query' : query,
             'minPublicationDate' : minPublicationDate,
             'maxPublicationDate' : maxPublicationDate,
             'minDurationSec' : minDurationSec,
             'maxDurationSec' : maxDurationSec,
             'mediaTypes' : mediaTypes,
             'enumeratedFacets' : enumeratedFacets,
             'publicationDateIntervalFacets': publicationDateIntervalFacets,
             'durationSecIntervalFacets' : durationSecIntervalFacets,
             # Honour the caller's offset; '0' remains the default.
             'start' : start or '0',
             'Postman-Token' : '56128353-805e-4974-6689-5ef6d86e2d80',
             'rows' : rows}

    return {k: v for k,v in dict_.items() if v}

def request_data_and_write(payload, path=''):
    """Download every result page for ``payload`` and store each one as JSON.

    The first page is fetched with ``get_data``; its ``data['meta']['count']``
    gives the total number of documents. Nothing is written (and nothing is
    printed) when that count is not positive. Otherwise page ``n`` is saved as
    ``BASE_PATH + path + 'data_<n>.json'``, with ``payload['start']`` set to
    ``n * number_rows`` before each additional request.

    Parameters
    ----------
    payload : dict
        Request payload as produced by ``build_dict``; mutated in place
        (``'start'`` is updated, and ``get_data`` adds ``'Authorization'``).
    path : str, optional
        Sub-path appended to ``BASE_PATH`` by plain string concatenation, so
        it should end with a path separator when it names a directory.
    """
    data, payload = get_data(payload)
    number_documents = data['meta']['count']

    if number_documents <= 0:
        return

    print("number of documents:{}".format(number_documents))

    # Page size: 25 unless the payload carries a usable positive 'rows' value.
    # NOTE(review): assumes the API returns this many documents per page --
    # confirm against the API documentation.
    number_rows = 25
    try:
        # 'rows' may arrive as a string; compare it numerically.
        requested_rows = int(payload.get('rows', 0))
    except (TypeError, ValueError):
        requested_rows = 0
    if requested_rows > 0:
        number_rows = requested_rows

    # Ceiling division: a partially filled last page still needs its own
    # request (e.g. 137 documents at 25 per page -> 6 requests, not 5).
    number_requests = (number_documents + number_rows - 1) // number_rows

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(BASE_PATH + path, exist_ok=True)

    with open(BASE_PATH + path + 'data_0.json', 'w') as outfile:
        json.dump(data, outfile)

    for n in range(1, number_requests):
        # Move the offset to the beginning of page n.
        payload['start'] = n * number_rows

        data = get_data(payload)[0]

        with open(BASE_PATH + path + 'data_{}.json'.format(n), 'w') as outfile:
            json.dump(data, outfile)
            
def request_data_per_year_and_party(year, party):
    """Fetch and store all result pages for one party within a single year.

    ``year`` is used as both the minimum and the maximum publication date and
    ``party`` as the search query; the pages are written below the
    ``'_per_year/<year>/<party>/'`` sub-path.
    """
    print('requesting for year:{} and party:{}'.format(year, party))

    year_text = '{}'.format(year)
    party_text = '{}'.format(party)
    output_subpath = '_per_year/{}/{}/'.format(year, party)

    search_payload = build_dict(
        query=party_text,
        minPublicationDate=year_text,
        maxPublicationDate=year_text,
    )
    request_data_and_write(search_payload, output_subpath)
    

# Party abbreviations used as the search query, and the years to cover
# (np.arange excludes its stop value, so 2012 through 2019).
parties = ['UDC', 'PDC', 'PS', 'PLR', 'PES', 'PVL']
years = np.arange(2012, 2020)

# One download per (year, party) pair: years vary slowest, parties fastest.
for year, party in ((y, p) for y in years for p in parties):
    request_data_per_year_and_party(year, party)

requesting for year:2012 and party:UDC
number of documents:325
requesting for year:2012 and party:PDC
number of documents:213
requesting for year:2012 and party:PS
number of documents:217
requesting for year:2012 and party:PLR
number of documents:280
requesting for year:2012 and party:PES
number of documents:5
requesting for year:2012 and party:PVL
requesting for year:2013 and party:UDC
number of documents:287
requesting for year:2013 and party:PDC
number of documents:199
requesting for year:2013 and party:PS
number of documents:158
requesting for year:2013 and party:PLR
number of documents:257
requesting for year:2013 and party:PES
number of documents:6
requesting for year:2013 and party:PVL
number of documents:3
requesting for year:2014 and party:UDC
number of documents:213
requesting for year:2014 and party:PDC
number of documents:104
requesting for year:2014 and party:PS
number of documents:67
requesting for year:2014 and party:PLR
number of documents:137
requesting for year:2014 a