# RTS Data Preprocessing

In [26]:
import json
import tokenize
import numpy as np
import pandas as pd
import os
from itertools import islice

BASE_PATH ='data/RTS_dataset_per_year/{}/{}'
parties = ['UDC', 'PDC', 'PS', 'PLR', 'PES', 'PVL']
YEARS = np.arange(2012, 2020)

In [3]:
#Next, we want to go year by year and find interviews that are with interviews of political parties
    
def read_in(party):
    party_dict =[]
    
    for year in YEARS:
        if os.path.exists(BASE_PATH.format(year, party)):
            files = [f for f in os.listdir(BASE_PATH.format(year, party))]
            for file in files:
                with open(BASE_PATH.format(year, party)+"/{}".format(file), 'r') as fp:
                    dicts = json.load(fp)
                    for entry in dicts['data']:
                        if 'genres' in entry and 'speechToTexts' in entry:
                            if 'Interview' in entry['genres']:
                                party_dict.append(entry)
    return party_dict

#For each party, convert the json into dictionaries

UDC_dictionary = read_in('UDC')
PDC_dictionary = read_in('PDC')
PS_dictionary = read_in('PS')
PLR_dictionary = read_in('PLR')
PES_dictionary = read_in('PES')
PVL_dictionary = read_in('PVL')

Above, the json created by querying the RTS archive API for each of the party abbreviations was created into lists of dictionaries. However, it was noted that RTS does not often use the abbreviations for the PES and PVL parties and prefers to use the Verts and Verts libéreaux. Another query was made using the key word "verts" to find more broadcasts for these parties. Below these broadcasts were sorted into either verts or verts libéraux

In [4]:
#Below looks and sees if libéraux is in the title of an entry, if it is then the entry is sorted to the PVL dictionary

for year in YEARS:
    if os.path.exists(BASE_PATH.format(year, 'verts')):
            files = [f for f in os.listdir(BASE_PATH.format(year, 'verts'))]
            for file in files:
                with open(BASE_PATH.format(year, 'verts')+"/{}".format(file), 'r') as fp:
                    dicts = json.load(fp)
                    for entry in dicts['data']:
                        if 'title' in entry and 'genres' in entry and 'speechToTexts' in entry:
                            if 'Interview' in entry['genres']:
                                if 'libéraux' in entry['title']:
                                    PVL_dictionary.append(entry)
                                else:
                                    PES_dictionary.append(entry)


SyntaxError: invalid syntax (<ipython-input-4-e1d0134a780e>, line 11)

After reading in the lists of dictionaries, each dictionary is converted to a pandas dataframe. During on our analysis we will look at either the "speechToTexts" or "summary" collumns in order to do some NLP to determine what the key issues each party were discussing and potentially their sentiment about these topics.

In [5]:
UDC_pandas = pd.DataFrame.from_dict(UDC_dictionary)
PDC_pandas = pd.DataFrame.from_dict(PDC_dictionary)
PS_pandas = pd.DataFrame.from_dict(PS_dictionary)
PLR_pandas = pd.DataFrame.from_dict(PLR_dictionary)
PES_pandas = pd.DataFrame.from_dict(PES_dictionary)
PVL_pandas = pd.DataFrame.from_dict(PVL_dictionary)

This next part is for processing the SpeechtoText portions of the interviews, first we analyse what percentage of our desired groups in fact have the SpeechtoText feature

In [10]:
party_pandas = [UDC_pandas, PDC_pandas, PS_pandas, PLR_pandas, PES_pandas, PVL_pandas]

In [21]:
year = UDC_pandas["publicationDate"].str[:4]

In [23]:
UDC_pandas['year'] = year


Unnamed: 0,id,internalArchiveId,source,mediaType,title,publicationDate,summary,durationSec,genres,publicationChannelNames,...,program,contributors,remarks,mediaURL,imageURL,alternativeTitle,alternativeSummary,variantIds,webPublicationDate,year
0,sound-F8545377-80CC-49E4-9DFE-AA908CEC7079,F8545377-80CC-49E4-9DFE-AA908CEC7079,siranau,audio,"En Suisse, l'UDC maintient la pression concern...",2012-12-28T00:00:00Z,L'UDC a déposé sa deuxième initiative sur le s...,111,[Interview],[La Première],...,{'href': '/programs/D2A056F3-7C48-4C24-8B59-E5...,"[{'function': 'Intervieweur/euse', 'name': 'Sc...",,,,,,,,2012
1,sound-689E65AE-DBF9-4802-91BD-389B8B6B5E9A,689E65AE-DBF9-4802-91BD-389B8B6B5E9A,siranau,audio,Le Conseil Fédéral dit oui à l'extension de la...,2012-12-07T00:00:00Z,"L’initiative populaire de l’UDC ""contre l’immi...",877,[Interview],[La Première],...,{'href': '/programs/B525B1B7-2D98-4D8B-BED4-B5...,"[{'function': 'Interviewé/e', 'name': 'Freysin...",,,,,,,,2012
2,sound-C28F8DFC-7997-4FEE-A02C-6B7F2444D370,C28F8DFC-7997-4FEE-A02C-6B7F2444D370,siranau,audio,Ueli Maurer a été élu président de la Confédér...,2012-12-05T00:00:00Z,Ueli Maurer sera bel et bien président de la C...,193,"[Commentaire, Interview]",[La Première],...,{'href': '/programs/D2A056F3-7C48-4C24-8B59-E5...,"[{'function': 'Interviewé/e', 'name': 'Leuenbe...",,,,,,,,2012
3,sound-A168F9AC-3251-4553-94CB-BED8D074021A,A168F9AC-3251-4553-94CB-BED8D074021A,siranau,audio,Christophe Darbellay propose que l'on revote s...,2012-11-28T00:00:00Z,Le Tages Anzeiger online révèle que le préside...,636,"[Commentaire, Interview]",[La Première],...,{'href': '/programs/B525B1B7-2D98-4D8B-BED4-B5...,"[{'function': 'Présentateur/trice', 'name': 'H...",,,,,,,,2012
4,sound-3407BA82-67A7-48FA-8823-12BF40ACB4B5,3407BA82-67A7-48FA-8823-12BF40ACB4B5,siranau,audio,La loi sur les épizooties a été acceptée par l...,2012-11-25T00:00:00Z,La Confédération pourra mieux coordonner les e...,890,"[Commentaire, Interview]",[La Première],...,{'href': '/programs/B525B1B7-2D98-4D8B-BED4-B5...,"[{'function': 'Interviewé/e', 'name': 'Darbell...",,,,,,,,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,sound-DB9FF925-C1B9-40CD-808F-C56CC2EFD5F8,DB9FF925-C1B9-40CD-808F-C56CC2EFD5F8,siranau,audio,"Le 17 mars 2019, les vaudois élisent le ou la ...",2019-01-10T00:00:00Z,Le POP a lancé Anaïs Timofte dans la course à ...,417,[Interview],[La Première],...,{'href': '/programs/B525B1B7-2D98-4D8B-BED4-B5...,"[{'function': 'Intervieweur/euse', 'name': 'Bu...",CATINFO_JAN2019,//www.rts.ch/play/radio/redirect/detail/10110121,http://www.rts.ch/10549432.image,[Succession de Pierre-Yves Maillard: interview...,[Le POP a lancé Anaïs Timofte dans la course à...,[ece-10110121],2019-01-10T17:59:04Z,2019
461,sound-E18D4E51-8E05-47BC-AC58-C36236504F08,E18D4E51-8E05-47BC-AC58-C36236504F08,siranau,audio,Christian Levrat tire un bilan décevant de la ...,2019-01-07T00:00:00Z,"Christian Levrat, président du PS suisse\r\n""C...",116,"[Commentaire, Interview]",[La Première],...,{'href': '/programs/4A9A4C9A-B7B5-4EBD-BB17-E3...,"[{'function': 'Intervieweur/euse', 'name': 'Ba...",CATINFO_JAN2019,//www.rts.ch/play/radio/redirect/detail/10105555,http://www.rts.ch/7817196.image,[Christian Levrat tire un bilan décevant des t...,"[""Cette législature a été une débâcle"", selon ...",[ece-10105555],2019-01-07T11:41:01Z,2019
462,sound-F9D2CC92-E425-46DD-880E-129F4734D4CA,F9D2CC92-E425-46DD-880E-129F4734D4CA,siranau,audio,Faut-il interdire le cumul des mandats politiq...,2019-10-10T00:00:00Z,"dans Forum, on va débattre ce soir des cumular...",1066,"[Débat, Interview]",[La Première],...,{'href': '/programs/B525B1B7-2D98-4D8B-BED4-B5...,"[{'function': 'Intervieweur/euse', 'name': 'Ma...",CATINFO_OCT2019,//www.rts.ch/play/radio/redirect/detail/10754676,http://www.rts.ch/10549432.image,[Le débat - Faut-il interdire le cumul des man...,"[Débat entre Théo Bregnard, député (POP/NE) et...",[ece-10754676],2019-10-10T17:01:00Z,2019
463,sound-298F7630-A511-4357-BB45-3BC9D5D2D0BF,298F7630-A511-4357-BB45-3BC9D5D2D0BF,siranau,audio,"Interview de Marc Bühlmann, politologue à l'Un...",2019-10-05T00:00:00Z,"Le livre : « Konkordanz im Parlament», paru t...",342,[Interview],[La Première],...,{'href': '/programs/B525B1B7-2D98-4D8B-BED4-B5...,"[{'function': 'Intervieweur/euse', 'name': 'Gu...",CATINFO_OCT2019,//www.rts.ch/play/radio/redirect/detail/10742882,http://www.rts.ch/10549432.image,"[La concordance politique, le cœur de l'équili...","[Interview de Marc Bühlmann, politologue à l'U...",[ece-10742882],2019-10-05T16:33:00Z,2019


In [7]:
print(list(UDC_pandas))

['id', 'internalArchiveId', 'source', 'mediaType', 'title', 'publicationDate', 'summary', 'durationSec', 'genres', 'publicationChannelNames', 'thematicGeographicals', 'thematicThemes', 'thematicPersons', 'isOnline', 'lastModifiedDate', 'speechToTexts', 'program', 'contributors', 'remarks', 'mediaURL', 'imageURL', 'alternativeTitle', 'alternativeSummary', 'variantIds', 'webPublicationDate']


We can see from above that the potentially useful columns to analyze the broadcasts would be the summary and speechToTexts. At first we will focus on the speechToTexts

In [14]:
for party in party_pandas:
    len(party)

Now we want to look at the words used in the interviews, by both the interviewer and the interviewee using Robin's bag of words method


In [76]:
from collections import Counter
import re

str_YEARS = ['2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012']


def speech_to_bag(speech, lang, year=str_YEARS):
    """
    Return a bag of word sorted by frequency
    
    lang: {'fr','de'}
    username: {'PS Suisse', 'CVP PDC PPD PCD', 'UDC Suisse',
       'Les VERTS suisses 🌻', "Vert'libéraux Suisse", 'PLR Suisse'}
    year: default = [2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010]
    
    """
    speech_copy = speech.copy()
    speech_copy = speech_copy[speech_copy['year'].isin(year)]
    speech_copy = speech_copy["speechToTexts"].astype(str).str.lower()
    
    text = ""
    for t in speech_copy:
        if type(t) is not str:
            print(t)
            return
        text += ' ' +  t

    if (lang=='fr'):
        for c in punctuations_fr:
            text = text.replace(c, ' ')
        for w in stopwords_fr:
            regex = r" " + w + " "
            text = re.sub(regex, ' ', text)
            
        text = re.sub(regex, ' ', text)
        text = text.replace('  ', ' ')
        text = text.replace('  ', ' ')
        text = text.replace('  ', ' ')
        
        return Counter(text.split(" ")).most_common()
    else:
        raise NotImplementedError # TODO

In [77]:
# Stopwords in french (no information)
stopwords_fr = []
with open("data/stopwords/stopwords-fr.txt", 'r', encoding='utf-8') as f:
    stopwords_fr = f.readlines()
    stopwords_fr = [w.strip() for w in stopwords_fr]
stopwords_fr[:10]

['a', 'à', 'abord', 'absolument', 'afin', 'ah', 'ai', 'aie', 'aient', 'aies']

In [85]:
punctuations_fr = "\"$%&'?!()*+,-./:;=@[\]^_`{|}'~«»´’–1234567890"
UDC_bag = speech_to_bag(UDC_pandas, lang='fr')
PDC_bag= speech_to_bag(PDC_pandas)
PS_bag = speech_to_bag(PS_pandas)
PLR_bag = speech_to_bag(PLR_pandas)
PES_bag = speech_to_bag(PES_pandas)
PVLbag = speech_to_bag(PVL_pandas)

TypeError: speech_to_bag() missing 1 required positional argument: 'lang'

In [84]:
PDC_bag

Unnamed: 0,id,internalArchiveId,source,mediaType,title,publicationDate,durationSec,genres,publicationChannelNames,thematicGeographicals,...,summary,thematicPersons,availabilityRequestState,mediaURL,imageURL,alternativeTitle,alternativeSummary,variantIds,remarks,webPublicationDate
0,sound-FB5CE047-D6AF-4FA8-A122-546872AF047C,FB5CE047-D6AF-4FA8-A122-546872AF047C,siranau,audio,"L'ancien évêque de Bâle, le Cardinal Kurt Koch...",2012-12-25T00:00:00Z,854,"[Interview, Débat]",[La Première],[Suisse],...,,,,,,,,,,
1,sound-5D81F1AC-270F-4444-BA5A-4777DCAD01C6,5D81F1AC-270F-4444-BA5A-4777DCAD01C6,siranau,audio,"Grâce au site Internet Politnetz.ch, les votes...",2012-12-22T00:00:00Z,160,[Interview],[La Première],[Suisse],...,"Grâce au site Politnetz.ch, nous savons tout d...","[Conseil des Etats, Conseil National, Politzne...",,,,,,,,
2,sound-3F45B5E2-C685-4EF9-9C24-212676F4F566,3F45B5E2-C685-4EF9-9C24-212676F4F566,siranau,audio,Le Conseil des Etats s'est prononcé en faveur ...,2012-12-13T00:00:00Z,78,[Interview],[La Première],[Suisse],...,Les sachets en plastique à usage unique : c'es...,,,,,,,,,
3,sound-F474F557-041D-4D12-B718-B900CB5E4356,F474F557-041D-4D12-B718-B900CB5E4356,siranau,audio,Ernesto Bertarelli propose de transformer une ...,2012-12-05T00:00:00Z,1012,[Interview],[La Première],"[Genève (ville), Suisse]",...,Le milliardaire genevois Ernesto Bertarelli ve...,"[Bertarelli, Ernesto, Merck Serono, Wyss, Hans...",,,,,,,,
4,sound-C28F8DFC-7997-4FEE-A02C-6B7F2444D370,C28F8DFC-7997-4FEE-A02C-6B7F2444D370,siranau,audio,Ueli Maurer a été élu président de la Confédér...,2012-12-05T00:00:00Z,193,"[Commentaire, Interview]",[La Première],[Suisse],...,Ueli Maurer sera bel et bien président de la C...,"[Maurer, Ueli]",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,sound-DEA5CD55-4609-4785-BC56-F27A1728C286,DEA5CD55-4609-4785-BC56-F27A1728C286,siranau,audio,"En Suisse, selon le journal dominical Sonntags...",2019-01-20T00:00:00Z,296,[Interview],[La Première],[Suisse],...,"Selon le journal dominical SonntagsZeitung, de...",[législatif national],,//www.rts.ch/play/radio/redirect/detail/10133562,http://www.rts.ch/10549432.image,[Des dizaines de parlementaires touchés par un...,"[Selon le journal dominical SonntagsZeitung, d...",[ece-10133562],CATINFO_JAN2019,2019-01-20T18:00:01Z
278,sound-F3D22625-E6BF-40E5-9B94-7DDBC55E2717,F3D22625-E6BF-40E5-9B94-7DDBC55E2717,siranau,audio,"Le groupe PDC au Chambres fédéral, demande un ...",2019-01-19T00:00:00Z,420,[Interview],[La Première],"[Union européenne, Suisse]",...,Il faut approfondir tous les aspects de l'acco...,[Union européenne],,//www.rts.ch/play/radio/redirect/detail/10132096,http://www.rts.ch/10549432.image,[Le PDC demande un plus grand délai de consult...,[Il faut approfondir tous les aspects de l'acc...,[ece-10132096],CATINFO_JAN2019,2019-01-19T17:31:08Z
279,sound-09ADCC0B-AE1B-448A-A290-E08EEC24EA25,09ADCC0B-AE1B-448A-A290-E08EEC24EA25,siranau,audio,"Petra Gössi, présidente du PLR suisse, a heurt...",2019-01-17T00:00:00Z,476,"[Commentaire, Interview]",[La Première],"[Suisse alémanique, Suisse italienne, Suisse r...",...,"Petra Gössi a évoqué, mercredi dans Forum, une...","[PLR, Gössi, Petra]",,//www.rts.ch/play/radio/redirect/detail/10126038,http://www.rts.ch/10549432.image,[Petra Gössi sous-entend que les politiciens l...,"[Petra Gössi a évoqué, mercredi dans Forum, un...",[ece-10126038],CATINFO_JAN2019,2019-01-17T17:34:01Z
280,sound-298F7630-A511-4357-BB45-3BC9D5D2D0BF,298F7630-A511-4357-BB45-3BC9D5D2D0BF,siranau,audio,"Interview de Marc Bühlmann, politologue à l'Un...",2019-10-05T00:00:00Z,342,[Interview],[La Première],[Suisse],...,"Le livre : « Konkordanz im Parlament», paru t...",[législatif national],,//www.rts.ch/play/radio/redirect/detail/10742882,http://www.rts.ch/10549432.image,"[La concordance politique, le cœur de l'équili...","[Interview de Marc Bühlmann, politologue à l'U...",[ece-10742882],CATINFO_OCT2019,2019-10-05T16:33:00Z


In [None]:
PS_bag

In [82]:
UDC_bag

[('', 31157),
 ('r', 14892),
 ('nlocuteur', 14741),
 ('ms', 9173),
 ('suisse', 4431),
 ('fs', 3311),
 ('faut', 2992),
 ('faire', 2913),
 ('udc', 2819),
 ('ui', 2237),
 ('conseil', 2197),
 ('de', 1986),
 ('mt', 1933),
 ('question', 1901),
 ('politique', 1858),
 ('crois', 1834),
 ('fédéral', 1764),
 ('parti', 1708),
 ('vraiment', 1549),
 ('pays', 1459),
 ('cas', 1452),
 ('chose', 1369),
 ('ù', 1360),
 ('monsieur', 1342),
 ('noui', 1302),
 ('initiative', 1259),
 ('veut', 1162),
 ('nalors', 1147),
 ('ans', 1122),
 ('accord', 1083),
 ('le', 1066),
 ('européenne', 1059),
 ('déjà', 1058),
 ('canton', 1019),
 ('temps', 1015),
 ('finalement', 997),
 ('effectivement', 989),
 ('pourcent', 964),
 ('exemple', 963),
 ('vrai', 956),
 ('la', 928),
 ('problème', 925),
 ('union', 904),
 ('président', 886),
 ('parlement', 883),
 ('côté', 846),
 ('petit', 830),
 ('national', 823),
 ('monde', 823),
 ('bonsoir', 817),
 ('nmais', 784),
 ('des', 778),
 ('choses', 776),
 ('moment', 773),
 ('voir', 769),
 ('est