In [1]:
import LexExpander
from CachedEmbedding import CachedEmbedding
import random
import sys
import pandas as pd
from tqdm import tqdm
from contextlib import redirect_stdout

In [4]:
# Filepaths
# Both are absolute paths under /home/dafne/shared/FilterBubble; adjust them when
# running this notebook on another machine.
# Alternative embedding file, currently disabled:
# EMBEDDINGFILE = "/home/dafne/shared/FilterBubble/word-embeddings/fasttext.model.bin"
# Embedding file handed to CachedEmbedding (with embedding_style="fasttext") further down.
EMBEDDINGFILE = "/home/dafne/shared/FilterBubble/word-embeddings/fasttext.model50.bin"
# CSV with the issue queries; read with pd.read_csv in the next cell.
issue_file = "/home/dafne/shared/FilterBubble/issue-positions/issues-queries.csv"

In [5]:
# Read the issue/query table from disk
issues_df = pd.read_csv(issue_file)

# Number of query rows per parent issue (displayed as the cell output)
issues_df['parentI'].value_counts()

klimaatMilieu          112
ondernemingsklimaat     71
Infrastructuur          69
ontwikkelingCrim        66
coronabestrijding       65
OenW                    65
geZorg                  61
socZek                  56
belastHeffing           53
bestrijdingCrim         45
integratie              35
vrijheidsrechten        29
werk                    25
coronaverspreiding      25
EuropeseUnie            24
normenWaarden           22
begrotingssaldo         20
woning                  18
terreurbestrijding      14
bestVernieuw            13
ontwikHulp              12
infrastructuur           1
Name: parentI, dtype: int64

In [6]:
# Peek at the first rows to check the column layout of the issue file
issues_df.head()

Unnamed: 0,queryIssue,directI,parentI,gparentI,parentA,gparentA,zoek1,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,aardgasbaten,pos,begrotingssaldo,issue,,,,,,,
1,ambtenarensal*,neg,begrotingssaldo,issue,,,,,,,
2,begrotingsdiscipline,pos,begrotingssaldo,issue,,,,,,,
3,begrotingsnorm,pos,begrotingssaldo,issue,,,,,,,
4,begrotingsoverschot,pos,begrotingssaldo,issue,,,,,,,


In [7]:
# Collect the query terms of every parent issue into one list per issue
queries_per_issue = issues_df.groupby('parentI').queryIssue.agg(list)

# Some filtering and cleaning:
#  - groups with a single query are skipped (in the value counts shown earlier this
#    affects the lone lower-case 'infrastructuur' row);
#  - the '*' wildcard is stripped and spaces become underscores.
lexicons = {}
for issue, lex in queries_per_issue.items():
    if len(lex) <= 1:
        continue
    cleaned_queries = []
    for query in lex:
        cleaned_queries.append(query.replace('*', '').replace(' ', '_'))
    lexicons[issue] = cleaned_queries

In [8]:
# example: the cleaned list of query terms for a single issue
lexicons['ontwikHulp']

['artsen_zonder_grenzen',
 'cordaid',
 'dekolonisatie',
 'derde_wereld',
 'ICTO',
 'kolonialisme',
 'noodhulp',
 'ontwikkelingsh',
 'ontwikkelingslanden',
 'ontwikkelingsorg',
 'ontwikkelingss',
 'unicef']

In [9]:
# Load embedding
# This single CachedEmbedding instance is passed to every LexExpander created in the
# expansion loop, so it is set up only once here.
# NOTE(review): prepare() presumably loads the fastText binary into memory — confirm
# in CachedEmbedding before relying on that.
embedding_model = CachedEmbedding(EMBEDDINGFILE, embedding_style="fasttext")
embedding_model.prepare()




In [10]:
# Define some parameters for tuning
# Both values are forwarded unchanged to LexExpander under the same keyword names.
# NOTE(review): their exact effect is defined inside LexExpander — check there before tuning.
sample_size = 3
new_topn_to_draw_from = 10

In [18]:
# Expand every issue lexicon with LexExpander.
#
# Result: expanded_lexicon[issue] = (added_to_lexicon, given_lexicon_set_WEAK), the two
# collections that the reporting cell prints as "New strong" and "New weak".
# Everything the expander prints during run() is redirected to '<issue>.log.txt' in the
# current working directory.
SEED_SET_SIZE = 5   # upper bound on the number of seed words drawn per issue
RANDOM_SEED = 42    # fixed so that re-running this cell draws the same seed words

# Re-seed at the top of the cell so the cell is repeatable on every run.
# NOTE(review): this also makes LexExpander's own sampling repeatable only if it uses the
# global `random` module — confirm in LexExpander.
random.seed(RANDOM_SEED)

expanded_lexicon = {}

# To debug a single issue, iterate over [('ontwikHulp', lexicons['ontwikHulp'])] instead.
for issue, lexicon in tqdm(lexicons.items()):
    print('Issue {}'.format(issue))

    logging_file = '{}.log.txt'.format(issue)
    my_expander = LexExpander.LexExpander(lexicon_iterable=lexicon,
                                          embedding_model=embedding_model,
                                          sample_size=sample_size,
                                          new_topn_to_draw_from=new_topn_to_draw_from)
    my_expander.prepare(verbose=True)
    print("Running algorithm..")

    # random.sample() requires a sequence: passing a set is deprecated since Python 3.9
    # and raises TypeError from 3.11 on. Sorting additionally gives a stable order, so a
    # fixed seed always selects the same words (assumes the entries are mutually
    # comparable, e.g. all strings — TODO confirm).
    seed_population = sorted(my_expander.loaded_lex_as_set)
    # Cap the draw at the population size: lexicons are only guaranteed to hold more than
    # one query, so asking for 5 unconditionally could raise ValueError.
    seed_set = random.sample(seed_population,
                             min(SEED_SET_SIZE, len(seed_population)))

    # Explicit UTF-8 so the log does not depend on the platform's default encoding.
    with open(logging_file, 'w', encoding='utf-8') as logout:
        with redirect_stdout(logout):
            my_expander.run(sample="random", given_sampling_population=seed_set)
    expanded_lexicon[issue] = (my_expander.added_to_lexicon, my_expander.given_lexicon_set_WEAK)
    print("Done")

  0%|          | 0/21 [00:00<?, ?it/s]

Issue EuropeseUnie
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


  5%|▍         | 1/21 [00:11<03:58, 11.94s/it]

Done
Issue Infrastructuur
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 10%|▉         | 2/21 [00:20<03:10, 10.01s/it]

Done
Issue OenW
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 14%|█▍        | 3/21 [00:30<03:02, 10.13s/it]

Done
Issue begrotingssaldo
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 19%|█▉        | 4/21 [00:41<02:57, 10.47s/it]

Done
Issue belastHeffing
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 24%|██▍       | 5/21 [00:51<02:40, 10.03s/it]

Done
Issue bestVernieuw
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 29%|██▊       | 6/21 [00:59<02:19,  9.32s/it]

Done
Issue bestrijdingCrim
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 33%|███▎      | 7/21 [01:04<01:54,  8.17s/it]

Done
Issue coronabestrijding
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 38%|███▊      | 8/21 [01:11<01:39,  7.65s/it]

Done
Issue coronaverspreiding
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 43%|████▎     | 9/21 [01:17<01:26,  7.25s/it]

Done
Issue geZorg
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 48%|████▊     | 10/21 [01:26<01:24,  7.64s/it]

Done
Issue integratie
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 52%|█████▏    | 11/21 [01:33<01:14,  7.44s/it]

Done
Issue klimaatMilieu
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 57%|█████▋    | 12/21 [01:43<01:14,  8.30s/it]

Done
Issue normenWaarden
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 62%|██████▏   | 13/21 [01:51<01:04,  8.06s/it]

Done
Issue ondernemingsklimaat
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 67%|██████▋   | 14/21 [02:00<00:58,  8.39s/it]

Done
Issue ontwikHulp
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 71%|███████▏  | 15/21 [02:08<00:50,  8.41s/it]

Done
Issue ontwikkelingCrim
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 76%|███████▌  | 16/21 [02:21<00:48,  9.73s/it]

Done
Issue socZek
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 81%|████████  | 17/21 [02:33<00:42, 10.51s/it]

Done
Issue terreurbestrijding
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 86%|████████▌ | 18/21 [02:42<00:30, 10.05s/it]

Done
Issue vrijheidsrechten
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 90%|█████████ | 19/21 [02:56<00:22, 11.21s/it]

Done
Issue werk
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


 95%|█████████▌| 20/21 [03:01<00:09,  9.35s/it]

Done
Issue woning
reading in the lexicon ...
setting up embeddings ...
using passed model <CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>

            10 iterations
            1 keep_size
            4 new_in_lex_topn_to_draw_from
            3 sample_size
            10 new_topn_to_draw_from
            100 result_size
            3 rec_runs
            0.5 weak_add_rate

            

            self.lexicon_file:	None
            embedding_model:	<CachedEmbedding.CachedEmbedding object at 0x7f43ee43b090>
            candidates_list:	None

            target_folder:	.
            
Running algorithm..


100%|██████████| 21/21 [03:08<00:00,  9.00s/it]

Done





In [21]:
# Report, for every issue, the original lexicon followed by the proposed
# strong and weak additions.
separator = '\n-------------------------------------------------------\n'

for issue, proposals in expanded_lexicon.items():
    strong_terms = proposals[0]
    weak_terms = proposals[1]

    print("Issue: ", issue)
    print("Existing lexicon:")
    print(', '.join(lexicons[issue]))
    print("\nNew strong:")
    print(', '.join(strong_terms))
    print("\nNew weak:")
    print(', '.join(weak_terms))
    print(separator)

Issue:  EuropeseUnie
Existing lexicon:
3_procent, 60_procent, brexit, coronaherstelfonds, ECB, EMA, EU, europarlementarier, Europe_Parlement, Europees_Medicijn_Agentschap, Europese_Centrale_Bank, Europese_Commissie, Europese_regeringsleiders, Europese_Unie, euroscep, eurozone, geneesmiddelenbureau, herstelfonds, Hof_van_Justitie, lidstaten, medicijnagentschap, meerjarenbegroting, Nexit, raad_van_ministers

New strong:


New weak:


-------------------------------------------------------

Issue:  Infrastructuur
Existing lexicon:
chipkaart, airbus, AirFrance, airport, auto, automobiliteit, benzinestation, bestemmingsplan, bestemmingsverkeer, Betuwelijn, bevoorrading, Boeing, dijkverzwaring, distributie, doorvoer, Eurostar, file, glasvezel, haven, hub, infrastructu, internetprovider, KLM, laadpa, landingen, landingsba, lightrail, logisti, luchthaven, luchtvaart, mobiliteit, nachtvluchten, netwerkprovider, NS, overloopluchthaven, pakketbezorg, pakketdienst, parkeer, post, prorail, rail, Ra