### *Data Analysis*
## Core Analysis - LDA Approach
---


In [1]:
# Import necessary libraries
import nltk, re, pprint
import json
from nltk import word_tokenize
from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import os.path 
import pandas as pd
import os
import re
from nltk.corpus import PlaintextCorpusReader 
from nltk.app import concordance
from nltk.corpus import BracketParseCorpusReader
import numpy as np
import statsmodels.formula.api as smf
import altair as alt
import tmtoolkit
import spacy as spacy
import logging, warnings
from tmtoolkit.corpus import Corpus
import gensim
from gensim import corpora, models
nltk.download('omw-1.4')
import pickle
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.gensim_models


  from pandas import Int64Index as NumericIndex
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/charlottekaiser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


---
### 1. Prerequisites
---

In [2]:
# Import stopwords
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charlottekaiser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Add sample specific stop words that are redundant and have no substantial relevance; also add words that are project-specific stopwords
stopwords.add('president')	
stopwords.add('mr')
stopwords.add('ms')
stopwords.add('commission')
stopwords.add('congress')
stopwords.add('speaker')
stopwords.add('also')
stopwords.add('artificial')
stopwords.add('intelligence')
stopwords.add('digital')
stopwords.add('ai')
stopwords.add('pro')
stopwords.add('tempore')
stopwords.add('representative')
stopwords.add('thank')
stopwords.add('dear')
stopwords.add('rapporteur')
stopwords.add('lady')
stopwords.add('committee')
stopwords.add('report')
stopwords.add('legislation')
stopwords.add('like')
stopwords.add('subcommittee')
stopwords.add('gentleman')
stopwords.add('r')
stopwords.add('colleague')
stopwords.add('madam')
stopwords.add('ha')
stopwords.add('wa')
stopwords.add('for')
stopwords.add('in')
stopwords.add('-')
stopwords.add(',')
stopwords.add('and')
stopwords.add('house')


In [4]:
# Define new function
# NLTK’s Wordnet stores meanings of words, synonyms, antonyms, etc. - for ref, see: https://www.nltk.org/_modules/nltk/corpus/reader/wordnet.html 
# WordNetLemmatizer gets the root, for ref, see: https://www.nltk.org/_modules/nltk/stem/wordnet.html
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charlottekaiser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Define tokenizer for nltk using RegexpTokenizer, to keep tokens that are alphanumeric characters, get rid off punctuation
tokenizer = RegexpTokenizer(r'\w+') 
# Define a noun tagger 
is_noun = lambda pos: pos[:2] == 'NN'

In [6]:
# Define a function to preprocess for LDA
def prepare_for_lda(text):
    text = ''.join(c for c in text if not c.isdigit())
    tokens = tokenizer.tokenize(text)
    tokens = [get_lemma(token) for token in tokens]
    tags = nltk.pos_tag(tokens)
    tokens = [word for word,pos in tags if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')] #keep only nouns
    tokens = [w.split() for w in tokens if w not in stopwords] # get rid off stopwords
    return tokens

In [7]:
# Change directory 
os.listdir('.')
os.chdir('/Users/charlottekaiser/Documents/uni/Hertie/master_thesis/00_data/50_analysis')

# Read in files
raw_eu02 = open("EU02_Democratic scrutiny of social media and the protection of fundamental rights.txt").read()
raw_eu03 = open("EU03_European strategy for data - Commission evaluation report on the implementation of the General Data Protection Regulation two years after its application.txt").read()
raw_eu11 = open("EU11_Digital Europe programme.txt").read()
raw_eu13 = open("EU13_Artificial intelligence in education, culture and the audiovisual sector.txt").read()
raw_eu14 = open("EU14_Digital future of Europe- digital single market and use of AI for European consumers.txt").read()
raw_eu15 = open("EU15_ Promoting gender equality in science, technology, engineering and mathematics - STEM - education and careers.txt").read()
raw_eu18 = open("EU18_Artificial intelligence in criminal law and its use by the police and judicial authorities in criminal matters.txt").read()
raw_eu21 = open("EU21_The outcome of the EU-US Trade and Technology Council.txt").read()
raw_us02 = open("US02_CONSUMER SAFETY TECHNOLOGY ACT.txt").read()
raw_us04 = open("US04_FEDERAL CAREER OPPORTUNITIES IN COMPUTER SCIENCE WORK ACT.txt").read()
raw_us06 = open("US06_75th ANNIVERSARY OF THE OFFICE OF NAVAL RESEARCH.txt").read()
raw_us09 = open("US09_MSI STEM ACHIEVEMENT ACT.txt").read()
raw_us10 = open("US10_National Defense Authorization Act.txt").read()
raw_us15 = open("US15_FUTURE OF RADAR.txt").read()
raw_us16 = open("US16_DEPARTMENT OF ENERGY SCIENCE FOR THE FUTURE ACT.txt").read()
raw_us18 = open("US18_STATEMENTS ON INTRODUCED BILLS AND JOINT RESOLUTIONS.txt").read()
raw_us20 = open("US20_INTRODUCTION OF THE TRANSATLANTIC TELECOMMUNICATIONS SECURITY ACT.txt").read()
raw_us32 = open("US32_NATIONAL PULSE MEMORIAL.txt").read()
raw_us37 = open("US37_ENDLESS FRONTIER ACT.txt").read()

# Apply LDA function
eu02 = prepare_for_lda(raw_eu02)
eu03 = prepare_for_lda(raw_eu03)
eu11 = prepare_for_lda(raw_eu11) 
eu13 = prepare_for_lda(raw_eu13) 
eu14 = prepare_for_lda(raw_eu14) 
eu15 = prepare_for_lda(raw_eu15) 
eu18 = prepare_for_lda(raw_eu18) 
eu21 = prepare_for_lda(raw_eu21)
us02 = prepare_for_lda(raw_us02) 
us04 = prepare_for_lda(raw_us04) 
us06 = prepare_for_lda(raw_us06) 
us09 = prepare_for_lda(raw_us09) 
us10 = prepare_for_lda(raw_us10) 
us15 = prepare_for_lda(raw_us15) 
us16 = prepare_for_lda(raw_us16) 
us18 = prepare_for_lda(raw_us18)
us20 = prepare_for_lda(raw_us20) 
us32 = prepare_for_lda(raw_us32)
us37 = prepare_for_lda(raw_us37) 

FileNotFoundError: [Errno 2] No such file or directory: 'EU02_Democratic scrutiny of social media and the protection of fundamental rights.txt'

In [None]:
# Build one joint corpus
corpus_joint = eu02 + eu03 + eu11 + eu13 + eu14 + eu15 + eu18 + eu21 + us02 + us04 + us06 + us09 + us10 + us15 + us16 + us18 + us20 + us32 + us37

# Build one corpus for all EU debates
corpus_eu = eu02 + eu03 + eu11 + eu13 + eu14 + eu15 + eu18 + eu21

# Build one corpus for all US debates
corpus_us = us02 + us04 + us06 + us09 + us10 + us15 + us16 + us18 + us20 + us32 + us37

---
### 2. Analysis 
---

In [None]:
# Change directory to save analysis results
os.chdir('/Users/charlottekaiser/Documents/uni/Hertie/master_thesis/20_results/10_analysis')

---
#### 2.2 Run Model 1 - LDA for EU
---

In [None]:
# Define dictionary
dictionary_eu = corpora.Dictionary(corpus_eu)
print(dictionary_eu) #we have 1692 unique tokens in eu14
id2word = dictionary_eu.token2id
pprint.pprint(dictionary_eu.token2id)

# Define LDA Model
bow_corpus_eu = [dictionary_eu.doc2bow(text) for text in corpus_eu]
NUM_TOPICS = 3
ldamodel_eu = gensim.models.ldamodel.LdaModel(bow_corpus_eu, num_topics = NUM_TOPICS, id2word=dictionary_eu, passes=15)
ldamodel_eu.save('model1.gensim')
topics = ldamodel_eu.print_topics(num_words=4)
for topic in topics:
    print(topic)

# Visualize LDA Model
pyLDAvis.enable_notebook()
warnings.filterwarnings("ignore", category=DeprecationWarning) 
vis = pyLDAvis.gensim_models.prepare(ldamodel_eu, bow_corpus_eu, dictionary_eu)
vis

Dictionary(2655 unique tokens: ['paula', 'zacarias', 'office', 'council', 'week']...)
{'abandonment': 2631,
 'ability': 1289,
 'abolition': 470,
 'abortion': 2179,
 'absence': 719,
 'absent': 2061,
 'absolute': 1545,
 'abundance': 1211,
 'abuse': 1006,
 'accelerate': 1198,
 'acceleration': 1551,
 'acceptance': 1855,
 'access': 203,
 'accessibility': 1765,
 'accident': 1754,
 'accompany': 2088,
 'accord': 1424,
 'accordance': 811,
 'account': 181,
 'accountability': 138,
 'accuracy': 1975,
 'accusation': 677,
 'achieve': 1393,
 'achievement': 1890,
 'acquaintance': 711,
 'acquis': 2414,
 'acquisition': 1368,
 'act': 82,
 'action': 79,
 'activist': 754,
 'activity': 55,
 'actor': 86,
 'acts': 1842,
 'ad': 724,
 'adamowicz': 242,
 'adapt': 1523,
 'adaptation': 1603,
 'add': 1246,
 'addition': 93,
 'address': 519,
 'adequate': 267,
 'administration': 1377,
 'adopt': 456,
 'adoption': 103,
 'adult': 2330,
 'advance': 1356,
 'advancement': 1514,
 'advantage': 116,
 'adversary': 743,
 'advert

  default_term_info = default_term_info.sort_values(


In [None]:
pyLDAvis.save_html(vis, "lda_eu.html")
pyLDAvis.save_json(vis, "lda_eu.json")

In [None]:
# Check coherence
coherencemod_eu = CoherenceModel(model=ldamodel_eu, texts=corpus_eu, dictionary=dictionary_eu, coherence='c_v')
coherencemod_eu = coherencemod_eu.get_coherence()
print('Coherence Score for the Agenda Model:', coherencemod_eu)

Coherence Score for the Agenda Model: 0.8487084914521017


---
#### 2.3 Run Model 2 - LDA for xx
---

In [None]:
corpus = eu02 + eu03 + eu11 + eu13 + eu14 + eu15 + eu18 + eu21 + us02 + us04 + us06 + us09 + us10 + us15 + us16 + us18 + us20 + us32 + us37

In [None]:
dictionary = corpora.Dictionary(corpus)
print(dictionary) #we have 1692 unique tokens in eu14
id2word = dictionary.token2id
pprint.pprint(dictionary.token2id)

Dictionary(4870 unique tokens: ['paula', 'zacarias', 'office', 'council', 'week']...)
{'abandon': 3366,
 'abandonment': 2631,
 'ability': 1289,
 'abolition': 470,
 'abortion': 2179,
 'abraham': 3383,
 'abrams': 3516,
 'abreast': 3882,
 'absence': 719,
 'absent': 2061,
 'absolute': 1545,
 'absurdity': 4731,
 'abundance': 1211,
 'abuse': 1006,
 'academia': 2776,
 'academy': 2857,
 'accelerate': 1198,
 'acceleration': 1551,
 'acceptance': 1855,
 'access': 203,
 'accessibility': 1765,
 'accession': 3258,
 'accident': 1754,
 'accommodation': 4052,
 'accompany': 2088,
 'accomplishment': 4448,
 'accord': 1424,
 'accordance': 811,
 'account': 181,
 'accountability': 138,
 'accountable': 2973,
 'accountant': 3904,
 'accounting': 3125,
 'accuracy': 1975,
 'accusation': 677,
 'acetaminophen': 4318,
 'acheson': 4657,
 'achieve': 1393,
 'achievement': 1890,
 'acknowledgement': 3526,
 'acoustics': 2801,
 'acquaintance': 711,
 'acquis': 2414,
 'acquisition': 1368,
 'acres': 3457,
 'act': 82,
 'action

In [None]:
bow_corpus = [dictionary.doc2bow(text) for text in corpus]
NUM_TOPICS = 4
ldamodel = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.057*"bill" + 0.029*"defense" + 0.018*"job" + 0.013*"company"')
(1, '0.032*"technology" + 0.028*"world" + 0.026*"innovation" + 0.026*"time"')
(2, '0.040*"research" + 0.036*"people" + 0.029*"investment" + 0.023*"security"')
(3, '0.050*"state" + 0.042*"china" + 0.022*"act" + 0.021*"today"')


In [None]:
pyLDAvis.enable_notebook()
warnings.filterwarnings("ignore", category=DeprecationWarning) 
vis = pyLDAvis.gensim_models.prepare(ldamodel, bow_corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


In [None]:
pyLDAvis.save_html(vis, "lda.html")
pyLDAvis.save_json(vis, "lda.json")

In [None]:
coherencemod = CoherenceModel(model=ldamodel, texts=corpus, dictionary=dictionary, coherence='c_v')
coherencemod = coherencemod.get_coherence()
print('Coherence Score for the whole model:', coherencemod)

Coherence Score for the whole model: 0.8495959197825642


---
#### 2.2 Run LDA for EU
---

In [None]:
dictionary_eu = corpora.Dictionary(eu18)
print(dictionary_eu) #we have 1692 unique tokens in eu14
id2word_eu = dictionary_eu.token2id
pprint.pprint(dictionary_eu.token2id)

Dictionary(652 unique tokens: ['petar', 'vitanov', 'eu', 'framework', 'need']...)
{'ability': 360,
 'abuse': 132,
 'accordance': 362,
 'account': 288,
 'accountability': 192,
 'accuracy': 329,
 'achievement': 485,
 'acquis': 629,
 'act': 632,
 'action': 92,
 'activity': 466,
 'addition': 326,
 'administration': 506,
 'adult': 398,
 'affairs': 46,
 'agency': 530,
 'agriculture': 271,
 'aim': 223,
 'algorithm': 418,
 'amendment': 222,
 'amount': 130,
 'analysis': 256,
 'anchor': 177,
 'andrej': 236,
 'anonymity': 79,
 'anticipate': 493,
 'application': 33,
 'approach': 176,
 'area': 71,
 'art': 122,
 'article': 322,
 'aspect': 625,
 'assembly': 316,
 'assessment': 87,
 'asset': 606,
 'association': 317,
 'attache': 536,
 'attack': 108,
 'authorisation': 299,
 'authorities': 179,
 'authorization': 563,
 'avenue': 461,
 'avoidance': 227,
 'babiš': 237,
 'baby': 305,
 'balance': 175,
 'ban': 220,
 'bank': 449,
 'barge': 368,
 'barrier': 518,
 'base': 29,
 'basis': 419,
 'bathwater': 306,
 '

In [None]:
bow_corpus_eu = [dictionary_eu.doc2bow(text) for text in eu18]
NUM_TOPICS = 3
ldamodel_eu = gensim.models.ldamodel.LdaModel(bow_corpus_eu, num_topics = NUM_TOPICS, id2word=dictionary_eu, passes=15)
ldamodel_eu.save('model5.gensim')
topics_eu = ldamodel_eu.print_topics(num_words=4)
for topic in topics_eu:
    print(topic)

(0, '0.049*"use" + 0.033*"enforcement" + 0.028*"citizen" + 0.024*"recognition"')
(1, '0.058*"right" + 0.044*"technology" + 0.025*"crime" + 0.024*"group"')
(2, '0.050*"law" + 0.033*"police" + 0.029*"system" + 0.026*"risk"')


In [None]:
pyLDAvis.enable_notebook()
warnings.filterwarnings("ignore", category=DeprecationWarning) 
vis_eu = pyLDAvis.gensim_models.prepare(ldamodel_eu, bow_corpus_eu, dictionary_eu)
vis_eu

  default_term_info = default_term_info.sort_values(


In [None]:
coherence_model_lda_eu = CoherenceModel(model=ldamodel_eu, texts=eu18, dictionary=dictionary_eu, coherence='c_v')
coherence_lda_eu = coherence_model_lda_eu.get_coherence()
print('Coherence Score for the EU model:', coherence_lda_eu)

Coherence Score for the EU model: 0.8546907759896699


---
#### 2.2 Run LDA for US
---

In [None]:
dictionary_us = corpora.Dictionary(us02)
print(dictionary_us) #we have 588 unique tokens in us02
id2word_us = dictionary_us.token2id
pprint.pprint(dictionary_us.token2id)

Dictionary(296 unique tokens: ['pallone', 'rule', 'bill', 'consumer', 'product']...)
{'act': 26,
 'actor': 267,
 'addition': 244,
 'adoption': 106,
 'advance': 32,
 'advantage': 274,
 'agency': 31,
 'algorithm': 189,
 'anniversary': 155,
 'area': 229,
 'attack': 253,
 'author': 117,
 'ayes': 286,
 'balance': 86,
 'barrier': 263,
 'battery': 121,
 'benefit': 147,
 'bilirakis': 80,
 'bill': 2,
 'blockchain': 13,
 'blockchains': 115,
 'burgess': 28,
 'business': 178,
 'businesspeople': 273,
 'california': 112,
 'call': 84,
 'capability': 191,
 'capacity': 144,
 'carry': 116,
 'center': 139,
 'century': 203,
 'certainty': 212,
 'cftc': 248,
 'chain': 101,
 'chair': 285,
 'chairman': 91,
 'chairwoman': 81,
 'change': 208,
 'charger': 122,
 'china': 196,
 'clause': 293,
 'climate': 214,
 'co': 240,
 'commerce': 11,
 'commissioner': 130,
 'company': 177,
 'compete': 102,
 'competitor': 197,
 'consent': 16,
 'constituent': 262,
 'consultation': 235,
 'consumer': 3,
 'consumption': 183,
 'conti

In [None]:
bow_corpus_us = [dictionary_us.doc2bow(text) for text in us02]
NUM_TOPICS = 3
ldamodel_us = gensim.models.ldamodel.LdaModel(bow_corpus_us, num_topics = NUM_TOPICS, id2word=dictionary_us, passes=15)
ldamodel_us.save('model5.gensim')
topics_us = ldamodel_us.print_topics(num_words=4) 
for topic in topics_us:
    print(topic)


(0, '0.128*"consumer" + 0.070*"technology" + 0.025*"pallone" + 0.025*"innovation"')
(1, '0.063*"safety" + 0.043*"blockchain" + 0.040*"agency" + 0.040*"h"')
(2, '0.123*"product" + 0.078*"bill" + 0.061*"act" + 0.032*"time"')


In [None]:
pyLDAvis.enable_notebook()
warnings.filterwarnings("ignore", category=DeprecationWarning) 
vis_us = pyLDAvis.gensim_models.prepare(ldamodel_us, bow_corpus_us, dictionary_us)
vis_us

  default_term_info = default_term_info.sort_values(


In [None]:
coherence_model_lda_us = CoherenceModel(model=ldamodel_us, texts=us02, dictionary=dictionary_us, coherence='c_v')
coherence_lda_us = coherence_model_lda_us.get_coherence()
print('Coherence Score for the US model: ', coherence_lda_us)

Coherence Score for the US model:  0.8520903032489898
