In [None]:
### *Data Analysis*
## Analysis - Trial and Error code (just playing around and storing code)
---

In [None]:
# Change directory
# NOTE(review): absolute, machine-specific path -- adjust TEXT_DIR when running elsewhere.
TEXT_DIR = '/Users/charlottekaiser/Documents/uni/Hertie/master_thesis/00_data/50_analysis/text_files'
os.chdir(TEXT_DIR)


def _read_text(filename):
    """Return the full contents of `filename`, closing the file handle afterwards."""
    with open(filename) as handle:
        return handle.read()


# Read in files (names are relative to the working directory set above)
raw_eu02 = _read_text("EU02_Democratic scrutiny of social media and the protection of fundamental rights.txt")
raw_eu03 = _read_text("EU03_European strategy for data - Commission evaluation report on the implementation of the General Data Protection Regulation two years after its application.txt")
raw_eu11 = _read_text("EU11_Digital Europe programme.txt")
raw_eu13 = _read_text("EU13_Artificial intelligence in education, culture and the audiovisual sector.txt")
raw_eu14 = _read_text("EU14_Digital future of Europe- digital single market and use of AI for European consumers.txt")
raw_eu15 = _read_text("EU15_ Promoting gender equality in science, technology, engineering and mathematics - STEM - education and careers.txt")
raw_eu18 = _read_text("EU18_Artificial intelligence in criminal law and its use by the police and judicial authorities in criminal matters.txt")
raw_eu21 = _read_text("EU21_The outcome of the EU-US Trade and Technology Council.txt")
raw_us02 = _read_text("US02_CONSUMER SAFETY TECHNOLOGY ACT.txt")
raw_us04 = _read_text("US04_FEDERAL CAREER OPPORTUNITIES IN COMPUTER SCIENCE WORK ACT.txt")
raw_us06 = _read_text("US06_75th ANNIVERSARY OF THE OFFICE OF NAVAL RESEARCH.txt")
raw_us09 = _read_text("US09_MSI STEM ACHIEVEMENT ACT.txt")
raw_us10 = _read_text("US10_National Defense Authorization Act.txt")
raw_us15 = _read_text("US15_FUTURE OF RADAR.txt")
raw_us16 = _read_text("US16_DEPARTMENT OF ENERGY SCIENCE FOR THE FUTURE ACT.txt")
raw_us18 = _read_text("US18_STATEMENTS ON INTRODUCED BILLS AND JOINT RESOLUTIONS.txt")
raw_us20 = _read_text("US20_INTRODUCTION OF THE TRANSATLANTIC TELECOMMUNICATIONS SECURITY ACT.txt")
raw_us32 = _read_text("US32_NATIONAL PULSE MEMORIAL.txt")
raw_us37 = _read_text("US37_ENDLESS FRONTIER ACT.txt")

# Run every raw transcript through prepare_for_lda: EU debates first, then US,
# in the same order as the files were read.
eu02, eu03, eu11, eu13, eu14, eu15, eu18, eu21 = [
    prepare_for_lda(raw_text)
    for raw_text in (
        raw_eu02, raw_eu03, raw_eu11, raw_eu13,
        raw_eu14, raw_eu15, raw_eu18, raw_eu21,
    )
]

us02, us04, us06, us09, us10, us15, us16, us18, us20, us32, us37 = [
    prepare_for_lda(raw_text)
    for raw_text in (
        raw_us02, raw_us04, raw_us06, raw_us09, raw_us10, raw_us15,
        raw_us16, raw_us18, raw_us20, raw_us32, raw_us37,
    )
]

In [None]:
# Build one corpus for all EU debates
corpus_eu = (
    eu02 + eu03 + eu11 + eu13
    + eu14 + eu15 + eu18 + eu21
)

# Build one corpus for all US debates
corpus_us = (
    us02 + us04 + us06 + us09 + us10 + us15
    + us16 + us18 + us20 + us32 + us37
)

# Build one joint corpus: every EU document followed by every US document.
# assumes `+` concatenates the prepared documents (sequence-like) -- TODO confirm
corpus_joint = corpus_eu + corpus_us

In [None]:
# Vectorize a document against a vocabulary
def vectorize(tokens, vocabulary=None):
    """Count how often each vocabulary entry occurs in `tokens`.

    Parameters
    ----------
    tokens : object with a ``count`` method (e.g. a list of tokens)
        The document to vectorize.
    vocabulary : iterable, optional
        Entries to count, in output order. When omitted, the notebook-level
        ``corpus['text']`` is used, which keeps existing one-argument calls
        working while allowing an explicit vocabulary to be passed in.

    Returns
    -------
    list
        ``tokens.count(entry)`` for every entry of the vocabulary, in order.
    """
    if vocabulary is None:
        # Fall back to the global corpus only when the caller supplies nothing.
        vocabulary = corpus['text']
    return [tokens.count(entry) for entry in vocabulary]

In [None]:
# Train one Word2Vec model on the 'without_stopwords' column of `corpus`.
# Hyperparameters are gathered in a single place so they are easy to tune.
word2vec_params = {
    'vector_size': 100,
    'window': 10,
    'min_count': 2,
    'workers': 10,
    'epochs': 10,
}
model_full = gensim.models.Word2Vec(corpus['without_stopwords'], **word2vec_params)



In [None]:
keys = ['innovation', 'technology'] # Eu technology, privacy, human

# For each key term, collect its 30 most similar words and their vectors.
# Both lists are index-aligned with `keys`.
embedding_clusters = []
word_clusters = []
for word in keys:
    words = [
        neighbour
        for neighbour, _similarity in model_full.wv.most_similar(word, topn=30)
    ]
    embeddings = [model_full.wv[neighbour] for neighbour in words]
    embedding_clusters.append(embeddings)
    word_clusters.append(words)


In [None]:
from sklearn.manifold import TSNE
import numpy as np

# Stack the per-key vectors into one 3-D array: (clusters, words per cluster, dims).
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

# t-SNE works on a flat 2-D matrix, so merge the first two axes before fitting.
flat_embeddings = embedding_clusters.reshape(n * m, k)

# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
tsne_model_en_2d = TSNE(
    perplexity=15,
    n_components=2,
    init='pca',
    n_iter=3500,
    random_state=32,
)
projected = tsne_model_en_2d.fit_transform(flat_embeddings)

# Restore the per-cluster grouping, now with two coordinates per word.
embeddings_en_2d = np.array(projected).reshape(n, m, 2)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline


def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D word embeddings, one colour per cluster, annotating every point.

    Parameters
    ----------
    title : str
        Figure title.
    labels : sequence
        One legend label per cluster; its length sets the number of colours.
    embedding_clusters : sequence of 2-D arrays
        Per cluster, an array whose columns 0 and 1 hold the x and y coordinates.
    word_clusters : sequence of sequences of str
        Per cluster, the words to annotate, in the same order as the points.
    a : float
        Alpha (opacity) of the scatter markers.
    filename : str, optional
        If given, the figure is also saved there as a 150-dpi PNG before being shown.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Pass the single RGBA row via `color=`: with `c=` matplotlib warns that
        # it is ambiguous and treats it as per-point values to colour-map when
        # its length happens to equal the number of points.
        plt.scatter(x, y, color=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        # Save before plt.show(), while the figure is still the current one.
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


# Draw the neighbourhoods of the key terms and store the figure as a PNG.
tsne_plot_similar_words(
    title='The occurrence of pre-defined terms in the US corpus',
    labels=keys,
    embedding_clusters=embeddings_en_2d,
    word_clusters=word_clusters,
    a=0.7,
    filename='similar_words.png',
)

In [None]:
# Agenda titles of the debates, one variable per document.
# These mirror the text-file names without the "<ID>_" prefix and ".txt" suffix.

# EU debates
agenda_eu02 = "Democratic scrutiny of social media and the protection of fundamental rights"
agenda_eu03 = "European strategy for data - Commission evaluation report on the implementation of the General Data Protection Regulation two years after its application"
agenda_eu11 = "Digital Europe programme"
agenda_eu13 = "Artificial intelligence in education, culture and the audiovisual sector"
agenda_eu14 = "Digital future of Europe- digital single market and use of AI for European consumers"
agenda_eu15 = "Promoting gender equality in science, technology, engineering and mathematics - STEM - education and careers"
agenda_eu18 = "Artificial intelligence in criminal law and its use by the police and judicial authorities in criminal matters"
agenda_eu21 = "The outcome of the EU-US Trade and Technology Council"

# US debates
agenda_us02 = "CONSUMER SAFETY TECHNOLOGY ACT"
agenda_us04 = "FEDERAL CAREER OPPORTUNITIES IN COMPUTER SCIENCE WORK ACT"
agenda_us06 = "75th ANNIVERSARY OF THE OFFICE OF NAVAL RESEARCH"
agenda_us09 = "MSI STEM ACHIEVEMENT ACT"
agenda_us10 = "National Defense Authorization Act"
agenda_us15 = "FUTURE OF RADAR"
# The stray "US16_" file-name prefix is removed so this title matches the others.
agenda_us16 = "DEPARTMENT OF ENERGY SCIENCE FOR THE FUTURE ACT"
agenda_us18 = "STATEMENTS ON INTRODUCED BILLS AND JOINT RESOLUTIONS"
agenda_us20 = "INTRODUCTION OF THE TRANSATLANTIC TELECOMMUNICATIONS SECURITY ACT"
agenda_us32 = "NATIONAL PULSE MEMORIAL"
agenda_us37 = "ENDLESS FRONTIER ACT"

In [None]:
def _tokenize_agenda(agenda):
    """Tokenise an agenda title with word_tokenize.

    Input that is not a string is returned unchanged. Each name below is
    rebound to its own tokenised value, so without this guard a second run of
    the cell would pass an already-tokenised value back into word_tokenize.
    """
    if isinstance(agenda, str):
        return word_tokenize(agenda)
    return agenda


# Replace every agenda title by its tokens (EU debates first, then US).
agenda_eu02 = _tokenize_agenda(agenda_eu02)
agenda_eu03 = _tokenize_agenda(agenda_eu03)
agenda_eu11 = _tokenize_agenda(agenda_eu11)
agenda_eu13 = _tokenize_agenda(agenda_eu13)
agenda_eu14 = _tokenize_agenda(agenda_eu14)
agenda_eu15 = _tokenize_agenda(agenda_eu15)
agenda_eu18 = _tokenize_agenda(agenda_eu18)
agenda_eu21 = _tokenize_agenda(agenda_eu21)
agenda_us02 = _tokenize_agenda(agenda_us02)
agenda_us04 = _tokenize_agenda(agenda_us04)
agenda_us06 = _tokenize_agenda(agenda_us06)
agenda_us09 = _tokenize_agenda(agenda_us09)
agenda_us10 = _tokenize_agenda(agenda_us10)
agenda_us15 = _tokenize_agenda(agenda_us15)
agenda_us16 = _tokenize_agenda(agenda_us16)
agenda_us18 = _tokenize_agenda(agenda_us18)
agenda_us20 = _tokenize_agenda(agenda_us20)
agenda_us32 = _tokenize_agenda(agenda_us32)
agenda_us37 = _tokenize_agenda(agenda_us37)


In [None]:
# All EU agenda titles, in document order.
agendas_eu = [
    agenda_eu02, agenda_eu03, agenda_eu11, agenda_eu13,
    agenda_eu14, agenda_eu15, agenda_eu18, agenda_eu21,
]

# All US agenda titles, in document order.
agendas_us = [
    agenda_us02, agenda_us04, agenda_us06, agenda_us09,
    agenda_us10, agenda_us15, agenda_us16, agenda_us18,
    agenda_us20, agenda_us32, agenda_us37,
]

In [None]:
# Pre-process every agenda title and keep the results.
# Assigning to the loop variable (`text = prepare_for_lda(text)`) only rebinds
# that name, so the processed values would be thrown away. They are collected
# into new lists here; `agendas_eu` / `agendas_us` stay untouched, which also
# keeps this cell safe to re-run.
agendas_eu_prepared = [prepare_for_lda(text) for text in agendas_eu]
agendas_us_prepared = [prepare_for_lda(text) for text in agendas_us]