### *Data Analysis*
## Core Analysis - LSA Approach
---


In [15]:
# Import necessary libraries
import nltk, re, pprint
import json
from nltk import word_tokenize
from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import os.path 
import pandas as pd
import os
import re
from nltk.corpus import PlaintextCorpusReader 
from nltk.app import concordance
from nltk.corpus import BracketParseCorpusReader
import numpy as np
import statsmodels.formula.api as smf
import altair as alt
import tmtoolkit
import spacy as spacy
import logging, warnings
from tmtoolkit.corpus import Corpus
import gensim
from gensim import corpora, models
nltk.download('omw-1.4')
import pickle
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.gensim_models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/charlottekaiser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [16]:
# Import stopwords
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charlottekaiser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Add sample specific stop words that are redundant and have no substantial relevance; also add words that are project-specific stopwords
stopwords.add('president')	
stopwords.add('mr')
stopwords.add('ms')
stopwords.add('commission')
stopwords.add('congress')
stopwords.add('speaker')
stopwords.add('also')
stopwords.add('artificial')
stopwords.add('intelligence')
stopwords.add('digital')
stopwords.add('ai')
stopwords.add('pro')
stopwords.add('tempore')
stopwords.add('representative')
stopwords.add('thank')
stopwords.add('dear')
stopwords.add('rapporteur')
stopwords.add('lady')
stopwords.add('committee')
stopwords.add('report')
stopwords.add('legislation')
stopwords.add('like')
stopwords.add('subcommittee')
stopwords.add('gentleman')
stopwords.add('r')
stopwords.add('colleague')
stopwords.add('madam')



In [24]:
# Define new function
# NLTKâ€™s Wordnet stores meanings of words, synonyms, antonyms, etc. - for ref, see: https://www.nltk.org/_modules/nltk/corpus/reader/wordnet.html 
# WordNetLemmatizer gets the root, for ref, see: https://www.nltk.org/_modules/nltk/stem/wordnet.html
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charlottekaiser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Define tokenizer for nltk using RegexpTokenizer, to keep tokens that are alphanumeric characters, get rid off punctuation
tokenizer = RegexpTokenizer(r'\w+') 
# Define a noun tagger 
is_noun = lambda pos: pos[:2] == 'NN'

In [62]:
# Define a function to preprocess for LDA
def prepare_for_lda(text):
    text = ''.join(c for c in text if not c.isdigit())
    tokens = tokenizer.tokenize(text)
    tokens = [get_lemma(token) for token in tokens]
    tags = nltk.pos_tag(tokens)
    tokens = [word for word,pos in tags if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')] #keep only nouns
    tokens = [w.split() for w in tokens if w not in stopwords] # get rid off stopwords
    return tokens

In [58]:
os.listdir('.')
os.chdir('/Users/charlottekaiser/Documents/uni/Hertie/master_thesis/00_data/40_relevant_debates')

In [59]:
documents = [
"EU02_Democratic scrutiny of social media and the protection of fundamental rights.txt", 
"EU03_European strategy for data - Commission evaluation report on the implementation of the General Data Protection Regulation two years after its application.txt",
"EU11_Digital Europe programme.txt", 
"EU13_Artificial intelligence in education, culture and the audiovisual sector.txt", 
"EU14_Digital future of Europe- digital single market and use of AI for European consumers.txt",
"EU15_ Promoting gender equality in science, technology, engineering and mathematics - STEM - education and careers.txt",
"EU18_Artificial intelligence in criminal law and its use by the police and judicial authorities in criminal matters.txt", 
"EU21_The outcome of the EU-US Trade and Technology Council.txt", 
"US02_CONSUMER SAFETY TECHNOLOGY ACT.txt", 
"US04_FEDERAL CAREER OPPORTUNITIES IN COMPUTER SCIENCE WORK ACT.txt", 
"US06_75th ANNIVERSARY OF THE OFFICE OF NAVAL RESEARCH.txt",
"US09_MSI STEM ACHIEVEMENT ACT.txt", 
"US10_National Defense Authorization Act.txt", 
"US15_FUTURE OF RADAR.txt", "US16_DEPARTMENT OF ENERGY SCIENCE FOR THE FUTURE ACT.txt",
"US18_STATEMENTS ON INTRODUCED BILLS AND JOINT RESOLUTIONS.txt", 
"US20_INTRODUCTION OF THE TRANSATLANTIC TELECOMMUNICATIONS SECURITY ACT.txt",
"US32_NATIONAL PULSE MEMORIAL.txt", 
"US37_ENDLESS FRONTIER ACT.txt"]

In [60]:
raw_eu02 = open("EU02_Democratic scrutiny of social media and the protection of fundamental rights.txt").read()
raw_eu03 = open("EU03_European strategy for data - Commission evaluation report on the implementation of the General Data Protection Regulation two years after its application.txt").read()
raw_eu11 = open("EU11_Digital Europe programme.txt").read()
raw_eu13 = open("EU13_Artificial intelligence in education, culture and the audiovisual sector.txt").read()
raw_eu14 = open("EU14_Digital future of Europe- digital single market and use of AI for European consumers.txt").read()
raw_eu15 = open("EU15_ Promoting gender equality in science, technology, engineering and mathematics - STEM - education and careers.txt").read()
raw_eu18 = open("EU18_Artificial intelligence in criminal law and its use by the police and judicial authorities in criminal matters.txt").read()
raw_eu21 = open("EU21_The outcome of the EU-US Trade and Technology Council.txt").read()
raw_us02 = open("US02_CONSUMER SAFETY TECHNOLOGY ACT.txt").read()
raw_us04 = open("US04_FEDERAL CAREER OPPORTUNITIES IN COMPUTER SCIENCE WORK ACT.txt").read()
raw_us06 = open("US06_75th ANNIVERSARY OF THE OFFICE OF NAVAL RESEARCH.txt").read()
raw_us09 = open("US09_MSI STEM ACHIEVEMENT ACT.txt").read()
raw_us10 = open("US10_National Defense Authorization Act.txt").read()
raw_us15 = open("US15_FUTURE OF RADAR.txt").read()
raw_us16 = open("US16_DEPARTMENT OF ENERGY SCIENCE FOR THE FUTURE ACT.txt").read()
raw_us18 = open("US18_STATEMENTS ON INTRODUCED BILLS AND JOINT RESOLUTIONS.txt").read()
raw_us20 = open("US20_INTRODUCTION OF THE TRANSATLANTIC TELECOMMUNICATIONS SECURITY ACT.txt").read()
raw_us32 = open("US32_NATIONAL PULSE MEMORIAL.txt").read()
raw_us37 = open("US37_ENDLESS FRONTIER ACT.txt").read()

In [50]:
docs = [raw_eu02, raw_eu03]

In [73]:
# assign data of lists.  
data = {'debate': ['EU02_Democratic scrutiny of social media', 
'EU03_European strategy for data'], 
'text': [raw_eu02, raw_eu03]}  
  
# Create DataFrame  
df = pd.DataFrame(data)  
  
# Print the output.  
df

Unnamed: 0,debate,text
0,EU02_Democratic scrutiny of social media,"['ana', 'paula', 'zacarias', ',', 'president-i..."
1,EU03_European strategy for data,"['henna', 'virkkunen', '(', 'ppe', ')', '.', '..."


In [64]:
df['clean_text']=df['text'].apply(prepare_for_lda)

In [65]:
df.drop(['text'],axis=1,inplace=True)

In [66]:
df

Unnamed: 0,debate,clean_text
0,EU02_Democratic scrutiny of social media,"[[paula], [zacarias], [office], [council], [we..."
1,EU03_European strategy for data,"[[henna], [virkkunen], [ppe], [year], [ha], [a..."
