### *Data Analysis*
##  Pre-processing
---
Goal: 
- process the relevant debate text files and create a dataframe with level (EU = 1, US =0), debate title and text
- conduct preprocessing steps such as lemmatizing, get rid of punctuations,  <br> <br>
*conducted in March 2022*

In [43]:
# Import necessary libraries
import nltk, re, pprint
import json
from nltk import word_tokenize
from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import nltk.data
import os.path 
import glob
import pandas as pd
import os
import re
from nltk.corpus import PlaintextCorpusReader 
from nltk.app import concordance
from nltk.corpus import BracketParseCorpusReader
import numpy as np
import contractions
import statsmodels.formula.api as smf
import altair as alt
import tmtoolkit
import spacy as spacy
import logging, warnings
from tmtoolkit.corpus import Corpus
import gensim
from gensim import corpora, models
nltk.download('omw-1.4')
import pickle
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.gensim_models


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/charlottekaiser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


---
### 1. Prerequisites and pre-processing
---

In [31]:
# Import stopwords
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charlottekaiser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
# Add sample specific stop words that are redundant and have no substantial relevance; also add words that are project-specific stopwords
stopwords.add('president')
stopwords.add('mr')
stopwords.add('ms')
stopwords.add('commission')
stopwords.add('congress')
stopwords.add('speaker')
stopwords.add('also')
stopwords.add('artificial')
stopwords.add('intelligence')
stopwords.add('digital')
stopwords.add('ai')
stopwords.add('pro')
stopwords.add('tempore')
stopwords.add('representative')
stopwords.add('thank')
stopwords.add('dear')
stopwords.add('rapporteur')
stopwords.add('lady')
stopwords.add('committee')
stopwords.add('report')
stopwords.add('legislation')
stopwords.add('like')
stopwords.add('subcommittee')
stopwords.add('gentleman')
stopwords.add('r')
stopwords.add('colleague')
stopwords.add('madam')
stopwords.add('ha')
stopwords.add('wa')
stopwords.add('for')
stopwords.add('in')
stopwords.add('-')
stopwords.add(',')
stopwords.add('and')
stopwords.add('house')
stopwords.add('chairwoman')
stopwords.add('sponsor')
stopwords.add('gentlewoman')
stopwords.add('verts')
stopwords.add('renew')

In [33]:
# Define new function
# NLTK’s Wordnet stores meanings of words, synonyms, antonyms, etc. - for ref, see: https://www.nltk.org/_modules/nltk/corpus/reader/wordnet.html 
# WordNetLemmatizer gets the root, for ref, see: https://www.nltk.org/_modules/nltk/stem/wordnet.html
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charlottekaiser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
# Define tokenizer for nltk using RegexpTokenizer, to keep tokens that are alphanumeric characters, get rid off punctuation
tokenizer = RegexpTokenizer(r'\w+') 
# Define a noun tagger 
is_noun = lambda pos: pos[:2] == 'NN'

In [35]:
# Define a function to preprocess for LDA
def prepare_for_lda(text):
    text = ''.join(c for c in text if not c.isdigit())
    tokens = tokenizer.tokenize(text)
    tokens = [get_lemma(token) for token in tokens]
    tags = nltk.pos_tag(tokens)
    # tokens = [word for word,pos in tags if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')] #keep only nouns
    return tokens

In [44]:
os.chdir('/Users/charlottekaiser/Documents/uni/Hertie/master_thesis/00_data/50_analysis')
with open('raw_corpus_aggregate.csv') as f:
    df = pd.read_csv(f)
f.close()
df.drop('Unnamed: 0', axis=1, inplace=True)

In [38]:
for row in df:
    text = row[3:]
    text = prepare_for_lda(text)

In [39]:
df['without_stopwords'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))

In [40]:
df.head()

Unnamed: 0,eu,debate,text,without_stopwords
0,1,EU02_Democratic scrutiny of social media and t...,"['ana', 'paula', 'zacarias', ',', 'president-i...","['ana', 'paula', 'zacarias', ',', 'president-i..."
1,1,EU03_European strategy for data - Commission e...,"['henna', 'virkkunen', '(', 'ppe', ')', '.', '...","['henna', 'virkkunen', '(', 'ppe', ')', '.', '..."
2,1,EU11_Digital Europe programme.txt,"['valter', 'flego', ',', 'reporter', '.', '-',...","['valter', 'flego', ',', 'reporter', '.', '-',..."
3,1,"EU13_Artificial intelligence in education, cul...","['sabine', 'verheyen', ',', 'rapporteur', '.',...","['sabine', 'verheyen', ',', 'rapporteur', '.',..."
4,1,EU14_Digital future of Europe- digital single ...,"['deirdre', 'clune', ',', 'rapporteur', '.', '...","['deirdre', 'clune', ',', 'rapporteur', '.', '..."


In [45]:
# Save processed dataframe for further analysis
os.chdir('/Users/charlottekaiser/Documents/uni/Hertie/master_thesis/00_data/50_analysis')
df.to_csv('ready_corpus_aggregate.csv')