In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import os
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter
import seaborn as sns
import string
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
import gensim

In [None]:
import math
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS

In [None]:
# Load the question/answer table; its 'Question' and 'Answer' columns are used below.
df_all = pd.read_csv('questions - questions-2.csv')

In [None]:
# Load the second table; its 'Question' column is preprocessed separately further down.
df_trans_links = pd.read_csv('Questions_links_translations.csv')

In [None]:
# Preview the first rows of df_all.
df_all.head()

In [None]:
# Preview the first rows of df_trans_links.
df_trans_links.head()

In [None]:
# The 'Question' column of df_all; iterated over by the preprocessing cells.
questions = df_all['Question']

In [None]:
# The 'Answer' column of df_all; each entry is passed through str() before preprocessing.
answers = df_all['Answer']

### Pre-processing

In [None]:
# NLTK's Dutch stopword list; passed to process_text as `stop_words`.
stop = stopwords.words('dutch')

In [None]:
def tokenize(text):
    """
        Split a text into tokens with NLTK's word tokenizer set to Dutch.
        Input: text - type(str)
        Output: the tokens found in the text - type(list)
    """
    return word_tokenize(text, language='dutch')

In [None]:
def stem_token(token):
    """
        Stems the given token with NLTK's Snowball stemmer for Dutch.

        The previous implementation used the PorterStemmer, which only
        implements English suffix rules and therefore did not work for
        the Dutch texts processed in this notebook.
        Input: a single token - type(str)
        Output: the stem of the token - type(str)
    """
    stemmer = getattr(stem_token, "_stemmer", None)
    if stemmer is None:
        # Imported and built lazily, then cached on the function object, so
        # the stemmer is constructed once instead of once per token.
        from nltk.stem.snowball import SnowballStemmer
        stemmer = stem_token._stemmer = SnowballStemmer("dutch")
    return stemmer.stem(token)

In [None]:
def process_text(text, stop_words, stem=False, remove_stopwords=False, lowercase_text=False, remove_punct=False):
    """
    Tokenize a string and filter/normalize its tokens.

    Input:
        text - type(str), the text to process
        stop_words - iterable of lowercase stopwords; only consulted when
            remove_stopwords is True
        stem - if True, every kept token is passed through stem_token
        remove_stopwords - if True, tokens whose lowercase form is a
            stopword are dropped
        lowercase_text - if True, every kept token is lowercased
        remove_punct - despite its name, this flag drops tokens that
            consist of digits only
    Output: the kept tokens in their original order - type(list)

    Independently of the flags, tokens made up solely of ASCII punctuation
    characters and tokens shorter than two characters are always dropped.
    """
    # Set lookups instead of scanning the stopword list for every token.
    stop_set = frozenset(stop_words) if remove_stopwords else frozenset()
    punctuation = frozenset(string.punctuation)

    tokens = []
    for token in tokenize(text):
        if remove_stopwords and token.lower() in stop_set:
            continue
        if remove_punct and token.isdigit():
            continue
        # Character-wise test: `token in string.punctuation` is a substring
        # check and let multi-character tokens such as '...' or "''" through.
        if all(char in punctuation for char in token):
            continue
        if len(token) < 2:
            continue
        if stem:
            token = stem_token(token)
        if lowercase_text:
            token = token.lower()
        tokens.append(token)

    return tokens

In [None]:
# Quick check: run the preprocessing on the question stored under index label 0.
process_text(questions[0], stop, stem=False, remove_stopwords=True, lowercase_text=True, remove_punct=True)

In [None]:
# Preprocess every question (stopwords removed, lowercased, digit-only
# tokens dropped) and record how many tokens each one keeps.
questions_prepr = [
    process_text(question, stop, remove_stopwords=True, lowercase_text=True, remove_punct=True)
    for question in questions
]
questions_len = [len(kept_tokens) for kept_tokens in questions_prepr]

In [None]:
# Build the token dictionary for the questions, then encode every question
# as a bag of words with it.
dic = gensim.corpora.Dictionary(questions_prepr)
bow_corpus = list(map(dic.doc2bow, questions_prepr))

In [None]:
# Fit a 20-topic LDA model on the question corpus (random_state fixed at 30).
lda_settings_q = dict(num_topics=20, id2word=dic, passes=10, workers=2, random_state=30)
lda_model_q = gensim.models.LdaMulticore(bow_corpus, **lda_settings_q)

In [None]:
# Display 20 topics of the question model, i.e. all topics it was trained with.
lda_model_q.show_topics(20)

In [None]:
# Grid of word clouds, one panel per topic of the question LDA model.
K = 20  # number of topics requested from lda_model_q
nb_columns = 5
nb_rows = math.ceil(K / nb_columns)

# One colour per topic; the Tableau palette is repeated when K exceeds its size.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
cols = cols * math.ceil(K / len(cols))

topics = lda_model_q.show_topics(num_topics=K, num_words=10, formatted=False)

# squeeze=False keeps `axes` two-dimensional even for a single row or column,
# so axes.flatten() below always works.
fig, axes = plt.subplots(ncols=nb_columns, nrows=nb_rows,
                         figsize=(4*nb_columns, 4*nb_rows),
                         sharex=True, sharey=True, squeeze=False)

# Switch off the frame and ticks of every panel, including panels that end up
# without a topic (plt.axis('off') only affected the last axes).
for ax in axes.flatten():
    ax.axis('off')

for i, (topic, ax) in enumerate(zip(topics, axes.flatten())):
    topic_words = dict(topic[1])
    # The topic colour is bound as a default argument, so the colour function
    # no longer depends on the value of the global loop variable `i` at the
    # moment the cloud is drawn.
    cloud = WordCloud(background_color='white',
                      width=400,
                      height=400,
                      max_words=10,
                      color_func=lambda *args, _color=cols[i], **kwargs: _color,
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)


plt.subplots_adjust(wspace=0, hspace=0)
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# Preprocess every answer the same way as the questions; str() is applied
# first so that non-string cells can be tokenized as well.
answers_prepr = [
    process_text(str(answer), stop, remove_stopwords=True, lowercase_text=True, remove_punct=True)
    for answer in answers
]
answers_len = [len(kept_tokens) for kept_tokens in answers_prepr]

In [None]:
# Build the token dictionary and bag-of-words corpus for the answers.
# Note: this rebinds `dic` and `bow_corpus`, which held the question data before.
dic = gensim.corpora.Dictionary(answers_prepr)
bow_corpus = list(map(dic.doc2bow, answers_prepr))

In [None]:
# Fit a 30-topic LDA model on the current bag-of-words corpus (random_state fixed at 42).
lda_settings_a = dict(num_topics=30, id2word=dic, passes=10, workers=2, random_state=42)
lda_model = gensim.models.LdaMulticore(bow_corpus, **lda_settings_a)

In [None]:
# Display 30 topics of lda_model, i.e. all topics it was trained with.
lda_model.show_topics(30)

In [None]:
# Grid of titled word clouds, one panel per topic of lda_model.
K = 30  # number of topics requested from lda_model
nb_columns = 5
nb_rows = math.ceil(K / nb_columns)

# One colour per topic; the Tableau palette is repeated when K exceeds its size.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
cols = cols * math.ceil(K / len(cols))

topics = lda_model.show_topics(num_topics=K, num_words=10, formatted=False)

# squeeze=False keeps `axes` two-dimensional even for a single row or column,
# so axes.flatten() below always works.
fig, axes = plt.subplots(ncols=nb_columns, nrows=nb_rows,
                         figsize=(4*nb_columns, 4*nb_rows),
                         sharex=True, sharey=True, squeeze=False)

# Switch off the frame and ticks of every panel, including panels that end up
# without a topic (plt.axis('off') only affected the last axes).
for ax in axes.flatten():
    ax.axis('off')

for i, (topic, ax) in enumerate(zip(topics, axes.flatten())):
    topic_words = dict(topic[1])
    # The topic colour is bound as a default argument, so the colour function
    # no longer depends on the value of the global loop variable `i` at the
    # moment the cloud is drawn.
    cloud = WordCloud(background_color='white',
                      width=400,
                      height=400,
                      max_words=10,
                      color_func=lambda *args, _color=cols[i], **kwargs: _color,
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))


plt.subplots_adjust(wspace=0, hspace=0)
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# Counters for Dutch interrogative words, all starting at zero.
question_types = {'wie': 0,
                 'wat': 0,
                 'waar': 0,
                 'wanneer': 0,  # was misspelled 'waneer', so "wanneer" was never counted
                 'waarom': 0,
                 'welke': 0}

In [None]:
# Add one to the matching counter for every token of every preprocessed
# question that is a key of question_types.
for tokens in questions_prepr:
    for token in tokens:
        if token not in question_types:
            continue
        question_types[token] = question_types[token] + 1

In [None]:
# Show the interrogative counts.
question_types

Is there a reason why 'wie' and 'wat' are stopwords? Is 'wie' like in German?

In [None]:
# Zero-initialised counters for the words taken as markers of an opinion question.
opinion_words = dict.fromkeys([
    'mening', 'convictie', 'denkbeeld', 'denkwijs', 'denkwijze', 'dunk',
    'gedacht', 'gedachte', 'geest', 'gevoelen', 'gezindheid', 'idee',
    'inzicht', 'inzien', 'kijk', 'oordeel', 'opinie', 'bevindingen',
    'besluiten', 'beslissend', 'stellingname', 'visie', 'zienswijze', 'zin',
    'bekend', 'college', 'vindt',
], 0)

In [None]:
# Count every occurrence of each opinion word across the preprocessed questions.
for tokens in questions_prepr:
    for token in tokens:
        if token not in opinion_words:
            continue
        opinion_words[token] = opinion_words[token] + 1

In [None]:
# Show the opinion-word counts.
opinion_words

In [None]:
# Reset the counters, then count in how many questions each opinion word
# occurs: a question adds at most 1 per word, however often it repeats it.
opinion_words = dict.fromkeys([
    'mening', 'convictie', 'denkbeeld', 'denkwijs', 'denkwijze', 'dunk',
    'gedacht', 'gedachte', 'geest', 'gevoelen', 'gezindheid', 'idee',
    'inzicht', 'inzien', 'kijk', 'oordeel', 'opinie', 'bevindingen',
    'besluiten', 'beslissend', 'stellingname', 'visie', 'zienswijze', 'zin',
    'bekend', 'college', 'vindt',
], 0)
for tokens in questions_prepr:
    tokens_present = set(tokens)
    for word in opinion_words:
        if word in tokens_present:
            opinion_words[word] = opinion_words[word] + 1
opinion_words

In [None]:
# Zero-initialised counters for the second question set: the opinion-word
# list plus 'standpunt', 'bereid' and 'kennisgenomen'.
opinion_words_links = dict.fromkeys([
    'mening', 'convictie', 'denkbeeld', 'denkwijs', 'denkwijze', 'dunk',
    'gedacht', 'gedachte', 'geest', 'gevoelen', 'gezindheid', 'idee',
    'inzicht', 'inzien', 'kijk', 'oordeel', 'opinie', 'bevindingen',
    'besluiten', 'beslissend', 'stellingname', 'visie', 'zienswijze', 'zin',
    'bekend', 'college', 'vindt', 'standpunt', 'bereid', 'kennisgenomen',
], 0)

In [None]:
# Preprocess the questions of df_trans_links: lowercased, stopwords kept.
questions_prepr_stop_url = [
    process_text(question, stop, remove_stopwords=False, lowercase_text=True)
    for question in df_trans_links['Question']
]

In [None]:
# Count every occurrence of each tracked word in the second question set.
# The membership test has to use opinion_words_links itself: testing against
# opinion_words silently skipped the keys that exist only in
# opinion_words_links ('standpunt', 'bereid', 'kennisgenomen').
for q in questions_prepr_stop_url:
    for w in q:
        if w in opinion_words_links:
            opinion_words_links[w] += 1

In [None]:
# Show the word counts for the second question set.
opinion_words_links

### check n-grams

In [None]:
# Preprocess the questions again, this time keeping the stopwords
# (lowercased, digit-only tokens dropped).
questions_prepr_stop = [
    process_text(question, stop, remove_stopwords=False, lowercase_text=True, remove_punct=True)
    for question in questions
]

In [None]:
def get_top_ngram(corpus, n=None):
    """
    Count the n-grams of a corpus and rank them by frequency.

    Input:
        corpus - iterable of documents, one string per document
        n - length of the n-grams to count; None (the default) counts
            single words
    Output: list of (ngram, count) tuples, most frequent first
    """
    if n is None:
        # (None, None) is not a valid ngram_range, so the declared default
        # could never be used; fall back to unigrams instead of failing.
        n = 1
    vec = CountVectorizer(ngram_range=(n, n))
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total number of occurrences of each n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    return sorted(words_freq, key=lambda pair: pair[1], reverse=True)

In [None]:
# CountVectorizer expects one string per document and builds n-grams from
# consecutive words inside a document. Flattening all questions into one list
# of single tokens (the previous sum(..., [])) turned every token into its own
# document, so no real bigrams or trigrams could be found.
corpus_q = [' '.join(tokens) for tokens in questions_prepr_stop]

In [None]:
# Bar chart of the ten highest-ranked trigrams (labels on the y axis, counts on the x axis).
top_n_trigrams = get_top_ngram(corpus_q, 3)
x, y = [list(column) for column in zip(*top_n_trigrams[:10])]
sns.barplot(x=y, y=x)

In [None]:
# Full ranked list of trigrams with their counts.
top_n_trigrams = get_top_ngram(corpus=corpus_q, n=3)
top_n_trigrams

In [None]:
# Bar chart of the ten highest-ranked bigrams. The slice matches the trigram
# chart; without it every distinct bigram became its own bar.
top_n_bigrams = get_top_ngram(corpus_q, 2)
x, y = map(list, zip(*top_n_bigrams[:10]))
sns.barplot(x=y, y=x)

In [None]:
# Full ranked list of bigrams with their counts.
top_n_bigrams = get_top_ngram(corpus=corpus_q, n=2)
top_n_bigrams

In [None]:
question_types_stop = {'wie': 0,
                 'wat': 0,
                 'waar': 0,
                 'waneer': 0,
                 'waarom': 0,
                 'welke': 0}

In [None]:
for q in questions_prepr_stop:
    for w in q:
        if w in question_types_stop:
            question_types[w]+=1

In [None]:
question_types

In [None]:
# First token of every question. Questions with no tokens left after
# preprocessing are skipped; indexing q[0] on them raised an IndexError.
q_start = [q[0] for q in questions_prepr_stop if q]

In [None]:
# Frequency of each opening token collected in q_start.
counted = Counter(q_start)

In [None]:
# NOTE(review): looks like leftover debugging output. `x` is whatever the most
# recently executed plotting cell unpacked into it; consider removing this cell.
x

In [None]:
# Bar chart of the 25 most common opening tokens (labels on the y axis, counts on the x axis).
most_common_starts = counted.most_common(25)
x, y = [list(column) for column in zip(*most_common_starts)]
sns.barplot(x=y, y=x, color='red')

In [None]:
# Token at position 2 (the third token) of every question. Questions with
# fewer than three tokens are skipped; indexing q[2] on them raised an IndexError.
# NOTE(review): the name q_start_2 suggests the second token (index 1) may
# have been intended; confirm which position is wanted.
q_start_2 = [q[2] for q in questions_prepr_stop if len(q) > 2]
counted2 = Counter(q_start_2)
counted2.most_common()