# Exploratory Data Analysis

In [None]:
# Uncomment to install required packages for the notebook:
#! pip install gensim==4.3.1
#! pip install matplotlib==3.3.2
#! pip install pandas==2.0.3
#! pip install scikit-learn==1.0.2
#! pip install seaborn==0.11.0
# NOTE(review): nltk and wordcloud are imported further down in this notebook
# but are not listed here; install them separately if they are missing.

In [None]:
import os

# Get the notebook directory
# os.getcwd() returns the process working directory; this assumes the kernel
# was started in the notebook's own folder.
notebook_dir = os.getcwd()

# Get the root directory: abspath() collapses the '../../' (two levels up) and
# drops the trailing separator, then dirname() strips one more path component,
# so root_dir ends up THREE levels above notebook_dir.
# NOTE(review): confirm that three levels up is the intended project root.
root_dir = os.path.dirname(os.path.abspath(os.path.join(notebook_dir, '../../')))

# Change the current working directory to the root directory
# NOTE: re-running this cell moves the working directory up again, because
# notebook_dir is re-read from the (already changed) working directory.
os.chdir(root_dir)

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import gensim
from collections import Counter
from collections import defaultdict

In [None]:
from src_clean.preprocessing.text_preprocessing import preprocess_text
from src_clean.eda.visualizations import plot_distribution, plot_most_common, plot_wordcloud_grid

In [None]:
# Path of the CSV file to load (despite its name, data_dir points to a file,
# not a directory). The path is relative, so it is resolved against the
# current working directory.
data_dir = 'data/question_answer/questions.csv'
df_all = pd.read_csv(data_dir)

In [None]:
# Preview the first rows of the loaded data
df_all.head()

In [None]:
# Summary of the columns, their dtypes and non-null counts
df_all.info()

In [None]:
# The 'Question' column of the loaded data
questions = df_all['Question']

In [None]:
# The 'Answer' column of the loaded data
answers = df_all['Answer']

# Pre-processing

In this section, the questions and answers are pre-processed. Pre-processing is needed so that the analysis is performed on data with less noise (such as stopwords, punctuation, etc.).

### Tokenize function

In [None]:
def tokenize(text):
    """
    Split a text into tokens using NLTK's `word_tokenize` with the Dutch
    language setting.

    Parameters:
    - text (str): The text to tokenize.

    Returns:
    - list: The tokens returned by `word_tokenize(text, language='dutch')`.
    """
    return word_tokenize(text, language='dutch')

## Lengths, Distributions, etc.

### Questions

In [None]:
# Clean every question: lowercase, strip punctuation and stopwords, no stemming
preprocessed_questions = questions.apply(
    lambda raw: preprocess_text(
        str(raw),
        stem=False,
        remove_stopwords=True,
        lowercase_text=True,
        remove_punct=True,
    )
)

# Tokenize the cleaned text and record the number of tokens per question
questions_tokenized = preprocessed_questions.apply(lambda cleaned: tokenize(str(cleaned)))
questions_len = questions_tokenized.apply(len)

In [None]:
# Visualize the per-question token counts with the project's plot_distribution helper.
# NOTE(review): the string arguments are presumably axis labels and the title — confirm against the helper.
plot_distribution(questions_len, 'Question Length', 'Questions', 'Distribution of Question Lengths')

### Answers

In [None]:
# Clean every answer: lowercase, strip punctuation and stopwords, no stemming
preprocessed_answers = answers.apply(
    lambda raw: preprocess_text(
        str(raw),
        stem=False,
        remove_stopwords=True,
        lowercase_text=True,
        remove_punct=True,
    )
)

# Tokenize the cleaned text and record the number of tokens per answer
answers_tokenized = preprocessed_answers.apply(lambda cleaned: tokenize(str(cleaned)))
answers_len = answers_tokenized.apply(len)

In [None]:
# Visualize the per-answer token counts with the project's plot_distribution helper.
# NOTE(review): the string arguments are presumably axis labels and the title — confirm against the helper.
plot_distribution(answers_len, 'Answer Length', 'Answers', 'Distribution of Answer Lengths')

## Word Frequencies

#### Questions

In [None]:
# Flatten the per-question token lists into one list of tokens. The nested
# comprehension is linear in the number of tokens, whereas sum(lists, [])
# copies the accumulated list again for every question (quadratic).
corpus_q = [token for tokens in questions_tokenized for token in tokens]
plot_most_common(corpus_q, 'Question', top_n=20)

#### Answers 

In [None]:
# Flatten the per-answer token lists into one list of tokens. The nested
# comprehension is linear in the number of tokens, whereas sum(lists, [])
# copies the accumulated list again for every answer (quadratic).
corpus_a = [token for tokens in answers_tokenized for token in tokens]
plot_most_common(corpus_a, 'Answer', top_n=20)

#### n-grams

In [None]:
def get_top_ngram(corpus, n=None, top_k=10):
    """
    Return the most frequent word n-grams in a collection of documents.

    Parameters:
    - corpus: Iterable of documents (strings). Each element is treated as one
      document by `CountVectorizer`, so n-grams never span two elements.
    - n: Size of the n-grams (1 = unigrams, 2 = bigrams, ...). Must be given.
    - top_k: Maximum number of n-grams to return (default: 10).

    Returns:
    - list of `(ngram, count)` tuples sorted by decreasing count; an empty
      list when the corpus contains no documents.

    Raises:
    - ValueError: If `n` is missing or smaller than 1.
    """
    # The historical default n=None only failed deep inside scikit-learn;
    # fail early with a message that names the actual problem.
    if n is None or n < 1:
        raise ValueError("n must be a positive integer, e.g. n=2 for bigrams")

    documents = list(corpus)
    if not documents:
        return []

    vec = CountVectorizer(ngram_range=(n, n))
    bag_of_words = vec.fit_transform(documents)

    # Column sums give the total number of occurrences of every n-gram
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]

In [None]:
# Most frequent trigrams in the questions.
# The vectorizer needs whole documents: corpus_q is a flat list with one token
# per entry, so word n-grams could never be formed from it. The pre-processed
# question strings are used instead.
top_trigrams_q = get_top_ngram(preprocessed_questions, 3)
trigram_labels, trigram_counts = map(list, zip(*top_trigrams_q))

ax = sns.barplot(x=trigram_counts, y=trigram_labels)
ax.set(title='Most Common Trigrams in Questions', xlabel='Frequency', ylabel='Trigram')
plt.show()

In [None]:
# Most frequent bigrams in the answers.
# The vectorizer needs whole documents: corpus_a is a flat list with one token
# per entry, so word n-grams could never be formed from it. The pre-processed
# answer strings are used instead.
top_bigrams_a = get_top_ngram(preprocessed_answers, 2)
bigram_labels, bigram_counts = map(list, zip(*top_bigrams_a))

ax = sns.barplot(x=bigram_counts, y=bigram_labels)
ax.set(title='Most Common Bigrams in Answers', xlabel='Frequency', ylabel='Bigram')
plt.show()

#### TF-IDF: Questions

In [None]:
# TF-IDF over uni-, bi- and trigrams; terms that occur in more than 60% of the
# documents or in fewer than 3 documents are ignored
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.6, min_df=3)

# Learn the vocabulary from the pre-processed questions and vectorize them
tfidf_q = vectorizer.fit_transform(preprocessed_questions)


In [None]:
# Sum the TF-IDF weight of every term over all questions and rank the terms
feature_names = vectorizer.get_feature_names_out()
tf_idf_sum = tfidf_q.sum(axis=0)
tf_idf_scores = sorted(
    ((feature, tf_idf_sum[0, col]) for col, feature in enumerate(feature_names)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the 20 highest-scoring terms in the corpus
for term, score in tf_idf_scores[:20]:
    print(f"{term}: {score:.2f}")


#### TF-IDF: Answers

In [None]:
# Same TF-IDF settings as for the questions, fitted on the pre-processed answers.
# NOTE: this rebinds `vectorizer`, replacing the one fitted on the questions.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.6, min_df=3)
tfidf_a = vectorizer.fit_transform(preprocessed_answers)

In [None]:
# Sum the TF-IDF weight of every term over all answers and rank the terms
feature_names = vectorizer.get_feature_names_out()
tf_idf_sum = tfidf_a.sum(axis=0)
tf_idf_scores = sorted(
    ((feature, tf_idf_sum[0, col]) for col, feature in enumerate(feature_names)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the 20 highest-scoring terms in the corpus
for term, score in tf_idf_scores[:20]:
    print(f"{term}: {score:.2f}")



## Topic Modeling 

##### Remove some additional frequent words to make the topics clearer

In [None]:
# Additional frequent words that are filtered out of the tokenized questions
# before topic modelling
words_to_remove = [
    'welke', 'amsterdam', 'waarom', 'gemeente', 'nee', 'hoeveel', 'bereid',
    'college', 'gaat', 'bekend', 'graag', 'fractie', 'aangeven', 'mening',
    'amsterdamse', 'toelichting', 'klopt', 'gaan', 'mogelijk',
    'footnotestart', 'footnoteend',
]

def remove_stopwords(words, stopwords):
    """
    Return the words that do not occur in `stopwords`, keeping their order.

    Parameters:
    - words: Iterable of tokens.
    - stopwords: Container of tokens to drop.

    Returns:
    - list: The remaining tokens.
    """
    kept = []
    for word in words:
        if word in stopwords:
            continue
        kept.append(word)
    return kept

# Drop the additional frequent words from every tokenized question.
# NOTE: this overwrites questions_tokenized, so every cell executed afterwards
# sees the filtered tokens.
questions_tokenized = questions_tokenized.apply(lambda x: remove_stopwords(x, words_to_remove))

def remove_short_words(word_list):
    """
    Return the words that are longer than three characters, keeping their order.

    Parameters:
    - word_list: Iterable of tokens.

    Returns:
    - list: The tokens with more than three characters.
    """
    return [word for word in word_list if len(word) > 3]

# Keep only the tokens with more than three characters in every question.
# NOTE: this overwrites questions_tokenized again.
questions_tokenized = questions_tokenized.apply(lambda word_list: remove_short_words(word_list))

### Questions

In [None]:
from wordcloud import WordCloud
import math
from matplotlib import colors as mcolors

In [None]:
def plot_wordcloud_grid(lda_model, num_topics, num_words, ncols, width=4, height=3):
    """
    Create a grid of word clouds for multiple topics from an LDA model.

    Note: this definition replaces the `plot_wordcloud_grid` that is imported
    from `src_clean.eda.visualizations` earlier in the notebook.

    Parameters:
    - lda_model: Trained LDA model; must provide
      `show_topics(num_topics=..., num_words=..., formatted=False)`.
    - num_topics: Number of topics to display (at least 1).
    - num_words: Number of top words to include in each topic's word cloud.
    - ncols: Number of columns in the grid (at least 1).
    - width: Width of each word cloud plot (default: 4).
    - height: Height of each word cloud plot (default: 3).

    Raises:
    - ValueError: If `num_topics` or `ncols` is smaller than 1.
    """
    if num_topics < 1 or ncols < 1:
        raise ValueError("num_topics and ncols must both be at least 1")

    nb_rows = math.ceil(num_topics / ncols)

    # Repeat the Tableau palette often enough to have one colour per topic
    palette = list(mcolors.TABLEAU_COLORS.values())
    palette = palette * math.ceil(num_topics / len(palette))

    topics = lda_model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a single-panel (1 x 1) grid
    fig, axes = plt.subplots(ncols=ncols, nrows=nb_rows,
                             figsize=(width*ncols, height*nb_rows),
                             sharex=True, sharey=True,
                             squeeze=False)
    panels = axes.flatten()

    # Hide ticks and frames on every panel, including panels without a topic
    for ax in panels:
        ax.axis('off')

    for topic_color, topic, ax in zip(palette, topics, panels):
        # Bind the colour through a default argument so that each cloud keeps
        # its own colour regardless of later loop iterations
        cloud = WordCloud(background_color='white',
                          width=400,
                          height=400,
                          max_words=num_words,
                          color_func=lambda *args, _color=topic_color, **kwargs: _color,
                          prefer_horizontal=1.0)
        topic_words = dict(topic[1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        ax.imshow(cloud)

    plt.subplots_adjust(wspace=0, hspace=0)
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()

In [None]:
# Settings shared by the LDA model and the word-cloud grid
num_topics = 18
num_words = 10
ncols = 6

# Dictionary and bag-of-words corpus built from the tokenized questions
dic = gensim.corpora.Dictionary(questions_tokenized)
bow_corpus = [dic.doc2bow(tokens) for tokens in questions_tokenized]

# Fixed random_state for reproducible topics
lda_model_q = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=num_topics,
    id2word=dic,
    passes=10,
    workers=2,
    random_state=30,
)

plot_wordcloud_grid(lda_model_q, num_topics, num_words, ncols)


### Answers

In [None]:
# One topic count is shared by the model and the plot so that every trained
# topic gets a panel (training 20 topics while plotting 18 silently hides two
# of them). 20 topics fill a 4 x 5 grid exactly.
num_topics = 20
num_words = 10
ncols = 5

# Dictionary and bag-of-words corpus built from the tokenized answers.
# NOTE: this rebinds `dic` and `bow_corpus`, which previously held the question data.
# NOTE(review): unlike the questions, the answer tokens are not filtered with
# words_to_remove / remove_short_words — confirm whether that is intended.
dic = gensim.corpora.Dictionary(answers_tokenized)
bow_corpus = [dic.doc2bow(doc) for doc in answers_tokenized]

# Fixed random_state for reproducible topics
lda_model_a = gensim.models.LdaMulticore(bow_corpus,
                                         num_topics=num_topics,
                                         id2word=dic,
                                         passes=10,
                                         workers=2,
                                         random_state=30)

plot_wordcloud_grid(lda_model_a, num_topics, num_words, ncols)


### Locating Outliers 

#### Questions

In [None]:
# Attach the token counts of the pre-processed questions and answers to the
# data frame. NOTE: this adds two columns to df_all in place.
df_all['Question Len'] = questions_len
df_all['Answer Len'] = answers_len

In [None]:
# Summary statistics, used here to inspect the 'Question Len' and 'Answer Len' columns
df_all.describe()

The mean is far from the maximum values, which suggests the presence of outliers, even allowing for the fact that the mean itself is strongly affected by extreme values.

In [None]:
def find_outliers_IQR(df):
    """
    Select the values that lie outside the 1.5 * IQR fences.

    Parameters:
    - df: pandas object with numeric values (used with a Series in this notebook).

    Returns:
    - The entries of `df` that are below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
    """
    q1, q3 = df.quantile(0.25), df.quantile(0.75)
    IQR = q3 - q1

    below_lower_fence = df < (q1 - 1.5 * IQR)
    above_upper_fence = df > (q3 + 1.5 * IQR)

    return df[below_lower_fence | above_upper_fence]

In [None]:
# Question lengths flagged by the IQR rule, sorted in ascending order
outliers_q = find_outliers_IQR(df_all['Question Len']).sort_values()

In [None]:
# Answer lengths flagged by the IQR rule, sorted in ascending order
outliers_a = find_outliers_IQR(df_all['Answer Len']).sort_values()

In [None]:
# Number of outlying questions, then number of outlying answers
print(len(outliers_q))
print(len(outliers_a))

## How do questions start?

In [None]:
# Clean the questions again, this time KEEPING stopwords so that the opening
# word of each question is preserved.
# NOTE: despite the `_no_stem` suffix, the only difference from
# preprocessed_questions is remove_stopwords=False; stem=False in both.
preprocessed_questions_no_stem = questions.apply(
    lambda raw: preprocess_text(
        str(raw),
        stem=False,
        remove_stopwords=False,
        lowercase_text=True,
        remove_punct=True,
    )
)
questions_tokenized_no_stem = preprocessed_questions_no_stem.apply(
    lambda cleaned: tokenize(str(cleaned))
)

In [None]:
# First token of every question. Questions whose token list is empty (nothing
# left after pre-processing) are skipped instead of raising an IndexError.
q_start = [q[0] for q in questions_tokenized_no_stem if q]
counted = Counter(q_start)

# Split the 25 most common opening words into labels (x) and counts (y)
x, y = zip(*counted.most_common(25))

plt.figure(figsize=(10, 6))
sns.barplot(x=list(y), y=list(x), color='red')
plt.xlabel('Frequency')
plt.ylabel('Question Start')
plt.title('Most Common Question Starts')
plt.show()


## Counting Opinion and Factual Words

In [None]:
# Candidate opinion words. Only the keys are used below; the zero values are
# never read or updated.
opinion_words = {
    'mening': 0,
    'convictie': 0,
    'denkbeeld': 0,
    'denkwijs': 0,
    'denkwijze': 0,
    'dunk': 0,
    'gedacht': 0,
    'gedachte': 0,
    'geest': 0,
    'gevoelen': 0,
    'gezindheid': 0,
    'idee': 0,
    'inzicht': 0,
    'inzien': 0,
    'kijk': 0,
    'oordeel': 0,
    'opinie': 0,
    'bevindingen': 0,
    'besluiten': 0,
    'beslissend': 0,
    'stellingname': 0,
    'visie': 0,
    'zienswijze': 0,
    'zin': 0,
    'bekend': 0,
    'college': 0,
    'vindt': 0,
    'standpunt': 0,
    'bereid': 0,
    'kennisgenomen': 0,
}

# For every opinion word, count the number of questions whose tokens contain it
opinion_counts = defaultdict(int)
for q in questions_tokenized_no_stem:
    for opinion in opinion_words:
        if opinion in q:
            opinion_counts[opinion] += 1

opinion_counts = dict(opinion_counts)
print(opinion_counts)


In [None]:
import types
from importlib import metadata


def get_imports():
    """
    Yield the root package name of every module and class that is bound in
    the notebook's global namespace.

    Plain variables and functions are skipped, so that their variable names
    are not mistaken for package names.
    """
    # Some packages have different imported names vs. pip names.
    # There is no systematic way to get pip names from a package's imported
    # name, so exceptions have to be added to this mapping manually.
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }

    # Iterate over a snapshot so the namespace cannot change size mid-iteration
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Split ensures you get the root package, not a submodule
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
        else:
            continue

        yield poorly_named_packages.get(name, name)


imports = list(set(get_imports()))

# Cross-check the imported package names against the installed distributions
# to obtain their versions. importlib.metadata replaces the deprecated
# pkg_resources API; names are compared case-insensitively.
imported_names = {name.lower() for name in imports}
requirements = []
seen_projects = set()
for dist in metadata.distributions():
    project_name = dist.metadata["Name"]
    if not project_name:
        # Distribution with broken or missing metadata
        continue
    project_key = project_name.lower()
    if project_key == "pip" or project_key in seen_projects:
        continue
    if project_key in imported_names:
        seen_projects.add(project_key)
        requirements.append((project_name, dist.version))

# Sorted for a stable, diff-friendly listing
requirements.sort(key=lambda req: req[0].lower())

for r in requirements:
    print("{}=={}".format(*r))