In [None]:
# Import relevant general libraries for (regex, time, dicts, math etc)
import re
import ast
import datetime

import numpy as np
import pandas as pd

from collections import defaultdict
from datetime import timedelta
from datetime import datetime
from dateutil.relativedelta import relativedelta

# NLTK stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Transformer import
from sentence_transformers import SentenceTransformer

# Cosine and Euclidean distance metrics libraries
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from scipy.spatial.distance import euclidean

# Popular graphing and community detection libraries
import networkx as nx
import community.community_louvain as community_louvain

# TFIDF vectorizer library
from sklearn.feature_extraction.text import TfidfVectorizer

# Graphing with matplotlib + Plotly
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import plotly.graph_objects as go

## Colab Jupyter widgets
from IPython.display import display
from google.colab import output
from ipywidgets import widgets

In [None]:
# Dutch stopword list from NLTK; it is handed to the TfidfVectorizer further down.
stopwords_dutch = stopwords.words('dutch')

In [None]:
# Load data sets.
# df_communities is read later through its 'id', 'entity', 'community',
# 'community_name' and 'date' columns; df_text supplies 'id' and 'text'.
df_communities = pd.read_csv('df_communities.csv')
df_text = pd.read_csv('kamers_text.csv')

# Load the pre-computed embeddings and the community id belonging to each one.
# search() indexes community_ids with row positions taken from embeddings, so
# the two arrays have to be row-aligned.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code from a tampered file -- only load community_ids.npy from a
# trusted source.
embeddings = np.load('embeddings.npy')
community_ids = np.load('community_ids.npy', allow_pickle=True)


In [None]:
# Attach each document's text (from df_text) to df_communities, matched on 'id'.
# - A 'text' column left over from an earlier run of this cell is dropped first;
#   otherwise the merge would emit 'text_x'/'text_y' and the lookup below fails.
# - df_text is reduced to one row per id so the left merge cannot multiply rows.
text_lookup = df_text[['id', 'text']].drop_duplicates(subset='id', keep='first')
merged_df = (
    df_communities
    .drop(columns='text', errors='ignore')
    .merge(text_lookup, on='id', how='left')
)

# Assign by position rather than by index: the merge result carries a fresh
# RangeIndex, so index alignment is only correct if df_communities has one too.
# A left join against unique keys keeps the left frame's row count and order.
df_communities['text'] = merged_df['text'].to_numpy()

In [None]:
# Parse the 'date' column as UTC timestamps (unparseable values become NaT),
# then reduce every entry to a plain calendar date (datetime.date objects).
df_communities['date'] = (
    pd.to_datetime(df_communities['date'], errors='coerce', utc=True)
    .dt.date
)

In [None]:
# One row per document id (the first occurrence wins); this de-duplicated
# frame is what the document ranking further down works on.
is_repeated_id = df_communities.duplicated(subset='id', keep='first')
df_unique = df_communities[~is_repeated_id]

In [None]:
# Initialize the SentenceTransformer that search() uses to turn a query string
# into an embedding vector.
# NOTE(review): presumably embeddings.npy was produced with this same model;
# the distances computed in search() are only meaningful if so -- confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Embedding-based search: encode the query and rank the pre-computed embeddings
# by Euclidean distance to it.
def search(query, num_results=5):
    """Return the community ids whose stored embeddings lie closest to `query`.

    Args:
        query: Search string; it is encoded with the module-level `model`.
        num_results: Maximum number of community ids to return (default 5).
            0 yields an empty array.

    Returns:
        The entries of the module-level `community_ids` array at the positions
        of the `num_results` smallest Euclidean distances between the query
        embedding and the rows of `embeddings`, nearest first.

    Raises:
        ValueError: If `num_results` is negative (a negative slice bound would
            silently return "all but the last k" instead of the top k).
    """
    if num_results < 0:
        raise ValueError("num_results must be non-negative.")

    # Ask encode() for a NumPy array directly instead of building a tensor and
    # converting it back; [0] picks the single embedding of the one-item batch.
    query_embedding = model.encode([query], convert_to_numpy=True)[0]

    # One vectorized norm over all rows replaces a Python-level loop of
    # per-row scipy `euclidean` calls.
    distances = np.linalg.norm(embeddings - query_embedding, axis=1)

    # A stable sort keeps equally distant embeddings in their stored order, so
    # repeated runs return the same ids.
    indices = np.argsort(distances, kind='stable')[:num_results]

    # Get the community_ids corresponding to the closest embeddings
    return community_ids[indices]

In [None]:
# Example query: embed the search string and fetch the ids of the 5 closest
# communities; these ids seed the personalized PageRank further down.
search_string = 'fraude'
search_results = search(search_string, num_results=5)
print(search_results)

['C527' 'C1497' 'C3346' 'C7539' 'C8965']


In [None]:
##
## Graph all communities together, with edges representing entity flow between communities of at least 1.0
## More entity flow results in a higher edge weight.
##

# Initialize an empty Graph.
# nx.Graph is undirected, so a move A -> B and a move B -> A both add weight
# to the same edge.
G_communities = nx.Graph()

# Prepare a dictionary to store current community of each entity
entity_current_community = {}

# Process rows in chronological order. 'date' only has day resolution, so many
# rows tie; a stable sort ('mergesort') keeps tied rows in their original
# order instead of the arbitrary order the default quicksort may produce,
# which makes the resulting edge weights reproducible.
communities_by_date = df_communities.sort_values('date', kind='mergesort')

# Only two columns are needed, so iterate over them directly instead of
# materialising a full row Series per record with iterrows().
for entity, community in zip(communities_by_date['entity'], communities_by_date['community']):

    # Check for community change
    if entity in entity_current_community and entity_current_community[entity] != community:
        # Get the previous community and current community as source and target
        source = entity_current_community[entity]
        target = community

        if G_communities.has_edge(source, target):
            # If the edge already exists, increase its weight
            G_communities[source][target]['weight'] += 1
        else:
            # If the edge doesn't exist, add it with weight 1
            G_communities.add_edge(source, target, weight=1)

    # Update the current community of an entity
    entity_current_community[entity] = community

# Set custom amount of K based on network size (4 / sqrt(node count), scaled by 1.5).
# max(..., 1) avoids a division by zero when no community change was found.
# NOTE(review): k is not used in this cell; presumably it is a layout spacing
# parameter for a later plot -- confirm.
k = 4 / np.sqrt(max(G_communities.order(), 1))
k *= 1.5

In [None]:
##
## Personalized PageRank: the communities returned by the search act as seed
## nodes. The random walk restarts at the seeds, so nodes that end up with a
## high score are the ones most reachable from them -- used here as a
## relevance measure to discover additional communities.
##

# Seed weight per graph node: 1.0 for communities found by the search, else 0.0.
seed_nodes = set(search_results)
personalization = {
    node: (1.0 if node in seed_nodes else 0.0)
    for node in G_communities.nodes()
}

# Without a single seed inside the graph there is nothing to personalize on.
total = sum(personalization.values())
if total == 0.0:
    raise ValueError("None of the communities in search_results are in the graph.")

# Rescale the weights so they sum to 1 (a valid probability distribution).
personalization = {node: weight / total for node, weight in personalization.items()}

# Run PageRank with the seed distribution as restart vector.
pagerank_values = nx.pagerank(G_communities, personalization=personalization)

In [None]:
# Sort the pagerank_values in descending order then display the top-20.
sorted_pagerank_values = sorted(pagerank_values.items(), key=lambda item: item[1], reverse=True)
# Slice from the start: index 0 is the highest-ranked community, so [1:20]
# would drop it and show only 19 entries.
sorted_pagerank_values[:20]

[('C8965', 0.031690881497516775),
 ('C7539', 0.030796919772521813),
 ('C3346', 0.03059032835037983),
 ('C527', 0.030172028244745665),
 ('C585', 0.021191392115844112),
 ('C7323', 0.01637203392582316),
 ('C7576', 0.013884431995628612),
 ('C2176', 0.010090078811750227),
 ('C897', 0.009563170343816917),
 ('C3710', 0.009169387415982196),
 ('C353', 0.009031336955521704),
 ('C2542', 0.008932006461021814),
 ('C2678', 0.008738231639434466),
 ('C1027', 0.007265884451986602),
 ('C1523', 0.006915129946161994),
 ('C5927', 0.005743070247994699),
 ('C640', 0.005521322838273432),
 ('C224', 0.0048811737181798445),
 ('C7256', 0.0043930747723808315)]

In [None]:
# Community ids in descending PageRank order.
community_names = [name for name, _score in sorted_pagerank_values]

# "Border" communities: ranked communities that the search did not return itself.
seed_ids = set(search_results)
border_communities = [name for name in community_names if name not in seed_ids]

# Keep the five highest-ranked border communities.
top_5_border = border_communities[:5]

# Final selection: the search hits first, followed by the border communities.
relevant_communities = np.concatenate([search_results, top_5_border])


In [None]:
# Show the combined selection: search hits followed by the top border communities.
relevant_communities

array(['C527', 'C1497', 'C3346', 'C7539', 'C8965', 'C585', 'C7323',
       'C7576', 'C2176', 'C897'], dtype=object)

In [None]:
# Create a list of matched communities
matched_communities = relevant_communities.tolist()

# Filter the de-duplicated DataFrame to only include documents from the matched communities
filtered_df_unique = df_unique[df_unique['community'].isin(matched_communities)]

# The (at most) 10 matched communities that contain the most documents
top_communities = filtered_df_unique['community'].value_counts().index[:10]

# Filter to include only those top communities
filtered_results_top_communities = filtered_df_unique[filtered_df_unique['community'].isin(top_communities)]

# Documents whose id had no match in the text table carry NaN after the left
# merge; TfidfVectorizer rejects NaN, so treat them as empty documents.
documents = filtered_results_top_communities['text'].fillna('')

# Fit and transform the text with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stopwords_dutch)
X = vectorizer.fit_transform(documents)

# Relevance score per document = sum of its TF-IDF weights. Computed straight
# from the sparse matrix so that every vocabulary term counts, even one that is
# literally called 'community' and is overwritten by the metadata column below.
relevance_scores = np.asarray(X.sum(axis=1)).ravel()

# Create a DataFrame with the TF-IDF scores
# NOTE(review): this densifies the whole matrix (documents x vocabulary); it is
# kept in case later cells read df_tfidf -- drop it if nothing else uses it.
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=filtered_results_top_communities['id'])

# Add the community and the relevance score to the TF-IDF DataFrame
df_tfidf['community'] = filtered_results_top_communities['community'].values
df_tfidf['relevance_score'] = relevance_scores

# Get the top 3 most relevant documents in each community
most_relevant_docs = df_tfidf.groupby('community')['relevance_score'].nlargest(3)

# Convert the multi-index Series to a DataFrame and reset the index for easier processing
most_relevant_docs = most_relevant_docs.to_frame().reset_index()

# community id -> community name (first occurrence), built once instead of
# scanning df_communities again for every printed document.
community_name_lookup = (
    df_communities
    .drop_duplicates(subset='community', keep='first')
    .set_index('community')['community_name']
)

# Print the document IDs and links
for community, doc_id in zip(most_relevant_docs['community'], most_relevant_docs['id']):
    community_name = community_name_lookup[community]
    doc_link = f"https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0/Document({doc_id})/resource"

    print(f"Community: {community}, Community Name: {community_name}, Document ID: {doc_id}, Link: {doc_link}\n")


Community: C1497, Community Name: Zorggeld, Sjerp ndstkD, Document ID: a02b840e-6864-413e-8b9d-d9db5ce7eef3, Link: https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0/Document(a02b840e-6864-413e-8b9d-d9db5ce7eef3)/resource

Community: C2176, Community Name: Van der Staaij, Van Gerven, Van Dekken, Document ID: 80b93820-c602-42bf-bf2b-8fbd3132aca0, Link: https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0/Document(80b93820-c602-42bf-bf2b-8fbd3132aca0)/resource

Community: C2176, Community Name: Van der Staaij, Van Gerven, Van Dekken, Document ID: 12f924b1-6d3d-4d85-9201-b967bd73b8cb, Link: https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0/Document(12f924b1-6d3d-4d85-9201-b967bd73b8cb)/resource

Community: C2176, Community Name: Van der Staaij, Van Gerven, Van Dekken, Document ID: 548c3041-50a1-4623-8893-c51b5f03d4f5, Link: https://gegevensmagazijn.tweedekamer.nl/OData/v4/2.0/Document(548c3041-50a1-4623-8893-c51b5f03d4f5)/resource

Community: C3346, Community Name: Calibris, Deelconc