In [98]:
# Import the required modules
import os
import numpy as np
import pandas as pd
import igraph as ig
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.metrics.pairwise import cosine_similarity

import src.functions as functions

In [143]:
# --- Analysis settings -------------------------------------------------
# First and last decade to process (the decade loop steps by 10 years).
start_year, end_year = 1950, 1990
# Language code; used as the key into the embeddings folder mapping.
language = 'ger'
# Optional word list to restrict the embedding vocabulary to (None = keep all).
vocabulary = None
# Print per-decade diagnostics while processing.
verbose = False
# Percentile of the pairwise cosine similarities used as the edge cutoff.
percentile = 97.3
# Target word; its graph neighbourhood is plotted, and it labels the node.
word = node_label = 'dokument'
# Write the figures to disk in addition to showing them.
save_fig = False

In [144]:
# Use the index of the pickled table as the word list that the embeddings
# are later restricted to.
# NOTE(review): read_pickle should only be pointed at trusted files.
vocabulary = pd.read_pickle(
    os.path.join('data', 'processed', 'concreteness_ger.pkl')
).index

In [146]:
# Embedding folder per language code (passed as `language_folder` to the
# loader functions below).
embeddings_list = {
    "eng": os.path.join("eng-all_sgns", "sgns"),
    "fra": "fre-all_sgns",
    "ger": "ger-all_sgns"
}
# For every decade: threshold the pairwise cosine similarities into a graph,
# then plot `node_label` together with its direct neighbours.
for year in range(start_year, end_year+10, 10):
    if verbose:
        print("Year:", year)
    # Load the matrix and vocabulary for the current year
    mat = functions.load_mat(year,
                             language_folder=embeddings_list[language])
    vocab = functions.load_vocab(year,
                                 language_folder=embeddings_list[language])

    if verbose:
        print("Ratio of non-zero vectors:", functions.check_sparcity(mat))
    # Remove zero vectors from matrix
    reduced_mat, reduced_vocab = functions.remove_empty_words(mat, vocab)
    # if vocabulary is passed, remove all other words
    if vocabulary is not None:
        reduced_mat, reduced_vocab = functions.reduce_to_list_of_words(
            reduced_mat, reduced_vocab, vocabulary)
    # Calculate the cosine similarity between all word-vectors
    cos_sim = cosine_similarity(reduced_mat)

    # find cos_sim cutoff value, depending on percentile
    # (self-similarities on the diagonal are zeroed first so they do not
    # enter the percentile and do not become self-loops)
    np.fill_diagonal(cos_sim, 0.)
    cutoff_value = np.percentile(cos_sim, percentile)
    if verbose:
        print("Cutoff value:", cutoff_value)
    # create adjacency matrix: 1 where similarity reaches the cutoff, else 0
    above_thresh = np.where(cos_sim >= cutoff_value, 1, 0)

    # The matrix is symmetric, so only the lower triangle is read
    graph = ig.Graph.Adjacency(above_thresh, mode='lower')

    # Set the node labels using the numpy array
    graph.vs['label'] = reduced_vocab.tolist()

    # Locate the target word in this decade's graph
    try:
        node_index = graph.vs['label'].index(node_label)
    except ValueError:
        print(f"Node label '{node_label}' not found.")
        # Skip this decade. Falling through would raise NameError on the
        # first decade, or silently reuse the index found in the previous
        # decade's graph and plot the wrong word.
        continue

    # Subgraph made of the target node plus its direct neighbours
    neighbors = graph.neighbors(node_index)
    subgraph_indices = [node_index] + neighbors
    subgraph = graph.subgraph(subgraph_indices)

    # Kamada-Kawai layout gives 2D coordinates for every subgraph vertex
    layout = subgraph.layout('kk')

    # Edge coordinates as one polyline; None breaks the line between edges
    edge_x = []
    edge_y = []
    for edge in subgraph.es:
        source, target = edge.tuple
        x0, y0 = layout[source]
        x1, y1 = layout[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        mode='lines',
        line=dict(color='rgba(0, 0, 0, 0.3)', width=1),
        hoverinfo='none'
    )

    node_x = [coords[0] for coords in layout]
    node_y = [coords[1] for coords in layout]

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=subgraph.vs['label'],
        textposition='top center',
        marker=dict(size=20, color='lightblue', line=dict(color='black', width=1)),
        hoverinfo='text'
    )

    # Bare canvas: no legend, grid, zero lines or tick labels
    fig = go.Figure(data=[edge_trace, node_trace])
    fig.update_layout(showlegend=False, hovermode='closest')
    fig.update_xaxes(showgrid=False, zeroline=False, showticklabels=False)
    fig.update_yaxes(showgrid=False, zeroline=False, showticklabels=False)
    fig.show()
    if save_fig:
        # Make sure the target folder exists before writing the image
        os.makedirs(os.path.join('figures', 'graph'), exist_ok=True)
        pio.write_image(fig, f'figures/graph/subgraph_{word}_{percentile}_{year}.png')
    

In [109]:
# Sanity check: is the configured target word in the loaded vocabulary?
# `vocab` is whatever the most recently executed decade-loop iteration loaded.
# Uses `word` instead of a repeated literal so the check follows the config.
word in vocab

True

In [120]:
# Debug check: walk the same decades as the main loop and echo 1980 when
# figure saving is switched on.
for year in range(start_year, end_year + 10, 10):
    if not save_fig or year != 1980:
        continue
    print(year)

1980


True