# Load data from preprocessing notebook

In [14]:
import pickle

with open('WL_sent_filtered.pkl', 'rb') as f:
    WL_sent_filtered = pickle.load(f)

with open('all_words.pkl', 'rb') as f:
    all_words = pickle.load(f)

#with open('word_counts.pkl', 'rb') as f:
#    word_counts = pickle.load(f)


# Creating Bigrams

In [15]:
################# I use sliding-window co-occurrence, respecting sentence boundaries, for this is Word2Vec-like and should capture both syntactic and semantic structure without being too computationally heavy. 

window_size = 5

bigrams = []

for sent in WL_sent_filtered:
    for i, word in enumerate(sent):
        for j in range(max(0, i - window_size), min(len(sent), i + window_size + 1)): # looks at the window_size words before and after the word in position i, while staying within the sentence boundaries
            if i != j: # exclude the target word itself from its window
                bigrams.append((word, sent[j]))
                

print(bigrams[:10])
print(len(bigrams))

[('verfasser', 'unter'), ('verfasser', 'wissenschaftslehre'), ('verfasser', 'verstehe'), ('unter', 'verfasser'), ('unter', 'wissenschaftslehre'), ('unter', 'verstehe'), ('wissenschaftslehre', 'verfasser'), ('wissenschaftslehre', 'unter'), ('wissenschaftslehre', 'verstehe'), ('verstehe', 'verfasser')]
3039692


# Filter bigrams for frequency

In [16]:
minfreq = 3 # Gives 219534, which seems like a good number

from collections import Counter

bigrams_count = Counter(bigrams) # = Dictionary-like object

freq_bigrams = []

for bigram, count in bigrams_count.items():
    if count >= minfreq:
        freq_bigrams.append(bigram)

print(len(freq_bigrams))
print(list(freq_bigrams)[:20])

219534
[('verfasser', 'unter'), ('verfasser', 'verstehe'), ('unter', 'verfasser'), ('unter', 'wissenschaftslehre'), ('unter', 'verstehe'), ('wissenschaftslehre', 'unter'), ('wissenschaftslehre', 'verstehe'), ('verstehe', 'verfasser'), ('verstehe', 'unter'), ('verstehe', 'wissenschaftslehre'), ('vorstelle', 'alle'), ('alle', 'vorstelle'), ('alle', 'wahrheiten'), ('alle', 'mensch'), ('alle', 'kennt'), ('wahrheiten', 'alle'), ('wahrheiten', 'hat'), ('mensch', 'alle'), ('mensch', 'hat'), ('kennt', 'alle')]


# Create matrix

In [21]:
##################### Create index

word2idx = {}
for i, word in enumerate(all_words):
    word2idx[word] = i

vocab_size = len(all_words)

################### Create matrix. Sparse for computability. 

from scipy.sparse import lil_matrix

cooc_mat = lil_matrix((vocab_size, vocab_size), dtype=int)

for bigram in freq_bigrams:
    w1 = bigram[0]
    w2 = bigram[1]
    
    if w1 in word2idx and w2 in word2idx:
        i = word2idx[w1]
        j = word2idx[w2]
        cooc_mat[i, j] += 1 # counts how many times word_i co-occurs with word_j

# Convert to csr format for efficiency
cooc_mat = cooc_mat.tocsr()

################## Apply dimensionality reduction (Truncated SVD)
# Truncated SVD = Singular Value Decomposition (breaking matrix into orthogonal components that capture the main patterns in the data), keeping only
# the top k singular values/components (=n_components), which gives a dense, low-dimensional embedding for each word

from sklearn.decomposition import TruncatedSVD

n_components = 100  # choose embedding size
svd = TruncatedSVD(n_components=n_components)

WL_embedding = svd.fit_transform(cooc_mat)

# Inspecting the embedding

In [24]:
################### Show vector for target word (mind: lower case!)

target_word = "wissenschaft"

idx = word2idx[target_word]
print("Vector for", target_word, ":", WL_embedding[idx])

Vector for wissenschaft : [19.76114762 -7.42731554 -0.13690904  6.67732679  0.96853948 -1.37987829
  0.36439024  4.25679079  0.26488511  5.4158889  -3.19272996 -1.63554292
  0.11766309  0.54437407 -1.13362506  1.5867999   1.72924777  0.59082688
  3.42506758 -1.78426044 -7.1678819  -1.77675753  0.91274625 -0.38087625
  0.90779784  0.94896927 -0.65997781 -2.28736498  2.22346766 -1.59176328
 -0.1583982   1.8878968  -1.73249522  0.09682568 -2.96105938 -1.67580532
  0.75513875 -4.27838253  3.01867792  1.70657818  2.21885707 -0.21780456
 -0.11182947  0.88032052 -0.79345423  2.14622736  0.72860059  0.89635395
 -1.90096562 -0.52410769 -1.87932803 -2.46130842  1.60133148 -0.9364986
 -0.40779105 -1.41799584  1.14830668 -0.75978281 -0.11391834  0.80148087
 -0.06619663  2.46513386  2.78842432 -0.55329733  0.27847198  0.98722244
  1.12527973  2.02053962 -2.22393926  0.67640507  1.87250182  0.98808145
  1.80602033 -2.04007604  1.34011922 -1.1997673   0.90227378 -0.7580385
 -0.7449035  -0.10905966 -1

In [28]:
################# Find similar words (using cosine similarity)

from sklearn.metrics.pairwise import cosine_similarity

similar = cosine_similarity([WL_embedding[idx]], WL_embedding)[0]

##### Exclude the word itself
similar[idx] = -1  # or any value lower than possible similarities

####### Get top 5 closest words

top_idx = []

for i in range(5):
    max_sim = max(similar)
    max_idx = similar.tolist().index(max_sim)
    top_idx.append(max_idx)
    similar[max_idx] = -1  #  mark as used

print("Top 5 closest words:", [all_words[i] for i in top_idx])

Top 5 closest words: ['wahrheiten', 'lehren', 'finden', 'logik', 'jeden']
