In [1]:
import math
import pickle
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from gensim import corpora
from scipy import spatial
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
LAGTec = pd.read_json("../data/large_files/LAGTec.json")

In [3]:
len(LAGTec["author"].unique().tolist())

49

In [4]:
len(LAGTec)

148

In [5]:
LAGTec["wordcount"].sum()

3566823

In [6]:
LAGTec["lemmatized_sentences"].apply(lambda x: len([w for s in x for w in s])).sum()

1510554

In [7]:
# One row per `date_avr` value holding the sum of every numeric column.
LAGTec_by_date = LAGTec.groupby("date_avr").sum(numeric_only=True).reset_index()

# Distinct authors and number of works per date, looked up from the un-aggregated frame.
LAGTec_by_date["authors_N"] = LAGTec_by_date["date_avr"].map(
    LAGTec.groupby("date_avr")["author"].agg(lambda authors: len(authors.unique()))
)

LAGTec_by_date["works_N"] = LAGTec_by_date["date_avr"].map(
    LAGTec.groupby("date_avr").size()
)

# Relative frequency: summed `paul_N` divided by summed `wordcount`.
LAGTec_by_date["paul_freq"] = LAGTec_by_date["paul_N"].div(LAGTec_by_date["wordcount"])

summary_columns = [
    "date_avr",
    "authors_N",
    "wordcount",
    "n_sentences",
    "paul_N",
    "paul_freq",
    "sentences_paul_N",
    "sentences_apostle_N",
]
LAGTec_by_date[summary_columns]

Unnamed: 0,date_avr,authors_N,wordcount,n_sentences,paul_N,paul_freq,sentences_paul_N,sentences_apostle_N
0,0.5,13,151723,10108,160,0.001055,156,84
1,1.5,13,322401,41076,18,5.6e-05,18,92
2,2.5,4,1237867,129474,497,0.000401,485,654
3,3.5,9,1347705,117599,213,0.000158,205,690
4,4.5,10,507127,45061,184,0.000363,181,146


In [8]:
# Flatten the per-work sentence lists into one list of sentences per period:
# works with date_avr below 3 on one side, above 3 on the other.
sents_1to3 = [
    sentence
    for work_sentences in LAGTec.loc[LAGTec["date_avr"] < 3, "lemmatized_sentences"]
    for sentence in work_sentences
]
sents_4to5 = [
    sentence
    for work_sentences in LAGTec.loc[LAGTec["date_avr"] > 3, "lemmatized_sentences"]
    for sentence in work_sentences
]

In [9]:
len(sents_1to3)

99910

In [10]:
print(sents_1to3[:3])

[['βίβλος', 'γένεσις', 'Ἰησοῦς', 'Χριστός', 'υἱός', 'Δαυίδ', 'υἱός', 'Ἀβραάμ'], ['Ἀβραάμ', 'γεννάω', 'Ἰσαάκ', 'Ἰσαάκ', 'γεννάω', 'Ἰακώβ', 'Ἰακώβ', 'γεννάω', 'Ἰούδας', 'ἀδελφός', 'Ἰούδας', 'γεννάω', 'Φάρες', 'Ζάρα', 'Θαμάρ', 'Φάρες', 'γεννάω', 'Ἑσρώμ', 'Ἑσρώμ', 'γεννάω', 'Ἀράμ', 'Ἀράμ', 'γεννάω', 'Ἀμιναδάβ', 'Ἀμιναδάβ', 'γεννάω', 'Ναασσών', 'Ναασσών', 'γεννάω', 'Σαλμών', 'Σαλμών', 'γεννάω', 'βοῦς', 'Ῥαχάβ', 'βοῦς', 'γεννάω', 'Ἰωβήδ', 'Ῥούθ', 'Ἰωβήδ', 'γεννάω', 'Ἰεσσαί', 'Ἰεσσαί', 'γεννάω', 'Δαυίδ', 'βασιλεύς'], ['Δαυίδ', 'γεννάω', 'Σολομών', 'Οὐρίας', 'Σολομών', 'γεννάω', 'Ῥοβοάμ', 'Ῥοβοάμ', 'γεννάω', 'Ἀβιά', 'Ἀβιά', 'γεννάω', 'Ἀσάφ', 'Ἀσάφ', 'γεννάω', 'Ἰωσαφάτ', 'Ἰωσαφάτ', 'γεννάω', 'Ἰωράμ', 'Ἰωράμ', 'γεννάω', 'Ὀζίας', 'Ὀζίας', 'γεννάω', 'Ἰωαθάμ', 'Ἰωαθάμ', 'γεννάω', 'ἄχας', 'ἄχας', 'γεννάω', 'Ἑζεκίας', 'Ἑζεκίας', 'γεννάω', 'Μανασσῆς', 'Μανασσῆς', 'γεννάω', 'Ἀμώς', 'Ἀμώς', 'γεννάω', 'Ἰωσίας', 'Ἰωσίας', 'γεννάω', 'Ἰεχονίας', 'ἀδελφός', 'μετοικεσία', 'Βαβυλών']]


In [11]:
min_freq = 5

def get_vocab(docs, min_freq=5):
    """Flatten tokenised documents and list the tokens frequent enough to keep.

    Parameters
    ----------
    docs : iterable of iterables
        Each inner iterable is one document given as a sequence of hashable tokens.
    min_freq : int, default 5
        Minimum number of occurrences, counted over all documents together,
        for a token to enter the vocabulary.

    Returns
    -------
    words_flat : list
        Every token of every document, in reading order.
    vocabulary : list
        Tokens occurring at least ``min_freq`` times, most frequent first;
        tokens with equal counts keep the order in which they were first seen.
    """
    words_flat = [word for doc in docs for word in doc]
    # A plain Counter is all that is needed here; its most_common() yields
    # (token, count) pairs sorted by descending count.
    vocabulary = [
        word
        for word, count in Counter(words_flat).most_common()
        if count >= min_freq
    ]
    return words_flat, vocabulary

In [12]:
words_flat_1to3, vocabulary_1to3 = get_vocab(sents_1to3)
words_flat_4to5, vocabulary_4to5 = get_vocab(sents_4to5)

In [13]:
print("Παῦλος" in vocabulary_1to3)
print("Παῦλος" in vocabulary_4to5)
print("ἀπόστολος" in vocabulary_1to3)
print("ἀπόστολος" in vocabulary_4to5)

True
True
True
True


In [14]:
words_flat_1to3[-100:]

['ὄμμα',
 'μένω',
 'ἀπειλή',
 'σκώληξ',
 'σῶμα',
 'ἀπουσία',
 'ἐπιστρεφόμενον',
 'ἐκβράσαν',
 'σῶμα',
 'ἐπιστρεφής',
 'οὗτος',
 'ἐκφεύγω',
 'θεός',
 'εἰμί',
 'διδαχθείς',
 'ἔχω',
 'ἀθάνατος',
 'σῶμα',
 'ἄφθαρτ',
 'ψυχή',
 'βασιλεία',
 'οὐρανός',
 'ἀπόληψις',
 'γῆ',
 'βίος',
 'ἐπουράνιος',
 'βασιλεύς',
 'ἐπιγνούς',
 'εἰμί',
 'ὁμιλητής',
 'θεός',
 'συγκληρονόμος',
 'Χριστός',
 'ἐπιθυμία',
 'πάθος',
 'νόσος',
 'δουλούμενος',
 'γίγνομαι',
 'θεός',
 'ὑπομένω',
 'πάθος',
 'ἄνθρωπος',
 'εἰμί',
 'δίδωμι',
 'ἄνθρωπος',
 'εἷς',
 'ὅσος',
 'παρακολουθέω',
 'θεός',
 'παρέχω',
 'ἐπαγγέλλομαι',
 'θεός',
 'ἐθεοποιήθης',
 'ἀθάνατος',
 'γεννηθείς',
 'τουτέστι',
 'γιγνώσκω',
 'ἐπιγιγνώσκω',
 'πεποιηκότα',
 'θεός',
 'ἐπιγιγνώσκω',
 'ἐπιγνωσθῆναι',
 'συμβαίνω',
 'καλουμένῳ',
 'φιλεχθρήσητε',
 'ἄνθρωπος',
 'παλινδρομεῖν',
 'διστάσητε',
 'Χριστός',
 'πᾶς',
 'θεός',
 'ἁμαρτία',
 'ἄνθρωπος',
 'ἀποπλύνω',
 'προστάσσω',
 'νέος',
 'παλαιός',
 'ἄνθρωπος',
 'ἀποτελῶν',
 'εἰκών',
 'καλέω',
 'ἀρχή',
 'τύπος',
 'ἐπιδε

In [15]:
print(len(words_flat_1to3))
print(len(words_flat_4to5))

705831
804723


In [16]:
# Sliding-window bigrams and trigrams over the flattened token stream,
# so a window may span a sentence boundary.
bigrams_1to3 = list(map(list, nltk.bigrams(words_flat_1to3)))
bigrams_4to5 = list(map(list, nltk.bigrams(words_flat_4to5)))

trigrams_1to3 = list(map(list, nltk.trigrams(words_flat_1to3)))
trigrams_4to5 = list(map(list, nltk.trigrams(words_flat_4to5)))

In [17]:
# Bigrams and trigrams built sentence by sentence, so no window crosses a
# sentence boundary; the results are concatenated in sentence order.

sents_bigrams_1to3 = [list(gram) for sent in sents_1to3 for gram in nltk.bigrams(sent)]
sents_bigrams_4to5 = [list(gram) for sent in sents_4to5 for gram in nltk.bigrams(sent)]

sents_trigrams_1to3 = [list(gram) for sent in sents_1to3 for gram in nltk.trigrams(sent)]
sents_trigrams_4to5 = [list(gram) for sent in sents_4to5 for gram in nltk.trigrams(sent)]

In [18]:
def get_cooc(docs, vocabulary=None, min_freq=5):
    """Build a document-level co-occurrence matrix over ``vocabulary``.

    Parameters
    ----------
    docs : sequence of token sequences
        Each element is one co-occurrence window (e.g. a sentence or an n-gram).
    vocabulary : sequence of tokens, optional
        Tokens that index the rows/columns of the result. When omitted it is
        derived from ``docs`` with ``get_vocab(docs, min_freq)``.
    min_freq : int, default 5
        Only used when ``vocabulary`` is omitted.

    Returns
    -------
    cooc : sparse matrix, shape (len(vocabulary), len(vocabulary))
        Entry (i, j) is the fraction of documents containing both token i and
        token j; the diagonal is the fraction of documents containing token i.
    vocabulary : sequence of tokens
        The vocabulary actually used.

    Raises
    ------
    ValueError
        If ``docs`` is empty (the normalisation would divide by zero).
    """
    n_docs = len(docs)
    if n_docs == 0:
        raise ValueError("docs must contain at least one document")
    # `is None`, not `== None`: an array-like vocabulary would otherwise be
    # compared element-wise and fail in the `if`.
    if vocabulary is None:
        _, vocabulary = get_vocab(docs, min_freq)
    # Feed the token lists straight through instead of joining them into a
    # string: the default word analyzer re-tokenises with `(?u)\b\w\w+\b`,
    # which silently drops one-character tokens and splits tokens at any
    # non-word character. `binary=True` records presence/absence per document,
    # which is what de-duplicating each document with set() achieved before.
    vec_bow = CountVectorizer(
        vocabulary=vocabulary,
        analyzer=lambda tokens: tokens,
        binary=True,
        lowercase=False,
    )
    bow = vec_bow.fit_transform(docs)
    # (terms x docs) @ (docs x terms): number of documents shared by each pair.
    cooc = (bow.T @ bow) / n_docs
    return cooc, vocabulary

In [19]:
# Co-occurrence windows are the sentences plus their within-sentence bigrams
# and trigrams; only the first 2000 vocabulary entries are kept. Note that
# vocabulary_* is rebound to that truncated list.
cooc_1to3, vocabulary_1to3 = get_cooc(
    docs=sents_1to3 + sents_bigrams_1to3 + sents_trigrams_1to3,
    vocabulary=vocabulary_1to3[:2000],
)
cooc_4to5, vocabulary_4to5 = get_cooc(
    docs=sents_4to5 + sents_bigrams_4to5 + sents_trigrams_4to5,
    vocabulary=vocabulary_4to5[:2000],
)

In [20]:
cooc_4to5.todense().shape

(2000, 2000)

In [21]:
pd.DataFrame(cooc_4to5.todense())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.054007,0.002586,0.003069,0.003025,0.002132,0.001373,0.001859,2.029446e-03,0.000890,0.001061,...,0.000008,0.000007,0.000009,0.000006,0.000008,0.000009,0.000003,0.000006,0.000008,0.000011
1,0.002586,0.039271,0.002352,0.001551,0.001248,0.001547,0.001249,1.077021e-03,0.001188,0.001053,...,0.000005,0.000010,0.000008,0.000006,0.000004,0.000001,0.000006,0.000021,0.000008,0.000004
2,0.003069,0.002352,0.037310,0.002056,0.001392,0.001159,0.001154,1.007331e-03,0.000898,0.000682,...,0.000002,0.000004,0.000006,0.000006,0.000012,0.000001,0.000003,0.000006,0.000004,0.000001
3,0.003025,0.001551,0.002056,0.037387,0.001209,0.001093,0.001558,2.158970e-03,0.000678,0.000558,...,0.000000,0.000007,0.000006,0.000004,0.000006,0.000005,0.000002,0.000013,0.000004,0.000008
4,0.002132,0.001248,0.001392,0.001209,0.028849,0.000981,0.000933,8.728798e-04,0.000747,0.000546,...,0.000006,0.000005,0.000004,0.000004,0.000011,0.000004,0.000000,0.000011,0.000003,0.000004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000009,0.000001,0.000001,0.000005,0.000004,0.000008,0.000004,2.111806e-06,0.000004,0.000001,...,0.000000,0.000000,0.000001,0.000000,0.000000,0.000180,0.000000,0.000000,0.000000,0.000000
1996,0.000003,0.000006,0.000003,0.000002,0.000000,0.000002,0.000009,7.039354e-07,0.000004,0.000002,...,0.000000,0.000000,0.000001,0.000000,0.000000,0.000000,0.000182,0.000000,0.000000,0.000000
1997,0.000006,0.000021,0.000006,0.000013,0.000011,0.000007,0.000008,3.519677e-06,0.000004,0.000006,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000187,0.000000,0.000000
1998,0.000008,0.000008,0.000004,0.000004,0.000003,0.000006,0.000000,2.815741e-06,0.000006,0.000006,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000179,0.000000


In [22]:
def normalize_ppmi3_matrix(pmi_matrix_df):
    """Rescale a PMI matrix so that missing cells become 0 and the maximum becomes 1.

    Missing cells are filled with a floor value placed one full value-range
    below the observed minimum (``min - (max - min)``), and the matrix is then
    min-max scaled between that floor and the observed maximum. Observed
    values therefore land in [0.5, 1] and missing cells at exactly 0.

    Parameters
    ----------
    pmi_matrix_df : pandas.DataFrame
        Numeric matrix in which NaN marks a missing score. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns. If all observed values
        are equal the scaling denominator is zero and the result is NaN.
    """
    minval = pmi_matrix_df.min().min()
    maxval = pmi_matrix_df.max().max()
    floor = minval - (maxval - minval)
    # fillna() without inplace=True: the caller's frame must stay untouched.
    filled = pmi_matrix_df.fillna(floor)
    return (filled - floor) / (maxval - floor)

In [23]:
def get_ppmi_df(cooc, vocabulary, normalize=True, exp=2):
    """Turn a co-occurrence matrix into a (optionally normalised) PMI-style matrix.

    For every pair (i, j) the score is
    ``log(cooc[i, j] ** exp / (cooc[i, i] * cooc[j, j]))``, i.e. the diagonal
    of ``cooc`` is used as the marginal of each token. Pairs whose ratio is
    not positive (no co-occurrence, or a zero marginal) get NaN. Despite the
    function name, negative scores are not clipped to zero.

    Parameters
    ----------
    cooc : square sparse matrix or array-like
        Co-occurrence matrix; anything with ``.toarray()`` or convertible by
        ``numpy.asarray``.
    vocabulary : sequence
        Labels for rows and columns, in matrix order.
    normalize : bool, default True
        When true, rescale with ``normalize_ppmi3_matrix`` and set the
        diagonal to 1.
    exp : number, default 2
        Exponent applied to the joint term.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``vocabulary``.
    """
    dense = cooc.toarray() if hasattr(cooc, "toarray") else cooc
    dense = np.asarray(dense, dtype=float)

    # Whole-matrix arithmetic instead of one scalar sparse lookup per cell.
    marginals = dense.diagonal()
    denominator = np.outer(marginals, marginals)
    ratio = np.divide(
        np.power(dense, exp),
        denominator,
        out=np.zeros_like(dense),
        where=denominator != 0,
    )
    # Cells left at NaN are the ones for which the logarithm is undefined.
    pmi = np.full_like(ratio, np.nan)
    np.log(ratio, out=pmi, where=ratio > 0)

    pmi_matrix_df = pd.DataFrame(pmi, columns=vocabulary, index=vocabulary)
    if normalize:
        normalized = normalize_ppmi3_matrix(pmi_matrix_df)
        # Fill the diagonal on an explicit copy and rebuild the frame:
        # DataFrame.to_numpy() is not guaranteed to return a writable view,
        # so writing through it may fail or silently do nothing.
        values = normalized.to_numpy(dtype=float, copy=True)
        np.fill_diagonal(values, 1)
        pmi_matrix_df = pd.DataFrame(values, columns=vocabulary, index=vocabulary)
    return pmi_matrix_df

In [24]:
# test...
ppmi_matrix = get_ppmi_df(cooc_1to3, vocabulary_1to3, exp=2)
ppmi_matrix

Unnamed: 0,εἰμί,λέγω,θεός,οὗτος,αὐτός,πᾶς,γίγνομαι,λόγος,ἄνθρωπος,ἔχω,...,διαφθορά,ἀστεῖος,τετράπους,Τιμόθεος,προγιγνώσκω,Ἠσαῦ,ἀγός,ἐπικατάρατος,εἰλικρινής,ἐπιεικής
εἰμί,1.000000,0.844593,0.858682,0.851344,0.847851,0.833992,0.816790,0.832309,0.829485,0.801647,...,0.611027,0.712267,0.683886,0.650736,0.692650,0.640500,0.000000,0.701843,0.682634,0.704386
λέγω,0.844593,1.000000,0.836977,0.857170,0.834168,0.814340,0.811712,0.817641,0.806948,0.804503,...,0.664336,0.638276,0.612217,0.643624,0.574855,0.673392,0.721751,0.659825,0.668505,0.664194
θεός,0.858682,0.836977,1.000000,0.822810,0.828806,0.834268,0.825915,0.848194,0.831868,0.803209,...,0.684985,0.661073,0.694251,0.678569,0.715929,0.638649,0.000000,0.000000,0.681956,0.634871
οὗτος,0.851344,0.857170,0.822810,1.000000,0.823465,0.816076,0.818339,0.820483,0.805663,0.807987,...,0.000000,0.623106,0.603747,0.642749,0.634866,0.669871,0.650791,0.628292,0.658284,0.666094
αὐτός,0.847851,0.834168,0.828806,0.823465,1.000000,0.819105,0.818108,0.819242,0.799562,0.798830,...,0.732836,0.653055,0.670347,0.630497,0.708919,0.644926,0.000000,0.649473,0.716471,0.670205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ἠσαῦ,0.640500,0.673392,0.638649,0.669871,0.644926,0.554092,0.646738,0.595658,0.561296,0.626292,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
ἀγός,0.000000,0.721751,0.000000,0.650791,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
ἐπικατάρατος,0.701843,0.659825,0.000000,0.628292,0.649473,0.719791,0.599166,0.652322,0.748932,0.686628,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
εἰλικρινής,0.682634,0.668505,0.681956,0.658284,0.716471,0.659888,0.660989,0.662027,0.667092,0.650461,...,0.000000,0.700377,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000


In [25]:
ppmi_matrix.iloc[0].tolist()

[1.0,
 0.8445926219142675,
 0.8586822927596901,
 0.8513439336653383,
 0.8478510019462848,
 0.833992411531144,
 0.8167897012357772,
 0.8323093681927323,
 0.8294853121027841,
 0.8016466329697896,
 0.8062014276715063,
 0.8365129198210586,
 0.793631625731796,
 0.8113052509243767,
 0.8071108907028384,
 0.7939292556549651,
 0.8125971793203344,
 0.8161327537517985,
 0.8003635938495581,
 0.7875902773946784,
 0.7995722044746016,
 0.8172002053772364,
 0.7925582425972273,
 0.8015044228662127,
 0.8207800964948999,
 0.7723587551450364,
 0.7742658176609052,
 0.8067773058006547,
 0.7953780966617187,
 0.8203833968068805,
 0.7580626417287376,
 0.8110401155454767,
 0.7882880849537819,
 0.7855531834673753,
 0.764882437776267,
 0.7985707378720001,
 0.7833783176828928,
 0.7905859442457224,
 0.7732425564969587,
 0.8031060786081133,
 0.7853316316262007,
 0.7840731799695404,
 0.7973990649878561,
 0.7850153048704115,
 0.7734464663556997,
 0.805271932240688,
 0.7831066488814394,
 0.7688654311879287,
 0.77499337

In [26]:
def svd_reduction(cooc_matrix, n_components=150, random_state=1, n_iter=10):
    """Reduce the rows of ``cooc_matrix`` to ``n_components`` dimensions with truncated SVD.

    Parameters
    ----------
    cooc_matrix : array-like or sparse matrix
        Input accepted by ``TruncatedSVD.fit_transform``.
    n_components, random_state, n_iter
        Forwarded unchanged to ``TruncatedSVD``.

    Returns
    -------
    The transformed matrix returned by ``TruncatedSVD.fit_transform``.
    """
    reducer = TruncatedSVD(
        n_iter=n_iter,
        random_state=random_state,
        n_components=n_components,
    )
    return reducer.fit_transform(cooc_matrix)

In [27]:
svd_matrix = pd.DataFrame(svd_reduction(ppmi_matrix))

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [28]:
# Sanity check: cosine similarity of two hand-picked rows, first on the raw PPMI vectors, then on the SVD-reduced vectors
rows = [0,98]
print(1 - spatial.distance.cosine(ppmi_matrix.iloc[rows[0]].tolist(), ppmi_matrix.iloc[rows[1]].tolist()))
print(1 - spatial.distance.cosine(svd_matrix.iloc[rows[0]].tolist(), svd_matrix.iloc[rows[1]].tolist()))

0.8201615286843972
0.9135114007289845


In [29]:
# Sanity check: cosine similarity of two hand-picked rows, first on the raw PPMI vectors, then on the SVD-reduced vectors
rows = [0,1902]
print(1 - spatial.distance.cosine(ppmi_matrix.iloc[rows[0]].tolist(), ppmi_matrix.iloc[rows[1]].tolist()))
print(1 - spatial.distance.cosine(svd_matrix.iloc[rows[0]].tolist(), svd_matrix.iloc[rows[1]].tolist()))

0.27637343680135684
0.42419370537528955


In [30]:
# Sanity check: cosine similarity of two hand-picked rows, first on the raw PPMI vectors, then on the SVD-reduced vectors
rows = [1900,1902]
print(1 - spatial.distance.cosine(ppmi_matrix.iloc[rows[0]].tolist(), ppmi_matrix.iloc[rows[1]].tolist()))
print(1 - spatial.distance.cosine(svd_matrix.iloc[rows[0]].tolist(), svd_matrix.iloc[rows[1]].tolist()))

0.2656633102558976
0.5963015293601855


In [31]:
ppmi_matrix.iloc[1]

εἰμί            0.844593
λέγω            1.000000
θεός            0.836977
οὗτος           0.857170
αὐτός           0.834168
                  ...   
Ἠσαῦ            0.673392
ἀγός            0.721751
ἐπικατάρατος    0.659825
εἰλικρινής      0.668505
ἐπιεικής        0.664194
Name: λέγω, Length: 2000, dtype: float64

In [32]:
def from_docs_to_embeddings(
    docs,
    vocabulary=None,
    min_freq=5,
    n_components=150,
    random_state=1,
    n_iter=10,
    exp=2,
):
    """Run the whole pipeline: co-occurrence -> PMI matrix -> SVD vectors -> cosine matrix.

    Parameters
    ----------
    docs : sequence of token sequences
        Co-occurrence windows passed to ``get_cooc``.
    vocabulary : sequence of tokens, optional
        Passed to ``get_cooc``.
    min_freq : int, default 5
        Passed to ``get_cooc``.
    n_components, random_state, n_iter
        Passed to ``svd_reduction``; the defaults are the values that were
        previously hard-coded here.
    exp : number, default 2
        Exponent passed to ``get_ppmi_df``.

    Returns
    -------
    list
        ``[cooc, vocabulary, pmi_matrix, word_vectors_df, pmi_svd_cos]``:
        the co-occurrence matrix, the vocabulary, the PMI DataFrame, the
        SVD-reduced word vectors (DataFrame indexed by vocabulary) and the
        vocabulary x vocabulary cosine-similarity DataFrame of those vectors.
    """
    cooc, vocabulary = get_cooc(docs, vocabulary=vocabulary, min_freq=min_freq)
    pmi_matrix = get_ppmi_df(cooc, vocabulary, exp=exp)
    # Hand scikit-learn a plain array rather than the DataFrame; the numbers
    # are the same and it skips the pandas-dtype inspection during validation.
    word_vectors_array = svd_reduction(
        pmi_matrix.to_numpy(),
        n_components=n_components,
        random_state=random_state,
        n_iter=n_iter,
    )
    word_vectors_df = pd.DataFrame(word_vectors_array, index=vocabulary)
    pmi_svd_cos = pd.DataFrame(
        cosine_similarity(word_vectors_array),
        columns=vocabulary,
        index=vocabulary,
    )
    return [cooc, vocabulary, pmi_matrix, word_vectors_df, pmi_svd_cos]

In [33]:
%%time
# Build the embedding bundle for each period with from_docs_to_embeddings,
# restricted to the first 2000 vocabulary entries.
# NOTE(review): the documents used here are the sentences plus `bigrams_*`
# (the bigrams that cross sentence boundaries) and no trigrams, whereas the
# earlier get_cooc call used `sents_bigrams_*` + `sents_trigrams_*` and the
# pickle file names written later say "sents+bgs+tgs" — confirm which corpus
# is intended.
data_1to3 = from_docs_to_embeddings(sents_1to3 + bigrams_1to3, vocabulary=vocabulary_1to3[:2000])
data_4to5 = from_docs_to_embeddings(sents_4to5 + bigrams_4to5, vocabulary=vocabulary_4to5[:2000])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


CPU times: user 2min 13s, sys: 1.76 s, total: 2min 15s
Wall time: 2min 5s


In [34]:
data_1to3[-1]

Unnamed: 0,εἰμί,λέγω,θεός,οὗτος,αὐτός,πᾶς,γίγνομαι,λόγος,ἄνθρωπος,ἔχω,...,διαφθορά,ἀστεῖος,τετράπους,Τιμόθεος,προγιγνώσκω,Ἠσαῦ,ἀγός,ἐπικατάρατος,εἰλικρινής,ἐπιεικής
εἰμί,1.000000,0.999414,0.999396,0.999797,0.999411,0.999240,0.999426,0.998424,0.997860,0.998832,...,0.479306,0.488988,0.396114,0.464423,0.462820,0.444603,0.362465,0.415683,0.539889,0.498687
λέγω,0.999414,1.000000,0.998942,0.999462,0.998892,0.998599,0.999008,0.997959,0.997491,0.998262,...,0.482979,0.487113,0.393836,0.465612,0.464885,0.448117,0.367451,0.419498,0.538425,0.498750
θεός,0.999396,0.998942,1.000000,0.999341,0.999284,0.999445,0.999108,0.997965,0.997992,0.998210,...,0.484193,0.489931,0.400092,0.469545,0.465575,0.448946,0.365327,0.419867,0.542845,0.501288
οὗτος,0.999797,0.999462,0.999341,1.000000,0.999610,0.999231,0.999282,0.998751,0.998058,0.998776,...,0.480379,0.487695,0.394921,0.464356,0.463495,0.446275,0.363591,0.414418,0.540992,0.500783
αὐτός,0.999411,0.998892,0.999284,0.999610,1.000000,0.999208,0.998801,0.998409,0.998291,0.998423,...,0.482126,0.489200,0.397144,0.464445,0.465545,0.447702,0.357497,0.413304,0.542627,0.502368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ἠσαῦ,0.444603,0.448117,0.448946,0.446275,0.447702,0.448971,0.449982,0.449322,0.452650,0.450269,...,0.605679,0.600913,0.406747,0.586849,0.625993,1.000000,0.609692,0.651709,0.518484,0.572262
ἀγός,0.362465,0.367451,0.365327,0.363591,0.357497,0.364253,0.367413,0.367840,0.369218,0.367879,...,0.551230,0.591386,0.415571,0.593518,0.554006,0.609692,1.000000,0.626738,0.471233,0.560774
ἐπικατάρατος,0.415683,0.419498,0.419867,0.414418,0.413304,0.419945,0.419410,0.412478,0.427419,0.420612,...,0.561707,0.509276,0.436674,0.642985,0.484428,0.651709,0.626738,1.000000,0.482672,0.503636
εἰλικρινής,0.539889,0.538425,0.542845,0.540992,0.542627,0.548935,0.541249,0.554593,0.554535,0.545084,...,0.573848,0.661916,0.484152,0.620912,0.585377,0.518484,0.471233,0.482672,1.000000,0.574721


In [35]:
# Persist both embedding bundles. The context managers close (and flush) each
# file as soon as it is written instead of leaving the handles open.
with open("../data/large_files/embeddings_sents+bgs+tgs_ppmi2_1to3.pkl", "wb") as pkl_file:
    pickle.dump(data_1to3, pkl_file, pickle.HIGHEST_PROTOCOL)
with open("../data/large_files/embeddings_sents+bgs+tgs_ppmi2_4to5.pkl", "wb") as pkl_file:
    pickle.dump(data_4to5, pkl_file, pickle.HIGHEST_PROTOCOL)