In [1]:
import sgt
from sgt import SGT
import numpy as np
import pandas as pd
from numpy.linalg import norm

# Single-sequence example: build the symbol array, then fit an SGT model on it.
# NOTE: `sgt` is rebound here from the imported `sgt` module to an SGT instance.
sequence = np.array(list("BBACACAABA"))
sgt = SGT(flatten=True)
# Last expression of the cell, so the value returned by fit() is displayed.
sgt.fit(sequence)

(A, A)    0.090616
(A, B)    0.131002
(A, C)    0.261849
(B, A)    0.086569
(B, B)    0.123042
(B, C)    0.052544
(C, A)    0.137142
(C, B)    0.028263
(C, C)    0.135335
dtype: float64

### Testing a corpus

In [2]:
# Toy corpus: one row per sequence, keyed by `id`.
corpus = pd.DataFrame({
    'id': [0, 1, 2, 3],
    'sequence': [
        [1, 3, 4, 6, 10],
        [1, 3, 4, 7, 8],
        [1, 3, 5, 6, 10],
        [2, 1, 8, 9, 7],
    ],
})
# Learn the SGT embedding vector of every sequence in the corpus in one call.
# mode: 'default'
sgt = SGT(kappa=1, flatten=True, lengthsensitive=False, mode='default')
embedding = sgt.fit_transform(corpus)

In [3]:
# Show the frame returned by fit_transform: an `id` column followed by one
# column per symbol pair.
embedding

Unnamed: 0,id,"(1, 1)","(1, 2)","(1, 3)","(1, 4)","(1, 5)","(1, 6)","(1, 7)","(1, 8)","(1, 9)",...,"(10, 1)","(10, 2)","(10, 3)","(10, 4)","(10, 5)","(10, 6)","(10, 7)","(10, 8)","(10, 9)","(10, 10)"
0,0.0,0.0,0.0,0.367879,0.135335,0.0,0.049787,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.367879,0.135335,0.0,0.0,0.049787,0.018316,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.367879,0.0,0.135335,0.049787,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049787,0.367879,0.135335,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Drop the leading `id` column and keep only the embedding values as a NumPy
# array. The isinstance guard makes this cell safe to re-run: after the first
# run `embedding` is an ndarray, which has no `.iloc`, so an unguarded second
# run would raise AttributeError.
if isinstance(embedding, pd.DataFrame):
    embedding = np.array(embedding.iloc[:, 1:])

In [7]:
# Pairwise Euclidean distance and cosine similarity between embedding rows.
# Only the strict upper triangle (i < j) is filled in; every other entry
# keeps its NaN placeholder.
n = len(embedding)

# Float NaN placeholders keep both matrices numeric. An object array filled
# with None cannot be fed to numeric reductions or arithmetic afterwards.
distance = np.full((n, n), np.nan)
cos_sim = np.full((n, n), np.nan)

# Every row norm is reused for each pair the row takes part in, so compute
# them once instead of inside the double loop.
row_norms = [norm(row) for row in embedding]

for i in range(n):
    for j in range(i + 1, n):
        distance[i, j] = norm(embedding[i] - embedding[j])
        cos_sim[i, j] = np.inner(embedding[i], embedding[j]) / (row_norms[i] * row_norms[j])

In [8]:
# Render the pairwise distance matrix as a table; only entries with
# row index < column index were populated by the loop above.
pd.DataFrame(distance)

Unnamed: 0,0,1,2,3
0,,0.790689,0.783967,1.09689
1,,,0.965656,1.08845
2,,,,1.09689
3,,,,


In [9]:
# Render the pairwise cosine-similarity matrix as a table; only entries with
# row index < column index were populated.
pd.DataFrame(cos_sim)

Unnamed: 0,0,1,2,3
0,,0.480378,0.489176,0.0
1,,,0.224966,0.0153208
2,,,,0.0
3,,,,


In [43]:
# Explicit symbol set handed to SGT: the integers -1 through 156.
alphabets = range(-1, 157)

# Three sequences of 24 symbols each; sequence 2 starts like sequence 0
# and ends like sequence 1.
corpus = pd.DataFrame({
    'id': [0, 1, 2],
    'sequence': [
        [20, 57, 100, 116, 116, 116, 3, 3, 128, 67, 67, 67,
         135, 135, 88, -1, -1, -1, -1, 96, 96, 78, 119, 83],
        [82, 82, 82, 82, 12, 40, 50, 86, 86, 31, 1, 45,
         56, 56, 56, 35, 35, 12, 50, 50, 33, 33, 33, 117],
        [20, 57, 100, 116, 116, 116, 3, 86, 86, 31, 1, 45,
         56, 56, 56, 35, 35, 12, 50, 50, 33, 33, 33, 117],
    ],
})
# Learn the SGT embedding vector of every sequence in the corpus.
# mode: 'default'
sgt = SGT(alphabets=alphabets, flatten=True, lengthsensitive=False, mode='default')
embedding = sgt.fit_transform(corpus)

In [44]:
# Raise the per-cell display width so the sequence lists are not truncated.
pd.set_option('display.max_colwidth', 200)
corpus

Unnamed: 0,id,sequence
0,0,"[20, 57, 100, 116, 116, 116, 3, 3, 128, 67, 67, 67, 135, 135, 88, -1, -1, -1, -1, 96, 96, 78, 119, 83]"
1,1,"[82, 82, 82, 82, 12, 40, 50, 86, 86, 31, 1, 45, 56, 56, 56, 35, 35, 12, 50, 50, 33, 33, 33, 117]"
2,2,"[20, 57, 100, 116, 116, 116, 3, 86, 86, 31, 1, 45, 56, 56, 56, 35, 35, 12, 50, 50, 33, 33, 33, 117]"


In [45]:
# Show the frame returned by fit_transform for this corpus: an `id` column
# followed by one column per symbol pair of the supplied alphabet.
embedding

Unnamed: 0,id,"(-1, -1)","(-1, 0)","(-1, 1)","(-1, 2)","(-1, 3)","(-1, 4)","(-1, 5)","(-1, 6)","(-1, 7)",...,"(156, 147)","(156, 148)","(156, 149)","(156, 150)","(156, 151)","(156, 152)","(156, 153)","(156, 154)","(156, 155)","(156, 156)"
0,0.0,0.237349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Keep only the embedding values (the first column is `id`). Guarded so that
# re-running the cell does not fail once `embedding` is already an ndarray,
# which has no `.iloc`.
if isinstance(embedding, pd.DataFrame):
    embedding = np.array(embedding.iloc[:, 1:])

# Euclidean distance for each pair of the three sequences.
for i, j in [(0, 1), (0, 2), (1, 2)]:
    print(f'Distance between {i} and {j}: {norm(embedding[i] - embedding[j])}')

Distance between 0 and 1: 1.6844195632346601
Distance between 0 and 2: 1.5382730486223746
Distance between 1 and 2: 0.9400599241681148


In [41]:
def _cosine(u, v):
    """Return the inner product of u and v divided by the product of their norms."""
    return np.inner(u, v) / (norm(u) * norm(v))


# Cosine similarity for each pair of the three embedding rows.
cos_sim_01 = _cosine(embedding[0], embedding[1])
cos_sim_02 = _cosine(embedding[0], embedding[2])
cos_sim_12 = _cosine(embedding[1], embedding[2])

print(f'Cosine similarity between 0 and 1: {cos_sim_01}')
print(f'Cosine similarity between 0 and 2: {cos_sim_02}')
print(f'Cosine similarity between 1 and 2: {cos_sim_12}')

Cosine similarity between 0 and 1: 0.0
Cosine similarity between 0 and 2: 0.2695321723266291
Cosine similarity between 1 and 2: 0.7096687409807811
