In [1]:
from collections import Counter
import numpy as np
import time
import itertools
import pickle
import seaborn as sns
import matplotlib.pylab as plt


In [2]:
# Turn the tag sequence of each text into a probability distribution over all
# tag n-grams, using overlapping sliding windows of length `slide_len`.
def get_probs(tag_list,slide_len=1):
    """Return, per text, the relative frequency of every possible tag n-gram.

    Each text is scanned with an overlapping sliding window of ``slide_len``
    characters. Only windows that are a combination of the tags "C", "P" and
    "N" of exactly that length are counted; shorter windows at the end of the
    text and windows containing any other character are ignored.

    Parameters
    ----------
    tag_list : iterable
        One tag sequence per text. The sequences are sliced and the slices
        are used as dictionary keys, so they are expected to be strings.
    slide_len : int, optional
        Window length (n-gram size). Defaults to 1.

    Returns
    -------
    list of list of float
        One row per text. Each row has ``3 ** slide_len`` entries ordered like
        ``itertools.product(["C", "P", "N"], repeat=slide_len)``. A row sums
        to 1, except for texts without a single valid window, whose row is
        all zeros (previously this case raised ``ZeroDivisionError``).
    """
    # Every possible tag combination of the requested length, e.g. "CC", "CP", ...
    tags = [
        ''.join(combo)
        for combo in itertools.product(["C", "P", "N"], repeat=slide_len)
    ]

    dict_list = []

    for text in tag_list:
        # Count every window once; windows that are not a valid tag
        # combination are simply never looked up below.
        window_counts = Counter(
            text[i:i + slide_len] for i in range(len(text))
        )

        # Normalise by the number of *valid* windows only.
        total = sum(window_counts[tag] for tag in tags)

        if total == 0:
            # Empty text, text shorter than the window, or no known tags:
            # there is nothing to normalise, so emit an all-zero row.
            dict_list.append([0.0] * len(tags))
        else:
            dict_list.append([window_counts[tag] / total for tag in tags])

    return dict_list

In [3]:
def H(p, q):
    """Return the Hellinger distance between the distributions p and q.

    Computes ``sqrt(sum((sqrt(p) - sqrt(q)) ** 2)) / sqrt(2)``.

    Parameters
    ----------
    p, q : array-like
        Probability distributions of identical shape.

    Returns
    -------
    numpy.float64
        The distance for one-dimensional input. The sum runs over the first
        axis, so higher-dimensional input yields one value per remaining
        index.

    Raises
    ------
    ValueError
        If p and q do not have the same shape. The element-wise loop this
        replaces silently ignored surplus entries of q.
    """
    p_arr = np.asarray(p, dtype=float)
    q_arr = np.asarray(q, dtype=float)

    if p_arr.shape != q_arr.shape:
        raise ValueError(
            "p and q must have the same shape, got "
            + str(p_arr.shape) + " and " + str(q_arr.shape)
        )

    root_diff = np.sqrt(p_arr) - np.sqrt(q_arr)
    squared_sum = np.sum(root_diff ** 2, axis=0)

    return (1.0 / np.sqrt(2.0)) * np.sqrt(squared_sum)

In [4]:
def get_dis_mat(tag_list1,tag_list2):
    """Build the pairwise distance matrix between two lists of distributions.

    Entry ``[row, col]`` of the result is ``H(tag_list1[row], tag_list2[col])``.

    Parameters
    ----------
    tag_list1, tag_list2 : sequence
        Sequences of distributions; each element is passed to ``H`` as is.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tag_list1), len(tag_list2))``.
    """
    n_rows, n_cols = len(tag_list1), len(tag_list2)
    dis = np.zeros((n_rows, n_cols))

    for row, dist_a in enumerate(tag_list1):
        for col, dist_b in enumerate(tag_list2):
            dis[row, col] = H(dist_a, dist_b)

    return dis

In [5]:
def _load_tags(corpus):
    """Load the pickled tag list stored at ../Data/tag_<corpus>.pkl."""
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only use this with corpus files from a trusted source.
    with open("../Data/tag_"+corpus+".pkl", "rb") as f:
        return pickle.load(f)


def get_results(corpus1,corpus2,slide_len=3,empty=False,show_plot=False):
    """Compare two tagged corpora and print summary statistics.

    Loads the tag lists of both corpora, converts every text into a tag
    n-gram distribution with ``get_probs`` and computes all pairwise
    distances with ``get_dis_mat``. The mean and standard deviation of that
    matrix are printed. The printed label says "similarity", but the value is
    the mean of the distance matrix.

    Parameters
    ----------
    corpus1, corpus2 : str
        Corpus names; each is inserted into the path ../Data/tag_<name>.pkl.
    slide_len : int, optional
        Window length forwarded to ``get_probs``. Defaults to 3.
    empty : bool, optional
        If true, every "N" is removed from the tag strings before counting.
    show_plot : bool, optional
        If true, the distance matrix is shown as a heatmap.

    Returns
    -------
    None
    """
    tag_list_1 = _load_tags(corpus1)
    tag_list_2 = _load_tags(corpus2)
    print("Corpus one is:",corpus1 ,"Corpus two is:",corpus2)

    if empty:
        tag_list_1 = [s.replace('N', '') for s in tag_list_1]
        tag_list_2 = [s.replace('N', '') for s in tag_list_2]

    probs_1 = get_probs(tag_list_1, slide_len=slide_len)
    probs_2 = get_probs(tag_list_2, slide_len=slide_len)

    dis = get_dis_mat(probs_1, probs_2)

    print("The overall similarity is:",round(np.mean(dis),2))
    print("The standard deviation is:",round(np.std(dis),2))
    print()

    if show_plot:
        # Hiding the upper triangle is only valid when a corpus is compared
        # with itself, because only then is the matrix symmetric. For two
        # different corpora every cell is a distinct pair and must be shown.
        mask = None
        if corpus1 == corpus2:
            mask = np.zeros_like(dis, dtype=bool)
            mask[np.triu_indices_from(mask)] = True

        with sns.axes_style("white"):
            ax = sns.heatmap(dis, mask=mask, square=True,  cmap="YlGnBu")
            # Rows are texts of corpus1, columns are texts of corpus2.
            ax.set(xlabel=corpus2, ylabel=corpus1)
            plt.show()





In [7]:
# Names of the corpora to compare; each one is handed to get_results, which
# uses it to locate the corresponding tag file.
tag_list=["usdeb","essay","micro","hidey"]

# Run the comparison for every ordered pair of corpora, including each corpus
# against itself, in the same order as two nested loops over tag_list.
for i, j in itertools.product(tag_list, repeat=2):
    get_results(i, j, slide_len=1, empty=True)

Corpus one is: usdeb Corpus two is: usdeb
The overall similarity is: 0.09
The standard deviation is: 0.07

Corpus one is: usdeb Corpus two is: essay
The overall similarity is: 0.13
The standard deviation is: 0.09

Corpus one is: usdeb Corpus two is: micro
The overall similarity is: 0.26
The standard deviation is: 0.09

Corpus one is: usdeb Corpus two is: hidey
The overall similarity is: 0.11
The standard deviation is: 0.09

Corpus one is: essay Corpus two is: usdeb
The overall similarity is: 0.13
The standard deviation is: 0.09

Corpus one is: essay Corpus two is: essay
The overall similarity is: 0.07
The standard deviation is: 0.06

Corpus one is: essay Corpus two is: micro
The overall similarity is: 0.14
The standard deviation is: 0.07

Corpus one is: essay Corpus two is: hidey
The overall similarity is: 0.1
The standard deviation is: 0.08

Corpus one is: micro Corpus two is: usdeb
The overall similarity is: 0.26
The standard deviation is: 0.09

Corpus one is: micro Corpus two is: es