In [1]:
from sklearn.neighbors import KDTree
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import pickle
from tqdm import tqdm
DATA_DIR = "../data/"
COSTHRESH = 0.55

In [2]:
with open("embed_dict.pkl",'rb') as f:
    embeddings_index = pickle.load(f)

with open("file_to_words_tf_idf.json") as f:
    file_keywords = json.load(f)

keywords = {}
# To create a tree
for filename in file_keywords:
    for word in file_keywords[filename]:
        if word in keywords:
            keywords[word] +=1
        else:
            keywords[word] = 1

In [3]:
# To create a tree

key_index = {}
index_key ={}
keyword_matrix = []
sz = len(embeddings_index.get("court"))
idx = 0
for i, key in enumerate(reversed(sorted(keywords.items(), key=lambda x: x[1]))):
    if key[0] in embeddings_index:
        key_index[key[0]] = idx
        index_key[idx] = key[0]
        keyword_matrix.append(embeddings_index.get(key[0]))
        idx+=1
    
keyword_matrix = np.array(keyword_matrix)
tree = KDTree(keyword_matrix)

In [4]:
def get_similar_words(inword, numwords):
    if inword not in embeddings_index:
        return []
    dist, ind = tree.query(embeddings_index.get(inword).reshape(1,-1),k=numwords) 
    simlist = []
    for hkey in ind[0]:
#         print(index_key[hkey],inword) # correct 
        simlist.append(index_key[hkey])
    return simlist

def get_keyword(lstkeys):
    actual_word = {}
    for word in lstkeys:
        similar_words = get_similar_words(word,1)
        if len(similar_words)==0:
            continue
        if cosine_similarity(embeddings_index.get(similar_words[0]).reshape(1,-1),embeddings_index.get(word.lower()).reshape(1,-1))[0] > COSTHRESH:
            actual_word[word] = similar_words[0]
#             print(actual_word[word],similar_words[0]," dfd")
    return actual_word

In [8]:
with open("keywords_to_cases.json") as ktc:
    case_keywords = json.load(ktc) 

In [5]:
keytocase = {}
for file in file_keywords:
    for word in file_keywords[file]:
        if word in keytocase:
            keytocase[word].append(file)
        else:
            keytocase[word] = [file]
        

In [6]:
def get_similar_cases(lstkeys,topn):
    keywords = get_keyword(lstkeys)
    case_score = {}
    for keyword in keywords:
        true_keyword = keywords[keyword]
        if true_keyword in keytocase:
            for case in keytocase[true_keyword]:
                if case in case_score:
                    case_score[case].append(keyword)
                else:
                    case_score[case] = [keyword]
    similar_cases = []
    for i,key in enumerate(reversed(sorted(case_score.items(), key=lambda x: len(x[1])))):
        if i == topn:
            break
#         print(key[0],key[1])
        similar_cases.append((key[0],key[1]))
    return similar_cases

In [8]:
TOP_SIMILAR = 5

sim_cases = {}
for filename in tqdm(file_keywords):
    lstkeys = file_keywords[filename]
    sim_list = []
    for case in get_similar_cases(lstkeys,TOP_SIMILAR + 1):
        if case[0] != filename:
            sim_list.append(case[0])
    if len(sim_list)== TOP_SIMILAR + 1:
        sim_cases[filename] = sim_list[:-1]
    else:
        sim_cases[filename] = sim_list

with open("similar_cases.json","w+") as f:
    json.dump(sim_cases,f)
    
lstkeys = ["India", "paid", "cusmer"]
get_similar_cases(lstkeys,10)


100%|██████████| 53211/53211 [41:18<00:00, 20.16it/s] 


[('1998_S_440', ['India', 'paid']),
 ('1998_N_37', ['India', 'paid']),
 ('1997_A_90', ['India', 'paid']),
 ('1983_K_7', ['India', 'paid']),
 ('2016_U_25', ['India', 'paid']),
 ('1997_B_130', ['India', 'paid']),
 ('2004_C_197', ['India', 'paid']),
 ('2009_C_134', ['India', 'paid']),
 ('2015_J_11', ['India', 'paid']),
 ('1990_U_13', ['India', 'paid'])]

In [9]:
sim_cases

{'2005_F_3': ['2017_U_46',
  '1996_T_64',
  '1993_S_38',
  '2004_S_137',
  '1994_S_140'],
 '1970_S_262': ['1970_V_3',
  '1974_D_35',
  '1972_G_28',
  '1970_P_25',
  '1971_S_34'],
 '2008_S_160': ['2007_A_120',
  '2008_G_161',
  '2000_N_13',
  '1963_M_37',
  '2001_H_31'],
 '2000_L_27': ['2010_B_15',
  '2005_S_109',
  '2003_J_80',
  '2009_T_71',
  '2008_S_780'],
 '1969_U_28': ['2015_D_35', '1973_R_39', '2015_D_38', '1976_K_3', '2003_S_34'],
 '2009_S_249': ['2006_M_280',
  '2001_S_683',
  '1975_S_145',
  '1996_S_707',
  '1996_S_945'],
 '1992_G_23': ['1996_S_683',
  '1973_T_56',
  '2009_R_192',
  '2007_S_282',
  '1997_M_52'],
 '2001_S_1081': ['2009_V_118',
  '1962_T_94',
  '2013_G_1',
  '2001_S_154',
  '2007_D_109'],
 '2008_M_258': ['2004_A_60',
  '2007_U_206',
  '2010_M_167',
  '1997_K_50',
  '1967_K_14'],
 '2010_M_10': ['1986_T_4', '2013_N_44', '1990_B_30', '2011_B_35', '2012_G_2'],
 '2017_R_41': ['1994_U_123',
  '2010_S_32',
  '2003_U_88',
  '1990_D_86',
  '2006_K_50'],
 '1967_H_10': ['1

In [14]:
lstkeys = ["India", "paid", "cusmer"]
get_similar_cases(lstkeys,10)

[('1953_M_12', ['India', 'paid']),
 ('1960_S_78', ['India', 'paid']),
 ('1955_A_12', ['India', 'paid']),
 ('1966_B_15', ['India', 'paid']),
 ('1957_T_66', ['India', 'paid']),
 ('1962_S_113', ['India', 'paid']),
 ('1962_S_93', ['India', 'paid']),
 ('1983_U_15', ['India', 'paid']),
 ('1953_T_14', ['India', 'paid']),
 ('1968_R_36', ['India', 'paid'])]

In [31]:
for filename in tqdm(file_keywords):
    lstkeys = file_keywords[filename]
    sim_list = []
    for case in get_similar_cases(lstkeys,5):
        sim_list.append(case[0])
    print(file_keywords[filename],file_keywords[sim_list[0]],sim_list[0])
    break

  0%|          | 0/53211 [00:00<?, ?it/s]

mark mark  dfd
sheet sheet  dfd
selection selection  dfd
tribunal tribunal  dfd
grade grade  dfd
test test  dfd
candidate candidate  dfd
practical practical  dfd
review review  dfd
affidavit affidavit  dfd
applicant applicant  dfd
minute minute  dfd
dispose dispose  dfd
weight weight  dfd
engineer engineer  dfd
note note  dfd
advert advert  dfd
cent cent  dfd
trade trade  dfd
representation representation  dfd
2001_K_18 ['advert', 'weight', 'candidate', 'mark', 'test', 'practical', 'engineer', 'selection', 'sheet', 'cent', 'dispose', 'tribunal', 'grade', 'affidavit', 'note', 'representation', 'minute', 'review', 'applicant', 'trade']
2001_A_276 ['candidate', 'mark', 'test', 'engineer', 'selection', 'tribunal', 'applicant']
2005_R_56 ['selection', 'dispose', 'tribunal', 'grade', 'review', 'applicant']
2006_S_65 ['weight', 'candidate', 'mark', 'engineer', 'selection', 'grade']
2009_B_10 ['candidate', 'engineer', 'selection', 'tribunal', 'minute', 'review']
['mark', 'sheet', 'selection', 




In [28]:
file_keywords

{'2001_K_18': ['mark',
  'sheet',
  'selection',
  'tribunal',
  'grade',
  'test',
  'candidate',
  'practical',
  'review',
  'affidavit',
  'applicant',
  'minute',
  'dispose',
  'weight',
  'engineer',
  'note',
  'advert',
  'cent',
  'trade',
  'representation'],
 '1964_K_42': ['profit',
  'decree',
  'subordinate',
  'holder',
  'item',
  'fix',
  'raise',
  'delivery',
  'rate',
  'period',
  'preliminary',
  'possession',
  'suit',
  'construe',
  'decreed',
  'debtor',
  'enquiry',
  'final',
  'rent',
  'commissioner'],
 '2007_C_157': ['assessee',
  'tax',
  'sale',
  'excise',
  'duty',
  'exemption',
  'collect',
  'deposit',
  'payable',
  'price',
  'paid',
  'central',
  'calculate',
  'amount',
  'intimation',
  'commissioner',
  'deduct',
  'reply',
  'tribunal',
  'element'],
 '2007_P_133': ['notification',
  'amendment',
  'refund',
  'fee',
  'unreasonable',
  'june',
  'company',
  'ultra',
  'patna',
  'vires',
  'strike',
  'capital',
  'struck',
  'schedule',
