In [1]:
import textract
import os
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
# Base directory; the pickle output paths later in the notebook are built relative to it.
root = Path(".")


In [2]:
# Paths of every entry found under Docs/Auto and Docs/Property.
files = list()
for sub_dir in ["Auto", "Property"]:
    # os.path.join uses the platform's separator, so the cell is no longer
    # tied to Windows; on Windows it still produces ".\Docs\<sub_dir>".
    cur_dir = os.path.join(".", "Docs", sub_dir)
    for name in os.listdir(cur_dir):
        files.append(os.path.join(cur_dir, name))
# os.listdir returns entries in arbitrary order; sorting keeps the
# document order (and therefore document indices) stable between runs.
files.sort()

In [3]:
# One normalised string per extracted document: lowercase alphanumerics
# separated by single blanks.
# Note: a file whose extracted text is empty is skipped, so positions in
# `documents` are not guaranteed to line up with positions in `files`.
documents = []

for path in files:
    raw_text = textract.process(path).decode("utf8")

    # Keep alphanumeric characters (lowercased one by one); everything else
    # becomes a blank.
    cleaned = [ch.lower() if ch.isalnum() else ' ' for ch in raw_text]

    # Drop a blank whenever the element after it is also a blank, which
    # collapses each run of blanks down to its last one.
    collapsed = [
        here
        for here, after in zip(cleaned, cleaned[1:])
        if not (here == ' ' and after == ' ')
    ]

    if cleaned:
        # The pairwise pass above never emits the final element.
        collapsed.append(cleaned[-1])
        documents.append(''.join(collapsed))

In [4]:
# Display the normalised document strings built in the previous cell.
documents

 'allstate austo insurance poliacy policy issued to m effective p l e d o c u m e anu127 1 t allstate insurance company stable of contents general 2 when and where the policy applies 2 changes 2 duty to report autos 2 combining limits of two or more autosa payment of benefits autopsy 9 consent of beneficiary 9 part 4 automobile disability income protection coverage cw 9proof of claim medical reports 9 prohibited 2 transfer 2 cancellation 2 insuring agreement 9 insured persons 9 definitions 9 part 1 automobilemliability insurance exclusions what is not covered 9 coverages aa and bb 3 insuring agreement 3 to whom and when payment is made 10 proof of claim medical reports 10 padditional payments allstate will make 3 insured persons 4 part 5 uninsured motorists insurance coverage ss 10 insured autos 4 definitions 4 insuring agreement 10 insured persons 10 exclusions what is not covered 4 definitions 11 financial responsibility l 5 exclusions what is not covered 11 limits of liability 5 if 

## Choosing K

In [5]:
def get_shingle_set(documents, k):
    """Collect every character k-shingle that occurs in ``documents``.

    Parameters
    ----------
    documents : sequence of str
        The texts to shingle; a document's id is its position in this sequence.
    k : int
        Shingle length in characters.

    Returns
    -------
    tuple
        ``(shingles, shingle_doc_id)`` where ``shingles`` is the set of all
        distinct length-``k`` substrings and ``shingle_doc_id`` maps each
        shingle to the list of ids of the documents containing it, in
        ascending order with each id listed once.

    Documents shorter than ``k`` contribute no shingles.
    """
    shingles = set()
    shingle_doc_id = dict()

    for doc_id, text in enumerate(documents):
        for start in range(len(text) - k + 1):
            cur_shingle = text[start : start + k]

            shingles.add(cur_shingle)

            doc_ids = shingle_doc_id.setdefault(cur_shingle, [])
            # Documents are processed in order, so if this document was
            # already recorded for the shingle its id is the last entry.
            # Checking the tail avoids storing one id per occurrence.
            if not doc_ids or doc_ids[-1] != doc_id:
                doc_ids.append(doc_id)

    return (shingles, shingle_doc_id)

In [6]:
# Print the number of distinct shingles for each candidate shingle length.
for k in range(2, 15):
    distinct_shingles = get_shingle_set(documents, k)[0]
    print(k, len(distinct_shingles))

2 782
3 5546
4 21106
5 54265
6 107153
7 176014
8 256246
9 340657
10 423212
11 502426
12 576688
13 645218
14 707339


Based on these counts, we will take K = 10.

In [7]:
# Shingle length used for the rest of the notebook.
K = 10
shingles, shingle_doc_id = get_shingle_set(documents, K)
# Sort the distinct shingles so that each one gets a stable position.
shingles = sorted(shingles)

In [8]:
# Display the sorted list of distinct shingles.
shingles

[' 0 2 pen u',
 ' 0 20 3758',
 ' 0 250 les',
 ' 0 7176 17',
 ' 00 01 03 ',
 ' 00 01 06 ',
 ' 00 am to ',
 ' 00 conten',
 ' 00 email ',
 ' 00 noon o',
 ' 00 pm mon',
 ' 000 000 a',
 ' 000 000 f',
 ' 000 000 i',
 ' 000 000 l',
 ' 000 000 o',
 ' 000 000 p',
 ' 000 000 t',
 ' 000 000 w',
 ' 000 000 y',
 ' 000 2 5 o',
 ' 000 2 abl',
 ' 000 2082 ',
 ' 000 4 abl',
 ' 000 5 the',
 ' 000 500 w',
 ' 000 addit',
 ' 000 after',
 ' 000 alter',
 ' 000 an ad',
 ' 000 and 5',
 ' 000 and f',
 ' 000 and t',
 ' 000 any m',
 ' 000 any o',
 ' 000 as a ',
 ' 000 assau',
 ' 000 b in ',
 ' 000 b tot',
 ' 000 being',
 ' 000 break',
 ' 000 by a ',
 ' 000 c in ',
 ' 000 c los',
 ' 000 car a',
 ' 000 d tot',
 ' 000 damag',
 ' 000 deduc',
 ' 000 durin',
 ' 000 europ',
 ' 000 even ',
 ' 000 exces',
 ' 000 five ',
 ' 000 fly t',
 ' 000 for a',
 ' 000 for c',
 ' 000 for e',
 ' 000 for g',
 ' 000 for i',
 ' 000 for l',
 ' 000 for s',
 ' 000 for t',
 ' 000 for y',
 ' 000 fraud',
 ' 000 glass',
 ' 000 highe',
 ' 000 how

In [9]:
# One row per shingle, one column per document; every entry starts at zero.
n_shingles, n_documents = len(shingles), len(documents)
incident_matrix = np.zeros((n_shingles, n_documents))
incident_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Row index of each shingle: its position in the sorted `shingles` list.
shingle_id = {shingle: row for row, shingle in enumerate(shingles)}

# Set entry (shingle row, document column) to 1 for every document id
# recorded against the shingle.
for shingle, doc_ids in shingle_doc_id.items():
    incident_matrix[shingle_id[shingle], doc_ids] = 1

incident_matrix

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Persist the incidence matrix, the sorted shingle list and the
# shingle -> row-index mapping as pickle files under Pickled_files/.
pickle_dir = root / "Pickled_files"
# Create the output directory on first run instead of failing on open().
pickle_dir.mkdir(parents=True, exist_ok=True)

for file_name, payload in [
    ("Incident_Matrix", incident_matrix),
    ("Shingles", shingles),
    ("Shingle_id", shingle_id),
]:
    my_path = pickle_dir / file_name
    # The context manager closes the file even if pickle.dump raises.
    with open(my_path, 'wb') as dbfile:
        pickle.dump(payload, dbfile)