In [1]:
import os
import re
import time
from random import randint
from decimal import *

import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('sample.csv')
data_doc = {}
i = 0
t0 = time.time()
for value in data["content"]:
    data = re.sub("[^\w]", " ", value.lower())
    data = ' '.join(data.split())
    data = data.replace('\r', '')
    data = data.replace('\t', '')
    data_doc[i] = data.split()
    if data_doc[i]:
        i = i+1
    else:
        del data_doc[i]
print(time.time() - t0)
print("Done")

0.06328558921813965
Done


In [5]:
def shingling(documents, shingle_size=8,):

    if os.path.exists('shingle_matrix.csv'):
        print("Using Previous Pre-Processed Data")
        shingle_matrix = pd.read_csv("shingle_matrix.csv")
        return shingle_matrix

    shingle_sets = {}
    shingles = set()
    t0 = time.time()

    print("Data Processing")

    for i in range(0, len(documents)):
        words = documents[i]

        shingles_in_doc = set()

        for index in range(len(words) - shingle_size + 1):

            # K-Shingle formation
            shingle = words[index:index+shingle_size]
            shingle = ' '.join(shingle)

            if shingle not in shingles:
                shingles.add(shingle)

            if shingle not in shingles_in_doc:
                shingles_in_doc.add(shingle)

            else:
                del shingle
                index = index - 1

        shingle_sets[i] = shingles_in_doc

    shingle_matrix = pd.DataFrame(index=[i for i in shingles], columns=[
                                  i for i in range(len(shingle_sets))])
    print(shingle_matrix)

    for value in shingles:
        for j in range(len(shingle_sets)):
            if value in shingle_sets[j]:
                shingle_matrix.at[value, j] = 1
            else:
                shingle_matrix.at[value, j] = 0
    del shingles_in_doc, words, shingle_sets, shingles

    shingle_matrix.to_csv('shingle_matrix.csv')
    print("Time took for shingling: {}".format(time.time()-t0)+"s")

    return shingle_matrix


sparse_matrix = shingling(data_doc)

Data Processing
                                                     0    1    2    3    4   \
hire day laborers once the workers gathered around  NaN  NaN  NaN  NaN  NaN   
prevailed inside the company if one sells sandw...  NaN  NaN  NaN  NaN  NaN   
be attorney general has either not gotten meetings  NaN  NaN  NaN  NaN  NaN   
to broadcasters ms kelly s special performed about  NaN  NaN  NaN  NaN  NaN   
while facing slightly expanded democratic minor...  NaN  NaN  NaN  NaN  NaN   
...                                                 ...  ...  ...  ...  ...   
managed to avoid scaring her by leaving out         NaN  NaN  NaN  NaN  NaN   
names of some of the clients targets in             NaN  NaN  NaN  NaN  NaN   
virtuous kind and gentle people he could ever       NaN  NaN  NaN  NaN  NaN   
you re stripped of everything except your dignity   NaN  NaN  NaN  NaN  NaN   
floundered in the ratings among his moves since     NaN  NaN  NaN  NaN  NaN   

                                   

In [6]:
def hash_coeff(rows, no_of_hash_functions=100):

    hashes = []
    c = rows

    for i in range(no_of_hash_functions):
        def hash(x): 
            return (randint(1, 10*c)*int(x) + randint(1, 10*c)) % c
        hashes.append(hash)

    return hashes

In [9]:
def minhash_sig_matrix(sparse_matrix, hash_functions=100,):
    if os.path.exists("minhash_matrix.csv"):
        print("Pre Processed Data")
        minhash_matrix = pd.read_csv("minhash_matrix.csv")
        return minhash_matrix

    t0 = time.time()

    minhash_matrix = pd.DataFrame(index=[i for i in range(
        hash_functions)], columns=sparse_matrix.columns)

    rows, cols = sparse_matrix.shape

    signature = hash_coeff(rows)

    for i in sparse_matrix.index:
        for j in sparse_matrix.columns:
            if sparse_matrix.at[i, j] == 1:
                for k in range(hash_functions):
                    if np.isnan(minhash_matrix.at[k, j]):
                        minhash_matrix.at[k, j] = signature[k](j)
                    else:
                        minhash_matrix.at[k, j] = min(
                            minhash_matrix.at[k, j], signature[k](j))
    minhash_matrix = minhash_matrix.loc[:, ~minhash_matrix.columns.str.contains('^Unnamed')]
    minhash_matrix.to_csv("minhash_matrix.csv")

    print(time.time() - t0)

minhash_matrix = minhash_sig_matrix(sparse_matrix)

KeyboardInterrupt: 

In [None]:
def band_hashing(band, bucket_dict):
    for col in band.columns:
        h = hash(tuple(band[col].values))
        if h in bucket_dict:
            bucket_dict[h].append(col)
        else:
            bucket_dict[h] = [col]

In [None]:
def get_bucket_list(minhash_matrix, r):

    n = minhash_matrix.shape[0]
    b = n//r

    bucket_list = [dict() for i in range(b)]

    for i in range(0, n-r+1, r):
        band = minhash_matrix.loc[i:i+r-1,:]

        band_hashing(band, bucket_list[int(i/r)])
        
    return bucket_list

In [None]:
bucket_list = get_bucket_list(minhash_matrix, 2)

In [None]:
print(bucket_list)