In [1]:
import re
from typing import List
import random as r
import binascii
import numpy as np
import ipyparallel as ipp

In [30]:
def to_set_of_shingles(text: str, n: int, hash_to_int=True) -> set:
    """Split ``text`` into the set of its word-level n-shingles.

    A shingle is ``n`` consecutive words (as produced by ``text.split(" ")``)
    joined back together with single spaces.

    Args:
        text: The text to shingle.
        n: Number of words per shingle; must be at least 1.
        hash_to_int: When true, each shingle is replaced by the CRC32 of its
            UTF-8 encoding; otherwise the shingle strings are kept as-is.

    Returns:
        The set of distinct shingles. The set is empty when ``text`` has
        fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # NOTE(review): split(" ") keeps empty tokens for consecutive spaces;
    # confirm whether whitespace-collapsing split() is wanted instead.
    words = text.split(" ")
    shingles = set()
    # There are exactly len(words) - n + 1 full windows of n words; the bound
    # must depend on n, otherwise short or missing shingles are produced.
    for i in range(len(words) - n + 1):
        shingle = " ".join(words[i:i + n])
        if hash_to_int:
            shingle = binascii.crc32(shingle.encode())
        shingles.add(shingle)
    return shingles

def read_data(filename: str, n_shingles=3, n_data=-1) -> List[dict]:
    """Read product reviews from a line-oriented text file.

    Lines starting with ``"product/productId: "`` and ``"review/text: "``
    supply the product id and the review text; a blank line ends the current
    record. HTML-like tags in the review text are replaced by a space.

    Args:
        filename: Path of the file to read (decoded as UTF-8, undecodable
            bytes are replaced).
        n_shingles: Unused; kept so existing callers keep working.
        n_data: Stop after this many records. The default ``-1`` never
            matches the record counter, so the whole file is read.

    Returns:
        A list of dicts with the keys ``"product_id"``, ``"review_id"``
        (running index starting at 0) and ``"text"``.
    """
    pid_prefix = "product/productId: "
    text_prefix = "review/text: "
    data = []

    with open(filename, 'r', encoding='UTF-8', errors="replace") as file:
        i = 0
        pid = 0
        text = ""
        pending = False  # True once the current record has seen a non-blank line

        for line in file:
            if line.strip() == "":
                # Only close a record that actually has content, so leading
                # or repeated blank lines do not emit phantom records.
                if pending:
                    data.append({"product_id": pid, "review_id": i, "text": text})
                    i += 1
                    pending = False
            else:
                pending = True
                if line.startswith(pid_prefix):
                    # Slice off the prefix instead of splitting on it, so a
                    # value that contains the marker text is not truncated.
                    pid = line[len(pid_prefix):].strip()
                elif line.startswith(text_prefix):
                    text = line[len(text_prefix):]
                    text = re.sub(r'<.*?>', ' ', text).strip()  # remove html tags
            if i == n_data:
                break

        # The file may end without a trailing blank line; keep that last
        # record unless the requested number of records was already reached.
        if pending and i != n_data:
            data.append({"product_id": pid, "review_id": i, "text": text})

    return data
    
def get_random_coefficients(n: int) -> List[int]:
    """Draw ``n`` distinct random integers from ``[0, 2**32 - 1]``.

    Values come from the module-level ``random`` generator (imported as
    ``r``), so seeding it makes the result reproducible.

    Args:
        n: Number of coefficients to draw.

    Returns:
        A list of ``n`` pairwise different integers, in the order drawn.
    """
    cs = []
    seen = set()  # constant-time duplicate check instead of scanning the list
    for _ in range(n):
        c = r.randint(0, 2**32-1)
        while c in seen:  # ensure c is unique
            c = r.randint(0, 2**32-1)
        seen.add(c)
        cs.append(c)
    return cs

def create_signature(shingles: List[int], length: int, co_a: List[int],co_b: List[int] ) -> List[int]:
    """Build a min-hash signature with ``length`` entries for ``shingles``.

    Entry ``k`` is the smallest value of
    ``(co_a[k] * shingle + co_b[k]) % 4294967311`` over all shingles.
    When ``shingles`` is empty every entry is ``4294967311 + 1``, a value
    no hash can reach.

    Args:
        shingles: Integer shingle hashes of one document.
        length: Number of signature entries to produce.
        co_a: Multiplicative coefficients, indexed ``0 .. length - 1``.
        co_b: Additive coefficients, indexed ``0 .. length - 1``.

    Returns:
        The list of ``length`` minimum hash values.
    """
    # presumably chosen as a prime just above 2**32 (the CRC32 range) - TODO confirm
    modulus = 4294967311
    no_hash = modulus + 1

    return [
        min(
            ((co_a[k] * item + co_b[k]) % modulus for item in shingles),
            default=no_hash,
        )
        for k in range(length)
    ]


def estimate_j_diff(sig1: List[int], sig2: List[int]) -> float:
    """Estimate the Jaccard distance of two documents from their signatures.

    The estimate is one minus the fraction of positions at which the two
    signatures hold the same value.

    Args:
        sig1: First signature.
        sig2: Second signature, compared position by position with ``sig1``.

    Returns:
        A value between 0 (all positions agree) and 1 (no position agrees).
    """
    left = np.array(sig1)
    right = np.array(sig2)
    n_equal = np.sum(left == right)
    similarity = n_equal / len(left)
    return 1 - similarity

In [20]:
N_DATA = 600      # number of reviews to load
LEN_SHINGLE = 4   # words per shingle
LEN_SIG = 10      # entries per min-hash signature

# read data
reviews = read_data("movies.txt", n_shingles=LEN_SHINGLE, n_data=N_DATA)

# convert to shingles
for review in reviews:
    review["shingles"] = to_set_of_shingles(review["text"], LEN_SHINGLE)

# create signatures (the same coefficients must be used for every review)
co_a = get_random_coefficients(LEN_SIG)
co_b = get_random_coefficients(LEN_SIG)
for review in reviews:
    review["signature"] = create_signature(review["shingles"], LEN_SIG, co_a, co_b)

# The former all-pairs loop computed every estimated Jaccard distance and then
# discarded it (its printing was disabled), so it has been removed.

# receive the product id
print("Please insert Movie product id")
movie_id = str(input()).strip()

# collect every review of the requested product (exact id match)
target_reviews = [review for review in reviews if review["product_id"] == movie_id]

if target_reviews:
    print("Product id found")
    # compare each review of the requested product with every other review
    for r1 in target_reviews:
        for r2 in reviews:
            if r1["review_id"] == r2["review_id"]:
                continue  # do not compare a review with itself
            d = estimate_j_diff(r1["signature"], r2["signature"])
            if d < 1:  # print d<1 only
                print(r1["review_id"], r1["product_id"])
                print(r2["review_id"], r2["product_id"])
                print(d)
else:
    print("Product id not found")

Starting 64 engines with <class 'ipyparallel.cluster.launcher.LocalEngineSetLauncher'>
100%|██████████| 64/64 [00:04<00:00, 10.33engine/s]
Stopping engine(s): 1668543849
engine set stopped 1668543849: {'engines': {'0': {'exit_code': 0, 'pid': 14729, 'identifier': '0'}, '1': {'exit_code': 0, 'pid': 14731, 'identifier': '1'}, '2': {'exit_code': 0, 'pid': 14733, 'identifier': '2'}, '3': {'exit_code': 0, 'pid': 14735, 'identifier': '3'}, '4': {'exit_code': 0, 'pid': 14741, 'identifier': '4'}, '5': {'exit_code': 0, 'pid': 14747, 'identifier': '5'}, '6': {'exit_code': 0, 'pid': 14753, 'identifier': '6'}, '7': {'exit_code': 0, 'pid': 14766, 'identifier': '7'}, '8': {'exit_code': 0, 'pid': 14786, 'identifier': '8'}, '9': {'exit_code': 0, 'pid': 14801, 'identifier': '9'}, '10': {'exit_code': 0, 'pid': 14819, 'identifier': '10'}, '11': {'exit_code': 0, 'pid': 14834, 'identifier': '11'}, '12': {'exit_code': 0, 'pid': 14855, 'identifier': '12'}, '13': {'exit_code': 0, 'pid': 14876, 'identifier': '

CompositeError: one or more exceptions raised in: [{'product_id': '0790747324', 'review_id': 599, 'text': 'Very interesting and well done.  Really enjoyed the story, acting and cinematography.'}]
[46:apply]TypeError: 'list' object is not callable
[53:apply]TypeError: 'list' object is not callable
[45:apply]TypeError: 'list' object is not callable
[63:apply]TypeError: 'list' object is not callable
.... 46 more exceptions ...