In [6]:
import pandas as pd
import numpy as np
import random
import itertools

# Seed the stdlib generator; generate_hash_functions (random.randint) and
# evaluate_minhash_accuracy (random.sample) both draw from it.
random.seed(42)
# NumPy's global generator is seeded as well, although nothing in the visible
# code draws from np.random.
np.random.seed(42)

# Input tables are read from the current working directory.
ratings = pd.read_csv("rating.csv")
# NOTE(review): `movies` is not referenced anywhere in the visible code —
# confirm it is needed before paying for the load.
movies = pd.read_csv("movie.csv")

# Map each userId to the set of movieId values that occur with it in `ratings`.
user_movie_sets = ratings.groupby('userId')['movieId'].apply(set).to_dict()

def generate_hash_functions(num_hashes, max_val, hash_type='linear'):
    """Build a list of randomly parameterised integer hash functions.

    Coefficients are drawn from the module-level ``random`` generator, so
    seeding ``random`` beforehand makes the result reproducible. For each
    function the draw order is ``a``, ``b`` and then (where applicable) ``m``
    or ``c``.

    Args:
        num_hashes: Number of hash functions to generate.
        max_val: Upper bound for the random coefficients; also determines the
            modulus of every hash family (see below).
        hash_type: Hash family to use:
            ``'linear'``    -> ``(a * x + b) % (max_val + 1)``
            ``'modular'``   -> ``((a * x % m) + b) % max_val``
            ``'universal'`` -> ``((a * x + b) ^ c) % (max_val + 1)``

    Returns:
        A list of ``num_hashes`` callables, each taking one integer ``x``.

    Raises:
        ValueError: If ``hash_type`` is not one of the three names above.
            ``random.randint`` also raises ``ValueError`` when a function is
            requested with ``max_val < 1``.
    """
    if hash_type not in ('linear', 'modular', 'universal'):
        # Returning an empty list here would silently yield empty signatures
        # (and NaN similarity estimates) further down the pipeline.
        raise ValueError(
            f"Unknown hash_type {hash_type!r}; "
            "expected 'linear', 'modular' or 'universal'"
        )

    hash_functions = []
    for _ in range(num_hashes):
        a = random.randint(1, max_val)
        b = random.randint(0, max_val)
        # Parameters are bound as lambda defaults so each function keeps its
        # own values instead of the ones from the last loop iteration.
        if hash_type == 'linear':
            hash_functions.append(
                lambda x, a=a, b=b, p=max_val + 1: (a * x + b) % p
            )
        elif hash_type == 'modular':
            # Lower bound is clamped to 1: with max_val == 1 the unclamped
            # range starts at 0 and a drawn m == 0 would divide by zero when
            # the hash is called. For max_val >= 2 the range is unchanged.
            m = random.randint(max(1, max_val // 2), max_val)
            hash_functions.append(
                lambda x, a=a, b=b, m=m, n=max_val: ((a * x % m) + b) % n
            )
        else:  # 'universal'
            c = random.randint(0, max_val)
            hash_functions.append(
                lambda x, a=a, b=b, c=c, p=max_val + 1: ((a * x + b) ^ c) % p
            )

    return hash_functions

def compute_minhash_signature_chunk(user_sets, hash_functions):
    """Compute the MinHash signature matrix for a group of users.

    Args:
        user_sets: Mapping from user key to the collection of item ids
            associated with that user.
        hash_functions: Sequence of one-argument callables applied to every
            item id.

    Returns:
        A tuple ``(signature_matrix, user_list)``. ``signature_matrix`` is a
        float array of shape ``(len(hash_functions), len(user_sets))`` whose
        entry ``[h, u]`` is the minimum of hash ``h`` over the items of user
        ``u``. ``user_list`` holds the user keys in column order.

        A user with no items keeps ``np.inf`` in every row, so two such users
        have identical signature columns.
    """
    user_list = list(user_sets.keys())
    signature_matrix = np.full((len(hash_functions), len(user_list)), np.inf)

    for user_idx, user in enumerate(user_list):
        movies = user_sets[user]
        if not movies:
            # Nothing to hash: leave the column at +inf instead of failing.
            continue
        for hash_idx, hash_func in enumerate(hash_functions):
            signature_matrix[hash_idx, user_idx] = min(map(hash_func, movies))

    return signature_matrix, user_list

def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets.

    The result is ``len(set1 & set2) / len(set1 | set2)``; when the union is
    empty (both sets empty) the function returns ``0``.
    """
    union_size = len(set1 | set2)
    if union_size > 0:
        return len(set1 & set2) / union_size
    return 0

def evaluate_minhash_accuracy(signature_matrix, user_list, user_movie_sets, sample_size=100, thresholds=None):
    """Compare MinHash similarity estimates with exact Jaccard similarity.

    A random sample of user pairs is drawn with ``random.sample``. For each
    pair the exact Jaccard similarity of the two item sets is compared with
    the fraction of signature rows in which the two users' columns are equal.

    Args:
        signature_matrix: Array with one row per hash function and one column
            per entry of ``user_list``.
        user_list: User keys in the column order of ``signature_matrix``.
        user_movie_sets: Mapping from user key to that user's set of items.
        sample_size: Number of distinct pairs to evaluate. If fewer pairs
            exist, all of them are evaluated.
        thresholds: Similarity cut-offs to count; defaults to
            ``[0.2, 0.4, 0.6, 0.8]``.

    Returns:
        A tuple ``(mean_abs_error, threshold_counts)`` where
        ``threshold_counts[t]`` is the number of sampled pairs whose exact
        Jaccard similarity is ``>= t``. When no pair can be sampled the error
        is ``nan`` and all counts are zero.
    """
    if thresholds is None:
        thresholds = [0.2, 0.4, 0.6, 0.8]

    all_pairs = list(itertools.combinations(range(len(user_list)), 2))
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request. When sample_size already fits,
    # the call (and therefore the seeded random stream) is unchanged.
    sampled_pairs = random.sample(all_pairs, min(sample_size, len(all_pairs)))

    threshold_counts = {threshold: 0 for threshold in thresholds}
    if not sampled_pairs:
        # Avoid np.mean([]) and its RuntimeWarning; the result is nan anyway.
        return np.nan, threshold_counts

    errors = []
    for user_idx1, user_idx2 in sampled_pairs:
        user1, user2 = user_list[user_idx1], user_list[user_idx2]
        real_jaccard = jaccard_similarity(user_movie_sets[user1], user_movie_sets[user2])
        # Fraction of hash functions on which both users share the same minimum.
        estimated_jaccard = np.mean(signature_matrix[:, user_idx1] == signature_matrix[:, user_idx2])

        errors.append(abs(real_jaccard - estimated_jaccard))

        for threshold in thresholds:
            if real_jaccard >= threshold:
                threshold_counts[threshold] += 1

    return np.mean(errors), threshold_counts

def run_minhash_experiments_chunks(user_movie_sets, hash_types, num_hashes_list, chunk_size=1000, sample_size=100, thresholds=None):
    """Run the MinHash accuracy experiment for every hash type / size combination.

    Users are processed in chunks of ``chunk_size``. For each chunk a
    signature matrix is built and ``sample_size`` user pairs from that chunk
    are evaluated, so pairs are only ever formed between users of the same
    chunk. The same set of hash functions is reused for all chunks of a given
    configuration.

    Args:
        user_movie_sets: Mapping from user key to that user's set of item ids.
        hash_types: Iterable of hash family names accepted by
            ``generate_hash_functions``.
        num_hashes_list: Iterable of signature lengths to try.
        chunk_size: Number of users per chunk.
        sample_size: Number of user pairs evaluated per chunk.
        thresholds: Jaccard cut-offs to count; defaults to
            ``[0.2, 0.4, 0.6, 0.8]``.

    Returns:
        A list with one dict per configuration, holding ``'hash_type'``,
        ``'num_hashes'``, ``'global_mean_error'`` (the unweighted mean of the
        per-chunk mean errors) and ``'global_threshold_counts'`` (per-threshold
        counts summed over all chunks).

    Raises:
        ValueError: If ``user_movie_sets`` is empty or contains an empty set
            (raised by ``max``).
    """
    if thresholds is None:
        # The signature advertises None as the default, so it must be usable:
        # fall back to the same cut-offs evaluate_minhash_accuracy uses.
        thresholds = [0.2, 0.4, 0.6, 0.8]

    max_movie_id = max(max(user_set) for user_set in user_movie_sets.values())
    user_ids = list(user_movie_sets.keys())
    experiment_results = []

    total_users = len(user_ids)
    print(f"Total number of users processed: {total_users}")

    for hash_type in hash_types:
        for num_hashes in num_hashes_list:
            hash_functions = generate_hash_functions(num_hashes, max_movie_id, hash_type)
            global_errors = []
            global_threshold_counts = {threshold: 0 for threshold in thresholds}

            for chunk_start in range(0, total_users, chunk_size):
                chunk_end = min(chunk_start + chunk_size, total_users)
                user_chunk_ids = user_ids[chunk_start:chunk_end]
                user_chunk_sets = {user: user_movie_sets[user] for user in user_chunk_ids}

                signature_matrix, user_list = compute_minhash_signature_chunk(user_chunk_sets, hash_functions)

                mean_error, chunk_threshold_counts = evaluate_minhash_accuracy(
                    signature_matrix, user_list, user_chunk_sets, sample_size, thresholds
                )
                global_errors.append(mean_error)

                for threshold in thresholds:
                    global_threshold_counts[threshold] += chunk_threshold_counts[threshold]

            global_mean_error = np.mean(global_errors)
            experiment_results.append({
                'hash_type': hash_type,
                'num_hashes': num_hashes,
                'global_mean_error': global_mean_error,
                'global_threshold_counts': global_threshold_counts
            })

    return experiment_results

# Experiment grid: every hash family is combined with every signature length.
hash_types = ['linear', 'modular', 'universal']
num_hashes_list = [50, 100, 200]
chunk_size = 1000  # users per chunk
sample_size = 30000  # user pairs sampled inside each chunk
# Jaccard cut-offs; a sampled pair is counted for each cut-off it reaches (>=).
thresholds = [0.2, 0.4, 0.6, 0.8]

results = run_minhash_experiments_chunks(user_movie_sets, hash_types, num_hashes_list, chunk_size, sample_size, thresholds)

# One summary line per configuration, followed by its per-threshold pair
# counts (summed over all chunks).
print("\n--- Aggregate Final Results ---")
for result in results:
    print(f"Hash Type: {result['hash_type']}, Num Hashes: {result['num_hashes']}, "
          f"Global Mean Error: {result['global_mean_error']:.4f}")
    for threshold, count in result['global_threshold_counts'].items():
        print(f"  Pairs with Jaccard similarity >= {threshold}: {count}")

Total number of users processed: 138493

--- Aggregate Final Results ---
Hash Type: linear, Num Hashes: 50, Global Mean Error: 0.0189
  Pairs with Jaccard similarity >= 0.2: 97690
  Pairs with Jaccard similarity >= 0.4: 8277
  Pairs with Jaccard similarity >= 0.6: 509
  Pairs with Jaccard similarity >= 0.8: 14
Hash Type: linear, Num Hashes: 100, Global Mean Error: 0.0139
  Pairs with Jaccard similarity >= 0.2: 97678
  Pairs with Jaccard similarity >= 0.4: 8263
  Pairs with Jaccard similarity >= 0.6: 473
  Pairs with Jaccard similarity >= 0.8: 9
Hash Type: linear, Num Hashes: 200, Global Mean Error: 0.0094
  Pairs with Jaccard similarity >= 0.2: 97043
  Pairs with Jaccard similarity >= 0.4: 8281
  Pairs with Jaccard similarity >= 0.6: 458
  Pairs with Jaccard similarity >= 0.8: 11
Hash Type: modular, Num Hashes: 50, Global Mean Error: 0.0193
  Pairs with Jaccard similarity >= 0.2: 97380
  Pairs with Jaccard similarity >= 0.4: 8067
  Pairs with Jaccard similarity >= 0.6: 457
  Pairs with

# Report on MinHash Experiment Results

The experiment aimed to evaluate the effectiveness of MinHash signatures in estimating Jaccard similarity between pairs of users in a movie ratings dataset. To achieve this, three different types of hash functions—linear, modular, and universal—were combined with three configurations for the number of hash functions (50, 100, 200). The approach also utilized chunking to process the large dataset efficiently. The analysis focused on two main aspects: the calculation of the global mean error relative to the actual Jaccard similarity and the count of user pairs exceeding specific similarity thresholds (0.2, 0.4, 0.6, 0.8).

## Key Findings

The results show that the global mean error decreases as the number of hash functions increases. For instance, with the linear hash type, the mean error drops from 0.0189 using 50 hash functions to 0.0094 with 200 hash functions. This behavior confirms that a larger number of hash functions improves the precision of Jaccard similarity estimates, reducing variability in the results. Similar trends were observed for the modular and universal hash types, although the latter generally exhibited higher mean errors, especially with fewer hash functions.

An interesting aspect highlighted by the analysis is the distribution of sampled pairs that reach each similarity threshold. The counts are totals over all chunks: with 138,493 users in chunks of 1,000 there are 139 chunks, and 30,000 pairs are sampled in each, so about 4.17 million pairs are evaluated per configuration. At the 0.2 threshold, the count exceeds 97,000 pairs in most configurations, which is roughly 2.3% of the sampled pairs. The number of pairs decreases drastically as the threshold increases: at the 0.4 threshold the count drops to around 8,000 pairs, at 0.6 to roughly 500, and at 0.8 to only about 10. This pattern reflects the nature of the dataset: users sharing highly similar tastes are rare, while most pairs share only a limited fraction of rated movies.

The comparison among the three hash types revealed that linear hash functions are the most reliable in terms of stability and accuracy, with lower mean errors and more consistent results across the similarity thresholds. The modular hash type showed comparable performance, with a slight drop in precision for some configurations. The universal hash type, however, was less consistent, showing greater variability in results and generally higher mean errors, making it less suitable for contexts requiring precise estimates.

## Observations on the Methodology

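The use of chunking allowed for efficient processing of the large dataset, avoiding memory issues while maintaining result accuracy. Additionally, the random sampling of pairs for error calculation and threshold counts reduced computational costs while drawing the same number of pairs from every chunk, giving a balanced representation across chunks. Note that pairs are sampled only within each chunk of 1,000 users, so users belonging to different chunks are never compared with each other; the reported errors and counts therefore describe within-chunk pairs.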

The analysis confirmed that the number of hash functions significantly impacts accuracy: configurations with 100 or 200 hash functions represent a good balance between precision and computational cost. Moreover, the introduction of thresholds provided valuable insights into the similarity distribution within the dataset, highlighting that highly similar tastes are rare.

## Conclusion

In conclusion, the linear hash type with 100 or 200 hash functions emerges as the optimal choice, offering the best trade-off between accuracy and performance. The results also emphasize that most user pairs in the dataset exhibit limited similarity, with only a few pairs sharing highly similar tastes. This reflects the diversity in user ratings and provides useful insights for optimizing similarity-based recommendation systems. The approach proved robust and scalable, representing a valid methodology for future analyses on large-scale datasets.