In [79]:
%run Shingling.ipynb
%run CompareSets.ipynb
%run MinHashing.ipynb
%run LSH.ipynb

In [84]:
import os
import math
import time
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row

from functools import partial

class TextualSimilarity:
    """Pipeline that looks for similar text documents inside a directory.

    The documents are read eagerly in the constructor; `execute` then runs
    shingling, min-hashing, LSH candidate selection and the final comparison,
    and prints the resulting similarities together with the elapsed time.
    """

    def __init__(self, directory, k_shingle, k_sign, band, r, threshold):
        """Load the documents and store the pipeline parameters.

        Args:
            directory: Path of the directory whose files are the documents.
            k_shingle: Value handed to `Shingling` in `execute`.
            k_sign: Value handed to `MinHashing` as `k` in `execute`.
            band: First argument handed to `LSH` in `execute`.
            r: Second argument handed to `LSH` in `execute`.
            threshold: Value handed to `CompareSets` in `execute`.
        """
        # Must exist before _get_docs runs: _get_docs fills this mapping
        # (document ID -> filename) as a side effect.
        self.docs_names = dict()
        self.docs = self._get_docs(directory)
        self.k_shingle = k_shingle
        self.k_sign = k_sign
        self.band = band
        self.r = r
        self.threshold = threshold

    def _get_docs(self, directory):
        """Read every regular file in `directory` as one string.

        Newlines are removed from each document's content. As a side effect,
        `self.docs_names` is populated with the ID -> filename mapping.

        Returns:
            A list of `(doc_id, content)` tuples; IDs are consecutive integers
            starting at 0, assigned in sorted-filename order.
        """
        docs = []
        # os.listdir yields entries in arbitrary order; sorting keeps the
        # document IDs stable from one run to the next.
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
            # cannot be opened as text files.
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as myfile:
                doc = myfile.read().replace('\n', '')
            index = len(docs)
            docs.append((index, doc))
            self.docs_names[index] = filename
        return docs

    def _print_results(self, similarities):
        """Print one line per document pair, ordered by ascending similarity.

        Args:
            similarities: Mapping from a pair of document IDs to the
                similarity value of that pair.
        """
        ranked = sorted(similarities.items(), key=lambda item: item[1])
        lines = [
            "Similarity for documents {} and {}: {}\n".format(
                self.docs_names[pair[0]],
                self.docs_names[pair[1]],
                str(similarity),
            )
            for pair, similarity in ranked
        ]
        # Each line already ends with a newline; print adds the final blank line.
        print("".join(lines))

    def execute(self, sc):
        """Run the whole pipeline and print the results and the elapsed time.

        Args:
            sc: Spark context used to parallelize the documents and the
                signature bands.
        """
        # Measure execution time with a monotonic clock
        start_time = time.perf_counter()

        # Generate RDD from the documents
        docs = sc.parallelize(self.docs)

        # Execute Shingling for each document
        shingling = Shingling(self.k_shingle)
        shinglings = docs.map(shingling.shingling)

        # Create signatures from the shinglings and bring them to the driver
        min_hashing = MinHashing(k = self.k_sign)
        signatures = shinglings.map(min_hashing.signature).values().collect()

        # Create LSH object to get candidate pairs: the stacked signatures are
        # split column-wise into lsh.b equal parts, one per band
        lsh = LSH(self.band, self.r)
        split = np.hsplit(np.stack(signatures), lsh.b)
        pairs = sc.parallelize(split).map(lsh.pairs)
        candidates = lsh.candidates(pairs.collect())

        # Calculate the similarity of the candidate pairs from the signatures
        jaccard = CompareSets(signatures, self.threshold)
        similarities = jaccard.calculate(candidates)

        # Printing the results and execution time
        self._print_results(similarities)
        print("Execution time: " + str(time.perf_counter() - start_time) + "s")

In [85]:
# Create (or reuse) a local Spark session that uses all available cores
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('IPDE')
    .getOrCreate()
)
sc = spark.sparkContext

# Run the similarity pipeline on the documents under ../data/
similarity = TextualSimilarity(
    directory="../data/",
    k_shingle=5,
    k_sign=100,
    band=50,
    r=2,
    threshold=0.4,
)
similarity.execute(sc)

Similarity for documents b.txt and c.txt: 0.5390625
Similarity for documents c.txt and a.txt: 0.5555555555555556
Similarity for documents c.txt and d.txt: 0.5555555555555556
Similarity for documents b.txt and a.txt: 0.97
Similarity for documents b.txt and d.txt: 0.97
Similarity for documents a.txt and d.txt: 1.0

Execution time: 0.9812185764312744s
