In [27]:
# Execute the helper notebooks in this kernel so the names they define become
# available here. The TextualSimilarity class below refers to Shingling,
# CompareSets, MinHashing, CompareSignatures and LSH without importing them,
# so this cell has to run first on a fresh kernel.
# NOTE(review): presumably each notebook defines the class named after its
# file -- confirm against the helper notebooks.
%run Shingling.ipynb
%run CompareSets.ipynb
%run MinHashing.ipynb
%run CompareSignatures.ipynb
%run LSH.ipynb

In [28]:
import os
import math
import time
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row

from functools import partial

class TextualSimilarity:
    """Report similar text documents in a directory via shingling, MinHash and LSH.

    All documents are read when the object is created. ``execute`` then runs
    the pipeline on a Spark context and prints every pair whose final
    similarity reaches ``threshold``.

    Parameters
    ----------
    directory : str
        Folder whose regular files are loaded as documents.
    k_shingle : int
        Forwarded to ``Shingling`` (presumably the shingle length -- confirm
        against Shingling.ipynb).
    k_sign : int
        Forwarded to ``MinHashing`` as ``k`` (presumably the signature
        length -- confirm against MinHashing.ipynb).
    band, r : int
        Forwarded to ``LSH(band, r)``.
    threshold : float
        Minimum similarity a candidate pair needs in order to be printed.
    """

    def __init__(self, directory, k_shingle, k_sign, band, r, threshold):
        # docs_names must exist before _get_docs runs, because _get_docs fills it.
        self.docs_names = dict()
        self.docs = self._get_docs(directory)
        self.k_shingle = k_shingle
        self.k_sign = k_sign
        self.band = band
        self.r = r
        self.threshold = threshold


    def _get_docs(self, directory):
        """Read every regular file in ``directory`` into memory.

        Newlines are stripped from each document. As a side effect the
        ``self.docs_names`` mapping (ID -> filename) is filled in.

        Returns
        -------
        list of (int, str)
            ``(ID, text)`` tuples; the ID equals the tuple's position in the
            list, which ``_print_results`` relies on.
        """
        docs = []
        # os.listdir returns entries in arbitrary order; sorting makes the
        # ID assignment reproducible between runs and machines.
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints):
            # open() cannot read them and would abort the whole load.
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as myfile:
                doc = myfile.read().replace('\n', '')
            index = len(docs)
            docs.append((index, doc))
            self.docs_names[index] = filename
        return docs


    def _print_results(self, result):
        """Print one report entry per ``((doc1, doc2), similarity)`` item.

        Each entry names both documents, shows their similarity and echoes
        both document texts. An empty ``result`` prints a single blank line.
        """
        parts = []
        for candidate in result:
            # Each candidate is ((id of doc 1, id of doc 2), similarity)
            doc1 = candidate[0][0]
            doc2 = candidate[0][1]
            similarity = str(candidate[1])

            parts.append("Similarity for documents " + self.docs_names[doc1])
            parts.append(" and " + self.docs_names[doc2])
            parts.append(": " + similarity + "\n")
            parts.append("\t- " + self.docs[doc1][1] + "\n")
            parts.append("\t- " + self.docs[doc2][1] + "\n")
        print("".join(parts))


    def _similar_pairs(self, sc):
        """Run the pipeline on ``sc`` and return the collected results.

        Returns a list of ``((doc1, doc2), similarity)`` items whose
        similarity is at least ``self.threshold``.
        """
        # Generate RDD from the documents
        docs = sc.parallelize(self.docs)

        # Execute Shingling for each document
        shingling = Shingling(self.k_shingle)
        shinglings = docs.map(shingling.shingling)

        # Create signatures from the shinglings; collect() keeps the document
        # order, so signatures[i] belongs to the document with ID i.
        min_hashing = MinHashing(k = self.k_sign)
        signatures = shinglings.map(min_hashing.signature).values().collect()

        # Split the signature matrix column-wise into lsh.b groups and let
        # LSH propose candidate pairs from each group.
        lsh = LSH(self.band, self.r)
        split = np.hsplit(np.stack(signatures), lsh.b)
        pairs = sc.parallelize(split).flatMap(lsh.pairs).distinct()

        # Compare the signatures of the selected pairs
        comparer = CompareSignatures(signatures)
        candidates = pairs.filter(lambda x: comparer.compare(x) >= lsh.t)

        # NOTE(review): CompareSets receives the MinHash signatures, not the
        # shingle sets, so the final score is computed on signature values --
        # confirm this is intended.
        jaccard = CompareSets(signatures)
        # Bind the threshold to a local so the closure shipped to the workers
        # does not capture `self` (and with it every document text).
        threshold = self.threshold
        similarities = candidates.map(jaccard.calculate).filter(lambda x: x[1] >= threshold)
        return similarities.collect()


    def execute(self, sc):
        """Run the similarity search on Spark context ``sc`` and print a report.

        Pairs are printed from most to least similar (ties ordered by
        document IDs), followed by the elapsed time in seconds.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # An empty directory yields no signatures and np.stack would raise,
        # so skip the pipeline when there is nothing to compare.
        results = self._similar_pairs(sc) if self.docs else []

        # Most similar first; document IDs break ties deterministically.
        results.sort(key=lambda item: (-item[1], item[0][0], item[0][1]))

        self._print_results(results)
        print("Execution time: " + str(time.perf_counter() - start_time) + "s")

In [29]:
# Obtain (or reuse) a Spark session running locally with master 'local[*]'
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('IPDE')
    .getOrCreate()
)
sc = spark.sparkContext

# Run the similarity search over the documents in ../data/
similarity = TextualSimilarity(
    directory="../data/",
    k_shingle=5,
    k_sign=100,
    band=20,
    r=5,
    threshold=0.5,
)
similarity.execute(sc)

Similarity for documents reviewcy and reviewck: 0.5952380952380952
	-  the sound quality is very good .
	-  It is compact and the sound quality is very good .

Execution time: 0.9237532615661621s
