In [14]:
# Algorithms to calculate Jaccard similarity, MinHash and locality-sensitive hashing.
import pandas as pd
import binascii
import random


# num of reviews to analyse
numReviews = 1000

# Load the airline reviews.
# Only the text of the first `numReviews` reviews is used, so ask pandas to
# parse just the "content" column and just that many rows instead of reading
# the whole file into memory and discarding most of it afterwards.
reviews = pd.read_csv(
    "data/skytrax-reviews-dataset-master/data/airline.csv",
    usecols=["content"],
    nrows=numReviews,
)["content"]



In [15]:
################
# Shingling
################

# number of characters in each shingle
k_shingle = 10

# review position -> set of unique 32-bit hashes of that review's shingles
allShingleSets = {}

for index, review in enumerate(reviews):
    # start offset of the last full-length shingle; negative when the review
    # is shorter than k_shingle, which leaves the set empty
    lastStart = len(review) - k_shingle

    # CRC32 of every k_shingle-character window, masked to an unsigned 32-bit
    # value; the set keeps each hash only once
    allShingleSets[index] = {
        binascii.crc32(review[start:start + k_shingle].encode()) & 0xffffffff
        for start in range(lastStart + 1)
    }

In [16]:
#####################
# Jaccard Similarity
#####################
# (i, j) with i < j  ->  exact Jaccard similarity of the two shingle sets
jaccardSimilarities = {}
for i, set1 in allShingleSets.items():
    for j in range(i + 1, len(allShingleSets)):
        set2 = allShingleSets[j]

        # |A ∩ B| / |A ∪ B|, with |A ∪ B| = |A| + |B| - |A ∩ B| so no
        # throwaway union set has to be built for every pair
        sharedCount = len(set1 & set2)
        unionCount = len(set1) + len(set2) - sharedCount

        # Two reviews shorter than the shingle length both have empty sets;
        # the ratio is undefined there, so report 0.0 (no evidence of
        # similarity) instead of raising ZeroDivisionError.
        jaccardSimilarities[(i, j)] = sharedCount / unionCount if unionCount else 0.0

In [25]:

# Print every review pair whose exact Jaccard similarity is at least 0.4,
# followed by the text of both reviews.
for pair, similarity in jaccardSimilarities.items():
    if similarity >= 0.4:
        firstReview, secondReview = pair
        print(pair, similarity)
        # one call emits: first text, separator line, second text, blank line
        print(reviews[firstReview], "--------", reviews[secondReview], "", sep="\n")

(222, 236) 1.0
I flew from Chicago O'Hare to Dublin and from Dublin to Amsterdam and Amsterdam back to Dublin and Dublin back to O'Hare and I must say I was pleased with the airline. The food was good the entertainment was good and the leg room on the flights was great. Only issue I did have was from Amsterdam to Dublin because we arrived 30 minutes left and nearly missed our connecting flight from Dublin to O'Hare however we made it on time with all of our belongings. Will definitely use Aer Lingus again on any future trips to Ireland.
--------
I flew from Chicago O'Hare to Dublin and from Dublin to Amsterdam and Amsterdam back to Dublin and Dublin back to O'Hare and I must say I was pleased with the airline. The food was good the entertainment was good and the leg room on the flights was great. Only issue I did have was from Amsterdam to Dublin because we arrived 30 minutes left and nearly missed our connecting flight from Dublin to O'Hare however we made it on time with all of our b

In [22]:
################
# MinHash
################
numHashes = 10
maxShingle = 2**32 - 1

# first prime number above the 32-bit shingle hash range, used as the modulus
maxPrime = 4294967311

# Dedicated, seeded generator: the coefficients (and therefore the signatures
# and the estimated similarities) are identical on every Restart & Run All,
# and the global `random` state is left untouched.
minhashSeed = 42
coeffRng = random.Random(minhashSeed)


# hash function takes form h(x) = (a*x + b) mod c
# a and b are random coefficients generated by the function below,
# c is equal to the first prime number outside our hash bounds (maxPrime)
def randomCoefficients(numHashes, minValue=0, maxValue=None):
    """Return `numHashes` distinct random integers in [minValue, maxValue].

    Parameters:
        numHashes: how many coefficients to draw.
        minValue:  smallest allowed coefficient (default 0). Pass 1 for the
                   multiplier `a`: with a == 0 the hash collapses to the
                   constant b and every document would "match" on it.
        maxValue:  largest allowed coefficient; defaults to `maxShingle`.

    Returns:
        A list of `numHashes` unique integers.

    Raises:
        ValueError: if `numHashes` is negative or larger than the number of
                    integers in the range.
    """
    if maxValue is None:
        maxValue = maxShingle

    # sampling without replacement guarantees uniqueness directly, replacing
    # the retry loop with its linear `in coeffList` scan
    return coeffRng.sample(range(minValue, maxValue + 1), numHashes)


# multipliers must be non-zero; offsets may be zero
coeffA = randomCoefficients(numHashes, minValue=1)
coeffB = randomCoefficients(numHashes)

# MinHash signature per review, in the iteration order of allShingleSets.
# Entry h of a signature is the smallest value of
# (coeffA[h] * shingle + coeffB[h]) % maxPrime over the review's shingles;
# a review with no shingles gets the sentinel maxPrime + 1 in every entry.
signatures = [
    [
        min(
            ((coeffA[h] * shingle + coeffB[h]) % maxPrime for shingle in shingleSet),
            default=maxPrime + 1,
        )
        for h in range(numHashes)
    ]
    for shingleSet in allShingleSets.values()
]

    

In [23]:
#####################
# Compare signatures
#####################

# (i, j) with i < j  ->  estimated Jaccard similarity from the signatures
est_jacc = {}

# Iterate over the signatures that actually exist rather than the requested
# numReviews: if the data set holds fewer rows than requested, indexing up to
# numReviews would raise IndexError.
numSignatures = len(signatures)

# evaluate each signature with another
for i in range(numSignatures):
    sig1 = signatures[i]

    for j in range(i + 1, numSignatures):
        sig2 = signatures[j]

        # number of hash functions on which both reviews have the same minimum
        matches = sum(1 for k in range(numHashes) if sig1[k] == sig2[k])

        # the fraction of matching positions estimates the Jaccard similarity
        est_jacc[(i, j)] = matches / numHashes
                
        

In [26]:
# Print every review pair whose MinHash-estimated similarity is at least 0.4,
# followed by the text of both reviews.
for pair, estimate in est_jacc.items():
    if estimate >= 0.4:
        firstReview, secondReview = pair
        print(pair, estimate)
        # one call emits: first text, separator line, second text, blank line
        print(reviews[firstReview], "--------", reviews[secondReview], "", sep="\n")

(66, 927) 0.4
Multiple trip with AE between SKG-FRA and MUC. Aircraft's were always clean service was fine food above average and quite good compared to other regular European airlines and the staff was always courteous and friendly. There were only couple times out of the last maybe 30 round trips the flight was delayed and that happened summertime when the European skies are congested. I have great respect for this airline and how nicely developed to become one of the best regional airlines of Europe.
--------
Mauritius to Reunion return. Onward flight was on a Boeing 737-800 with only a 20% load so we had enough space during that 20 mins flight. Service was a cookie and juice which was nice. Return flight was on a ATR 72 aircraft with the same service. Crew was friendly and helpful. Flights and bags where on time. Would fly them again even ticket price is quite high.

(182, 192) 0.5
Athens - Brussels - Athens. Nice service friendly staff clean and comfortable seats. Good food on tim

In [15]:
# ########
# # LSH
# ########
# bands = 10
# rows = 5

# lsh = []
# for band in range(0, bands):
#     lsh.append(hash(signatures[band*rows:band*rows+rows]))

10