#### Notebook for testing purposes

##### Imports and global parameters

In [1]:
import configparser  # for reading the parameters file
import sys  # for system errors and printouts
from pathlib import Path  # for paths of files
import os  # for reading the input data
import time  # for timing
import pandas as pd
import random 

In [2]:
# Global parameters
# NOTE: the setup cell under "Stuff to do before running own methods" assigns
# these same four names again, which resets both dictionaries to empty.
parameter_file = 'default_parameters.ini'  # the main parameters file
# the main path where all the data directories are
data_main_directory = Path('data')
# dictionary that holds the input parameters, key = parameter name, value = value
parameters_dictionary = dict()
# dictionary of the input documents, key = document id, value = the document
document_list = dict()

#### Pre-implemented methods

In [3]:

# DO NOT CHANGE THIS METHOD
# Reads the parameters of the project from the parameter file 'file'
# and stores them to the parameter dictionary 'parameters_dictionary'
def read_parameters():
    """Load every option of ``parameter_file`` into ``parameters_dictionary``.

    All sections of the INI file are flattened into the one dictionary, so a
    key that occurs in several sections keeps the value of the last section
    read. Values are converted according to the key name: ``data`` stays a
    string, ``naive`` goes through ``bool()``, ``t`` becomes a float and every
    other key is converted with ``int()`` (a non-numeric value therefore
    raises ``ValueError``).
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() ignores files it cannot open, so a missing parameter
    # file leaves parameters_dictionary untouched instead of raising.
    config.read(parameter_file)
    for section in config.sections():
        for key in config[section]:
            if key == 'data':
                parameters_dictionary[key] = config[section][key]
            elif key == 'naive':
                # NOTE(review): bool() of any non-empty string is True, so a
                # value such as "False" or "0" in the file still yields True.
                parameters_dictionary[key] = bool(config[section][key])
            elif key == 't':
                parameters_dictionary[key] = float(config[section][key])
            else:
                parameters_dictionary[key] = int(config[section][key])


# DO NOT CHANGE THIS METHOD
# Reads all the documents in the 'data_path' and stores them in the dictionary 'document_list'
def read_data(data_path):
    """Read the files found under ``data_path`` into ``document_list``.

    Each file's text is stripped, its newlines are replaced by single spaces,
    and the result is stored under the integer value of the file name without
    its extension, so every file stem must parse as an int.

    :param data_path: directory to read; it is joined with ``/``, so it has to
        be a ``pathlib.Path``-like object rather than a plain string.
    """
    for (root, dirs, file) in os.walk(data_path):
        for f in file:
            # NOTE(review): the path is built from data_path rather than from
            # `root`, so a file inside a sub-directory is looked up directly
            # under data_path.
            file_path = data_path / f
            # NOTE(review): the file object is never closed explicitly.
            doc = open(file_path).read().strip().replace('\n', ' ')
            file_id = int(file_path.stem)
            document_list[file_id] = doc


# DO NOT CHANGE THIS METHOD
# Calculates the Jaccard Similarity between two documents represented as sets
def jaccard(doc1, doc2):
    """Return |doc1 ∩ doc2| / |doc1 ∪ doc2| as a float.

    :param doc1: first document as a set.
    :param doc2: second document as a set.
    :raises ZeroDivisionError: when both sets are empty (the union is empty).
    """
    return len(doc1.intersection(doc2)) / float(len(doc1.union(doc2)))


# DO NOT CHANGE THIS METHOD
# Define a function to map a 2D matrix coordinate into a 1D index.
def get_triangle_index(i, j, length):
    """Return the position of the pair (i, j) in a flat triangular array.

    The order of ``i`` and ``j`` does not matter; the smaller one is used as
    the row. For example, with ``length == 5`` the pair (0, 1) maps to 0 and
    the pair (3, 4) maps to 9.

    :param i: 0-based index of one element.
    :param j: 0-based index of the other element; must differ from ``i``.
    :param length: number of elements the triangular array was built for.
    :return: 0-based index into the flat array.

    When ``i == j`` a message is written to stderr and the process exits with
    status 1.
    """
    if i == j:  # that's an error.
        sys.stderr.write("Can't access triangle matrix with i == j")
        sys.exit(1)
    if j < i:  # just swap the values.
        temp = i
        i = j
        j = temp

    # Calculate the index within the triangular array. Taken from pg. 211 of:
    # http://infolab.stanford.edu/~ullman/mmds/ch6.pdf
    # adapted for a 0-based index.
    k = int(i * (length - (i + 1) / 2.0) + j - i) - 1

    return k


# DO NOT CHANGE THIS METHOD
# Calculates the similarities of all the combinations of documents and returns the similarity triangular matrix
def naive():
    """Compute the Jaccard similarity of every pair of documents.

    Each document in ``document_list`` is turned into the set of its
    whitespace-separated words. Documents are addressed by their position in
    ``document_list`` iteration order, not by their id.

    :return: flat list with one similarity per unordered pair; the entry for
        positions (i, j) is found at ``get_triangle_index(i, j, n)`` where
        ``n`` is the number of documents.
    """
    docs_Sets = []  # holds the set of words of each document

    for doc in document_list.values():
        docs_Sets.append(set(doc.split()))

    # Using triangular array to store the similarities, avoiding half size and similarities of i==j
    # n documents give n * (n - 1) / 2 unordered pairs.
    num_elems = int(len(docs_Sets) * (len(docs_Sets) - 1) / 2)
    similarity_matrix = [0 for x in range(num_elems)]
    for i in range(len(docs_Sets)):
        for j in range(i + 1, len(docs_Sets)):
            similarity_matrix[get_triangle_index(i, j, len(docs_Sets))] = jaccard(
                docs_Sets[i], docs_Sets[j])

    return similarity_matrix

##### Stuff to do before running own methods:

In [27]:
# Global parameters
# Re-declared here so that re-running this cell starts from empty dictionaries.
parameter_file = 'default_parameters.ini'  # the main parameters file
# the main path where all the data directories are
data_main_directory = Path('data')
# dictionary that holds the input parameters, key = parameter name, value = value
parameters_dictionary = dict()
# dictionary of the input documents, key = document id, value = the document
document_list = dict()

# Fill parameters_dictionary from the parameter file.
read_parameters()
#print(parameters_dictionary['data'])

# Reading the data
print("Data reading...")
# The 'data' parameter names the sub-directory of data_main_directory to read.
data_folder = data_main_directory / parameters_dictionary['data']
t0 = time.time()
read_data(data_folder)
# Rebuild the dictionary in ascending document-id order, so that code which
# iterates document_list positionally (e.g. naive()) sees documents by id.
document_list = {k: document_list[k] for k in sorted(document_list)}
t1 = time.time()
print(len(document_list), "documents were read in", t1 - t0, "sec\n")

# Naive
# Stays an empty list when the 'naive' parameter is falsy.
naive_similarity_matrix = []
if parameters_dictionary['naive']:
    print("Starting to calculate the similarities of documents...")
    t2 = time.time()
    naive_similarity_matrix = naive()
    t3 = time.time()
    print("Calculating the similarities of", len(naive_similarity_matrix),
            "combinations of documents took", t3 - t2, "sec\n")


Data reading...
5 documents were read in 0.0018680095672607422 sec

Starting to calculate the similarities of documents...
Calculating the similarities of 10 combinations of documents took 0.0 sec



#### Methods that we have to implement

- [x] K-shingles
- [ ] Signature sets
- [ ] Min hash
- [ ] LSH
- [ ] Candidate similarities
- [ ] Return results
- [ ] Count false neg and pos

#### Notater:

k shingles: gjøre på nytt (fikse)
            finner set med k-ord/karakterer i alle dokument og legger til i k-shingles

signature matrix:   lager liste med hash-verdier som representerer k-shingles fra dokumentene
                    legge til hashing

min hash            a b p = primtall kanskje funker

lsh forklaring av maggy og pp

fortsettelse kommer:


In [25]:
# METHOD FOR TASK 1
# Creates the k-Shingles of each document and returns a list of them
def k_shingles():
    """Collect the distinct word-level k-shingles found across all documents.

    A shingle is ``k`` consecutive whitespace-separated words joined by single
    spaces, where ``k`` is read from ``parameters_dictionary["k"]``. Documents
    with fewer than ``k`` words contribute no shingle.

    Returns:
        list of str: every distinct shingle exactly once, in order of first
        appearance (documents are visited in ``document_list`` order).

    Raises:
        ValueError: if the parameter ``k`` is missing or smaller than 1.
    """
    k = parameters_dictionary.get("k")
    if k is None or k < 1:
        raise ValueError("parameter 'k' must be a positive integer")

    # A dict de-duplicates like a set but keeps insertion order, which makes
    # the returned list reproducible between runs.
    unique_shingles = {}

    for document in document_list.values():
        words = document.split()

        # Slide the window over the *words*: bounding the loop by the number
        # of characters would produce truncated and empty shingles.
        for start in range(len(words) - k + 1):
            shingle = ' '.join(words[start:start + k])
            unique_shingles[shingle] = None

    return list(unique_shingles)


In [98]:
def k_shingles2():
    """Build the word-level k-shingles of every document, one list per document.

    A shingle is ``k`` consecutive whitespace-separated words joined by single
    spaces, where ``k`` is read from ``parameters_dictionary["k"]``.

    Returns:
        list of list of str: one inner list per document, in ``document_list``
        iteration order. Each inner list holds that document's distinct
        shingles in order of first appearance; it is empty when the document
        has fewer than ``k`` words.
    """
    # Get the value k from the parameters dictionary
    k = parameters_dictionary.get("k")

    docs_k_shingles = []  # holds the k-shingles of each document
    for document in document_list.values():
        words = document.split()
        windows = (
            ' '.join(words[start:start + k])
            for start in range(len(words) - k + 1)
        )
        # dict.fromkeys drops repeated shingles while keeping their first-seen
        # order, without a linear list scan for every shingle.
        docs_k_shingles.append(list(dict.fromkeys(windows)))

    return docs_k_shingles

# Per-document shingle lists as returned by k_shingles2().
docs_k_shingles = k_shingles2()

In [99]:
# Length of the first shingle of the first document (a character count when
# the shingle is a string), followed by the complete nested list.
print(len((docs_k_shingles[0][0])))
print((docs_k_shingles))

22
[['the cat plays with the', 'cat plays with the dog'], ['the dog plays with the', 'dog plays with the cat'], ['the boy plays with the', 'boy plays with the dog'], ['the cat eats a fish'], ['the dog eats a bone']]


In [95]:
# NOTE(review): the output saved under this cell shows lists of word lists,
# while k_shingles2 as defined in this notebook returns lists of joined
# strings; the output appears to come from an earlier revision - re-run the
# notebook from a fresh kernel to refresh it.
sh = k_shingles2()
#print(type(sh))
print(sh)



[[['the', 'cat', 'plays', 'with', 'the'], ['cat', 'plays', 'with', 'the', 'dog']], [['the', 'dog', 'plays', 'with', 'the'], ['dog', 'plays', 'with', 'the', 'cat']], [['the', 'boy', 'plays', 'with', 'the'], ['boy', 'plays', 'with', 'the', 'dog']], [['the', 'cat', 'eats', 'a', 'fish']], [['the', 'dog', 'eats', 'a', 'bone']]]


In [7]:
# NOTE(review): the saved output of this cell is a single string, whereas `sh`
# as assigned from k_shingles2() holds one list per document; the output
# appears to be stale (execution counts in this notebook are not sequential).
#for i in range(1000):
i = 1
print(sh[i])

with the dog


In [8]:
# Make the document matrix
def make_document_matrix(document_collection, k_shingles_set):
    """Build a 0/1 shingle-by-document incidence table.

    Args:
        document_collection: the documents, given either as a sequence or as
            a dict whose values are the documents (such as ``document_list``).
            Each document must support ``shingle in document``.
        k_shingles_set: iterable of shingles; the table gets one row per
            shingle, in iteration order.

    Returns:
        pandas.DataFrame: one column ``'d1'``, ``'d2'``, ... per document (in
        the order the documents were given) holding 1 where the shingle occurs
        in that document and 0 otherwise, followed by a ``'shingle'`` column
        with each shingle wrapped in single quotes.
    """
    # For a dict the documents are the values; iterating it directly would
    # test the shingles against the keys instead.
    if isinstance(document_collection, dict):
        documents = list(document_collection.values())
    else:
        documents = list(document_collection)

    # Materialise once so rows and the label column share the same order.
    shingles = list(k_shingles_set)

    # One column per actual document: a fixed set of columns would let a later
    # document spill into the 'shingle' label column.
    columns = {
        f"d{position}": [int(shingle in doc) for shingle in shingles]
        for position, doc in enumerate(documents, start=1)
    }
    columns['shingle'] = [f"'{shingle}'" for shingle in shingles]

    return pd.DataFrame(columns)


# Make the document matrix for the documents
#document_matrix = make_document_matrix(document_list, sh)

# Display the document matrix
#display(document_matrix)

In [100]:
# METHOD FOR TASK 2
# Creates a signatures set of the documents from the k-shingles list
def signature_set(k_shingles):
    """Encode every document as a 0/1 vector over the given shingles.

    Args:
        k_shingles: the shingles to test; entry ``n`` of each vector refers to
            the ``n``-th shingle in iteration order.

    Returns:
        list of list of int: one vector per document, in ``document_list``
        iteration order, with 1 where ``shingle in document`` holds and 0
        otherwise.
    """
    return [
        [1 if shingle in text else 0 for shingle in k_shingles]
        for text in document_list.values()
    ]

In [159]:
"""
signature matrix:   
    lager liste med hash verider som representerer kshingles fra i dokumentene
    legge til hashing

"""
# METHOD FOR TASK 2
# Creates a signatures set of the documents from the k-shingles list
def signature_set_lang(shingles):
    """Debug helper for task 2: print each document next to its own shingles.

    Work in progress - no signature is computed yet. The function only prints
    and returns ``None``.

    Args:
        shingles: one sequence of shingles per document, aligned by position
            with ``document_list`` iteration order (the layout returned by
            ``k_shingles2``).
    """
    print(shingles)

    # Pair documents and shingles by position instead of assuming that the
    # document ids are exactly 1..n.
    for position, document in enumerate(document_list.values()):
        print("key ", position)
        print("doc: ", document)
        for shingle in shingles[position]:
            print(shingle)
        print("")

# signature_set_lang() only prints and returns None, so its result is not
# stored: binding it to the name `signature_set` would replace the task-2
# function of that name with None.
signature_set_lang(docs_k_shingles)


[['the cat plays with the', 'cat plays with the dog'], ['the dog plays with the', 'dog plays with the cat'], ['the boy plays with the', 'boy plays with the dog'], ['the cat eats a fish'], ['the dog eats a bone']]
key  0
doc:  the cat plays with the dog
the cat plays with the
cat plays with the dog

key  1
doc:  the dog plays with the cat
the dog plays with the
dog plays with the cat

key  2
doc:  the boy plays with the dog
the boy plays with the
boy plays with the dog

key  3
doc:  the cat eats a fish
the cat eats a fish

key  4
doc:  the dog eats a bone
the dog eats a bone



In [109]:

# NOTE(review): signature_set_lang as defined in this notebook has no return
# statement, so `sig` is None after this line; the numbers saved under this
# cell appear to come from an earlier revision of the function.
sig = signature_set_lang(sh)

1
2
3
4
5


In [11]:
# Length of the first entry of `sig`.
# NOTE(review): this requires `sig` to be a non-empty sequence; confirm which
# function is meant to produce it before relying on the saved output.
print(len(sig[0]))


25


In [12]:
def is_prime(n):
    """Return True when ``n`` is a prime number, False otherwise.

    Uses trial division: after ruling out multiples of 2 and 3, only divisors
    of the form 6m - 1 and 6m + 1 are tried, up to the square root of ``n``.
    """
    if n == 2 or n == 3:
        return True
    if n < 2 or n % 2 == 0:
        return False
    if n < 9:
        return True
    if n % 3 == 0:
        return False

    # since all primes > 3 are of the form 6n ± 1
    # start with f=5 (which is prime)
    # and test f, f+2 for being prime
    # then loop by 6.
    f = 5
    # Comparing f * f with n keeps the bound in exact integer arithmetic; a
    # floating-point square root loses precision for large n and cannot
    # represent very large integers at all.
    while f * f <= n:
        if n % f == 0:
            return False
        if n % (f + 2) == 0:
            return False
        f += 6
    return True

In [13]:
# METHOD FOR TASK 3
# Creates the minHash signatures after simulation of permutations
# Kept so that other code using the global generator stays reproducible;
# minHash itself no longer depends on the global generator's state.
random.seed(11)

def minHash(document_vector, seed=11):
    """Compute the MinHash signature of one document vector.

    The number of simulated permutations is read from
    ``parameters_dictionary["permutations"]``. The permutations are drawn from
    a private generator seeded with ``seed``, so every call with a vector of
    the same length uses the same permutations. This is what makes the
    signatures of different documents comparable with each other.

    Args:
        document_vector: sequence of 0/1 flags, one per shingle. All documents
            that are to be compared must use vectors of the same length.
        seed: seed of the private random generator that draws the
            permutations.

    Returns:
        list of int: one value per permutation - the smallest permuted row
        index among the rows flagged 1. For a vector without any 1 the value
        is ``len(document_vector)``, i.e. larger than every valid row index.
    """
    number_of_hashes = parameters_dictionary.get("permutations")

    rng = random.Random(seed)
    row_count = len(document_vector)
    # Only rows flagged 1 can contribute to the minimum; find them once.
    active_rows = [row for row, flag in enumerate(document_vector) if flag == 1]

    signature = []
    for _ in range(number_of_hashes):
        permutation = list(range(row_count))
        rng.shuffle(permutation)
        signature.append(
            min((permutation[row] for row in active_rows), default=row_count)
        )

    return signature

In [None]:
# New minhash


In [14]:
# Build one MinHash signature per entry of `sig`, keeping the same order.
minhash_signature_matrix = []
for i in range(len(sig)):
    minhash_signature_matrix.append(minHash(sig[i]))


In [18]:
# Number of signatures built, then the first entry of `sig` for inspection.
print(len(minhash_signature_matrix))
print(sig[0])

5
[1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]


In [None]:
# METHOD FOR TASK 4
# Hashes the MinHash Signature Matrix into buckets and find candidate similar documents
def lsh(m_matrix):
    """Placeholder for task 4: LSH over the MinHash signature matrix.

    Nothing is implemented yet, so ``m_matrix`` is ignored and the returned
    list of candidate sets of documents is always empty.
    """
    # list of candidate sets of documents for checking similarity
    candidates = list()

    # implement your code here

    return candidates

In [None]:
# METHOD FOR TASK 4
# Hashes the MinHash Signature Matrix into buckets and find candidate similar documents
def lsh(m_matrix):
    """Find candidate pairs of possibly similar documents (not implemented yet).

    Planned behaviour, per the task notes: hash the MinHash signature matrix
    into buckets, using the rows ``r`` and the number of ``buckets`` from
    ``parameters_dictionary`` as input parameters, and report the documents
    that may be similar.

    Args:
        m_matrix: the MinHash signature matrix.

    Returns:
        list: candidate sets of documents for checking similarity; currently
        always empty because the hashing step is still missing.
    """
    candidates = []  # list of candidate sets of documents for checking similarity

    # Inputs for the hashing step; read here so the implementation can use them.
    r = parameters_dictionary.get("r")
    buckets = parameters_dictionary.get("buckets")

    # TODO: implement your code here

    return candidates