In [None]:
import numpy as np
import os
import pandas as pd
import random
import re
import zipfile

from collections import defaultdict, namedtuple
from itertools import count

from datasketch import MinHashLSHForest, MinHash

In [None]:
# Three sample news paragraphs: the first two are near-duplicates of each other
# (same sentences with different names and numbers), the third is unrelated.
texts = ["""A strong earthquake has struck central Mexico, killing more than 200 people and toppling dozens of buildings
            in the capital, Mexico City. More than 20 children died and 30 are missing after a school collapsed,
            President Enrique Pena Nieto said. The 7.1 magnitude quake also caused major damage in neighbouring states.
            It struck shortly after many people had taken part in an earthquake drill, exactly 32 years after a quake
            killed thousands in Mexico City.""",
         """A strong earthquake has struck central Australia, killing more than 123 people and toppling dozens of buildings
            in the capital, Canberra. More than 25 children died and 100 are missing after a school collapsed,
            President Arnold Schwarzenegger said. The 6.2 magnitude quake also caused major damage in neighbouring states.
            It struck shortly after many people had taken part in an earthquake drill, exactly 20 years after a quake
            killed thousands in Kazakstan.""",
         """Spain's Guardia Civil police have detained 14 Catalan officials and raided regional government ministries
            involved in organising a banned independence vote. Tensions were already high before Josep Maria Jove,
            number two in the Catalan vice-presidency, and others were held. Thousands of Catalans took to the streets
            in protest and the regional leader complained of a power grab. Spanish Prime Minister Mariano Rajoy said the
            state had been forced to act."""]

In [None]:
def shingling(text, k):
    """Return the set of all length-``k`` character shingles of ``text``.

    Args:
        text: String to cut into overlapping substrings.
        k: Shingle length in characters; must be at least 1.

    Returns:
        A frozenset holding every distinct substring ``text[i:i + k]``.
        It is empty when ``text`` is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # k == 0 would yield the single shingle "" and a negative k would
        # slice arbitrary fragments; neither is a meaningful shingle set.
        raise ValueError(f"shingle length k must be positive, got {k}")
    last_start = len(text) - k
    # A negative last_start gives an empty range, hence an empty frozenset.
    return frozenset(text[start:start + k] for start in range(last_start + 1))


def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    # Raw string: "\s" inside a plain string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r"\s+", " ", text).strip()


def read_document(file_path, k):
    """Load the file at ``file_path`` and return its lowercase ``k``-shingles.

    The file is decoded as ISO-8859-1, its whitespace is normalised with
    ``clean_text`` and the lowercased result is handed to ``shingling``.

    NOTE(review): a one-argument ``read_document`` is defined again further
    down in this notebook and replaces this one once that cell has run.
    """
    with open(file_path, "r", encoding="ISO-8859-1") as handle:
        raw = handle.read()
    normalised = clean_text(raw.replace("\n", " "))
    return shingling(normalised.lower(), k)

In [None]:
class CharacteristicMatrixCreator:
    """Build 0/1 characteristic matrices and remember the shingle <-> code mapping.

    Every distinct shingle receives an integer code the first time it is
    seen. The codes label the rows of the matrices produced by
    ``create_characteristic_matrix`` and can be turned back into shingles
    with ``decode``.
    """

    def __init__(self):
        self._encoding = dict()  # shingle -> code
        self._decoding = dict()  # code -> shingle
        self._code_generator = count()  # yields 0, 1, 2, ...

    def _add_item_to_encoding(self, shingle):
        """Assign the next free code to ``shingle``, record it both ways, return it."""
        next_code = next(self._code_generator)
        self._encoding[shingle] = next_code
        self._decoding[next_code] = shingle
        return next_code

    def _encode(self, shingle):
        """Return the code of ``shingle``, allocating a new one on first use."""
        if shingle in self._encoding:
            return self._encoding[shingle]
        return self._add_item_to_encoding(shingle)

    def decode(self, number):
        """Return the shingle that was assigned the code ``number``.

        Raises:
            KeyError: If ``number`` has not been assigned to any shingle.
        """
        return self._decoding[number]

    def create_characteristic_matrix(self, shingles, labels):
        """Build the characteristic matrix of several shingle sets.

        Args:
            shingles: Sequence of shingle sets, one per document.
            labels: Column labels, one per entry of ``shingles``.

        Returns:
            A DataFrame with one row per distinct shingle (labelled by its
            integer code) and one column per label; a cell is 1 when the
            document contains the shingle and 0 otherwise.
        """
        # Starting from an empty frozenset keeps an empty ``shingles`` list
        # from raising, unlike the unbound ``frozenset.union(*shingles)``.
        all_shingles = frozenset().union(*shingles)
        try:
            # Set iteration order of strings changes between interpreter runs
            # (hash randomisation); sorting makes codes and row order
            # reproducible.
            ordered_shingles = sorted(all_shingles)
        except TypeError:
            # Shingles that cannot be ordered keep the set's iteration order.
            ordered_shingles = list(all_shingles)
        codes = [self._encode(shingle) for shingle in ordered_shingles]
        data_as_columns = [
            {code: int(shingle in text_shingles)
             for code, shingle in zip(codes, ordered_shingles)}
            for text_shingles in shingles
        ]
        return pd.DataFrame(data_as_columns, index=labels).transpose()

In [None]:
# Small hand-made example: three shingle sets that share some of their shingles.
cmc = CharacteristicMatrixCreator()


df = cmc.create_characteristic_matrix(
    [frozenset({"ab", "bc", "xy", "tv"}), 
     frozenset({"ef", "bc", "xy", "za", "cd"}),
     frozenset({"xy", "bc", "ab", "km"})
    ], ["first", "second", "third"])

In [None]:
# Display the toy characteristic matrix (rows: shingle codes, columns: labels).
df

In [None]:
def calc_jaccard_similarity(s, t):
    """Return the Jaccard similarity ``len(s & t) / len(s | t)`` of two sets.

    Args:
        s: First set.
        t: Second set.

    Returns:
        A float between 0.0 and 1.0. Two empty sets are treated as identical
        and give 1.0 instead of raising ``ZeroDivisionError``.
    """
    union_size = len(s | t)
    if union_size == 0:
        # Both sets are empty, e.g. two texts shorter than the shingle length.
        return 1.0
    return len(s & t) / union_size

      
def find_index_of_first_nonzero_row(characteristic_vector):
    """Return the index label of the first entry equal to 1 in the column.

    Only entries that compare equal to 1 are considered. If there is none,
    indexing into the empty result raises ``IndexError``.
    """
    is_one = characteristic_vector == 1
    rows_with_one = characteristic_vector.loc[is_one]
    return rows_with_one.index[0]

In [None]:
def minhashing(characteristic_matrix, n, seed=123):
    """Compute ``n`` MinHash signature rows by repeatedly shuffling the matrix rows.

    Args:
        characteristic_matrix: DataFrame with one column per document; every
            column needs at least one entry equal to 1.
        n: Number of row shuffles, i.e. the length of each signature.
        seed: Seed of the NumPy random state that drives the shuffles. The
            default, 123, is the value that used to be hard-coded.

    Returns:
        A DataFrame with ``n`` rows and the columns of
        ``characteristic_matrix``. Entry ``(j, doc)`` is the row label of the
        first 1 in column ``doc`` after the ``j``-th shuffle.
    """
    rng = np.random.RandomState(seed)
    _, ncols = characteristic_matrix.shape
    minhash_signatures = [[0] * n for _ in range(ncols)]
    # ``sample`` returns a new frame each time, so the caller's matrix is
    # never modified; only this local name is rebound.
    shuffled = characteristic_matrix
    for j in range(n):
        # Every pass reshuffles the previous permutation with the same generator.
        shuffled = shuffled.sample(frac=1, random_state=rng)
        for col, colname in enumerate(shuffled.columns):
            minhash_signatures[col][j] = find_index_of_first_nonzero_row(shuffled[colname])
    return pd.DataFrame(data=minhash_signatures, index=characteristic_matrix.columns).transpose()

In [None]:
# Ten MinHash signature rows for the toy matrix.
minhashing(df, 10)

In [None]:
def compute_similarity(shingles, signatures, i, j, n):
    """Return the exact and the signature-based similarity of documents ``i`` and ``j``.

    Args:
        shingles: Sequence of shingle sets, indexed by document position.
        signatures: DataFrame of signatures with one column per document.
        i: Position of the first document.
        j: Position of the second document.
        n: Divisor for the number of matching signature entries.

    Returns:
        A tuple ``(similarity, approximate_similarity)``: the Jaccard
        similarity of the two shingle sets and the number of positions where
        the two signature columns agree, divided by ``n``.
    """
    exact = calc_jaccard_similarity(shingles[i], shingles[j])
    first_signature = signatures.iloc[:, i]
    second_signature = signatures.iloc[:, j]
    matches = 0
    for left, right in zip(first_signature, second_signature):
        if left == right:
            matches += 1
    return exact, matches / n


def create_shingles_from_text(text, k):
    """Normalise the whitespace of ``text`` and return its ``k``-shingles.

    The text is passed through ``clean_text`` and then ``shingling``; letter
    case is left unchanged.
    """
    return shingling(clean_text(text), k)

In [None]:
# Shingle the three sample texts with k=5 and build their characteristic matrix.
shingles = [create_shingles_from_text(text, 5) for text in texts]
cmc = CharacteristicMatrixCreator()
characteristic_matrix = cmc.create_characteristic_matrix(shingles, ['A', 'B', 'C'])

# Number of row shuffles, i.e. the signature length.
n = 100
signatures = minhashing(characteristic_matrix, n)

In [None]:
# First ten signature rows.
signatures[:10]

In [None]:
# Exact vs. signature-based similarity of texts 0 and 1 (the two earthquake reports).
jaccard_similarity, approximate_similarity = compute_similarity(shingles, signatures, 0, 1, n)


print(jaccard_similarity)
print(approximate_similarity)

In [None]:
# Download the archive to /tmp (needs the wget command and network access).
!wget http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip -O /tmp/bbc-fulltext.zip

In [None]:
# The context manager closes the archive even when extraction fails.
with zipfile.ZipFile('/tmp/bbc-fulltext.zip', 'r') as zip_ref:
    zip_ref.extractall('/tmp/')

In [None]:
# Root folder of the documents; presumably created by extracting the
# downloaded archive into /tmp/ (TODO confirm the archive's top-level folder).
base_dir = '/tmp/bbc/'


# Record type for a named document and its shingles.
# NOTE(review): not referenced anywhere else in the visible notebook.
ShingledDocument = namedtuple("ShingledDocument", ["name", "shingles"])


def get_subfolders(path):
    """Return the paths of the directories directly inside ``path``, sorted.

    ``os.scandir`` yields entries in arbitrary order, so the result is sorted
    to keep downstream processing reproducible. The ``with`` block closes the
    directory iterator as soon as the listing is done.
    """
    with os.scandir(path) as entries:
        return sorted(entry.path for entry in entries if entry.is_dir())


def get_filenames(path):
    """Return the paths of the regular files directly inside ``path``, sorted.

    ``os.scandir`` yields entries in arbitrary order, so the result is sorted
    to keep downstream processing reproducible. The ``with`` block closes the
    directory iterator as soon as the listing is done.
    """
    with os.scandir(path) as entries:
        return sorted(entry.path for entry in entries if entry.is_file())

In [None]:
# get_subfolders returns directory paths, so these "labels" are paths under
# base_dir rather than bare folder names.
labels = get_subfolders(base_dir)

print(labels)

In [None]:
def read_document(file_path):
    """Return the whitespace-normalised text of the file at ``file_path``.

    The file is decoded as ISO-8859-1 and passed through ``clean_text``.

    NOTE(review): this redefines the earlier two-argument ``read_document``
    of this notebook; once this cell has run, only this version is callable.
    """
    with open(file_path, "r", encoding="ISO-8859-1") as handle:
        raw = handle.read()
    return clean_text(raw.replace("\n", " "))


def iterate_over_documents(path, k):
    """Yield ``(label, file_name, content)`` for every file one level below ``path``.

    Args:
        path: Directory whose immediate subfolders each hold the documents
            of one label.
        k: Not used by this function; kept so existing callers keep working.

    Yields:
        Tuples of the subfolder name, the file name and the text returned by
        ``read_document`` for that file.
    """
    for subfolder in get_subfolders(path):
        # basename instead of split('/') so the names are also correct with
        # platform-specific path separators.
        label = os.path.basename(subfolder)
        for file_path in get_filenames(subfolder):
            yield label, os.path.basename(file_path), read_document(file_path)

In [None]:
def create_document_shingles(path, k):
    """Return ``(label, file_name, shingles)`` tuples for all documents under ``path``.

    Documents come from ``iterate_over_documents``; each text is turned into
    ``k``-shingles with ``create_shingles_from_text``.
    """
    return [
        (label, file_name, create_shingles_from_text(text, k))
        for label, file_name, text in iterate_over_documents(path, k)
    ]

In [None]:
# Shingle every document under base_dir with 5-character shingles.
documents = create_document_shingles(base_dir, k=5)

In [None]:
# Number of documents that were read.
len(documents)

In [None]:
# First entry: a (label, file_name, shingles) tuple.
documents[0]

In [None]:
class LSHForest:
    """Helper around datasketch's ``MinHashLSHForest`` with a fixed permutation count."""

    def __init__(self, nr_permutations):
        # Passed as ``num_perm`` to every MinHash and to the forest itself.
        self._nr_permutations = nr_permutations

    def _create_minhash(self, items):
        """Return a MinHash updated with the UTF-8 bytes of every string in ``items``."""
        minhash = MinHash(num_perm=self._nr_permutations)
        for item in items:
            minhash.update(item.encode("utf-8"))
        return minhash

    def build_lsh_forest(self, documents):
        """Build and index a forest from ``(topic, name, items)`` tuples.

        Args:
            documents: Iterable of 3-tuples; the third element is an iterable
                of strings (the notebook passes shingle sets).

        Returns:
            The ``MinHashLSHForest`` in which every document is stored under
            the key ``"<topic>/<name>"``. ``index()`` is called once after
            all documents have been added.
        """
        forest = MinHashLSHForest(num_perm=self._nr_permutations)
        for topic, name, items in documents:
            forest.add(f'{topic}/{name}', self._create_minhash(items))
        forest.index()
        return forest

    def get_top_k_most_similar_documents(self, forest, query_document, k):
        """Query ``forest`` with the MinHash of ``query_document``.

        Args:
            forest: Forest created by ``build_lsh_forest``.
            query_document: Iterable of strings, hashed the same way as the
                indexed documents.
            k: Passed to ``forest.query`` as the number of results.

        Returns:
            Whatever ``forest.query`` returns; per the datasketch
            documentation, the keys of the approximate top-``k`` matches.
        """
        return forest.query(self._create_minhash(query_document), k)

In [None]:
# Use 128 permutations for every MinHash and for the forest.
lsh_forest = LSHForest(nr_permutations=128)

In [None]:
# Add all shingled documents to the forest and index it.
forest = lsh_forest.build_lsh_forest(documents)

In [None]:
# The default is a machine-specific absolute path; set the QUERY_DOCUMENT_PATH
# environment variable to point the notebook at the query file elsewhere.
path = os.environ.get(
    'QUERY_DOCUMENT_PATH',
    '/home/tamas/Programok/Repositories/MachineLearningCourse/homework/bbc_near_duplicate_2.txt',
)


# Shingle the query with the same k=5 that was used for the indexed documents.
text = read_document(path)
query_shingles = create_shingles_from_text(text, k=5)

In [None]:
# Show the cleaned query text.
text

In [None]:
# Ask the forest for the 3 documents most similar to the query.
res = lsh_forest.get_top_k_most_similar_documents(forest, query_shingles, 3)

In [None]:
# Show the query result.
res

In [None]:
for fname in res:
    # Keys were stored as "<topic>/<file name>", i.e. relative to base_dir.
    # os.path.join also works when base_dir has no trailing slash.
    doc = read_document(os.path.join(base_dir, fname))
    print(doc)
    print('\n')