## Laboratorium 4
#### Bartosz Hanc

In [9]:
import numpy as np
from tabulate import tabulate
from typing import Any, Sequence
from functools import partial


1. Zaimplementuj przynajmniej 3 "metryki" spośród wymienionych: cosinusowa, LCS,
   DICE, euklidesowa, Levenshteina.

In [3]:
def preprocess(text: str) -> str:
    """Normalize a document for comparison.

    The text is lowercased and every character listed in
    ``string.punctuation`` (ASCII punctuation) is dropped. Whitespace,
    including newlines, is left untouched.

    Args:
        text: raw document.

    Returns:
        The lowercased text without punctuation characters.
    """
    import string

    lowered = text.lower()
    kept_chars = (ch for ch in lowered if ch not in string.punctuation)

    return "".join(kept_chars)


def text_to_vec(docs: list[str]) -> list[np.ndarray]:
    """Convert documents to word-frequency vectors over a shared vocabulary.

    Every document is normalized with ``preprocess()`` and split on
    whitespace. The vocabulary is the set of all words seen in any document,
    ordered by first occurrence, so all returned vectors have the same length
    and the same word at each position.

    Args:
        docs: documents to vectorize; may be empty.

    Returns:
        One numpy array of word counts per document, in the order of ``docs``.
        An empty ``docs`` yields an empty list.
    """
    # Tokenize each document exactly once; both the vocabulary and the counts
    # are derived from these token lists.
    tokenized = [preprocess(text).split() for text in docs]

    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    vocabulary = list(
        dict.fromkeys(word for tokens in tokenized for word in tokens)
    )

    freq_vecs = []
    for tokens in tokenized:
        counts = dict.fromkeys(vocabulary, 0)
        for word in tokens:
            counts[word] += 1

        freq_vecs.append(np.array(list(counts.values())))

    return freq_vecs


def levenshtein(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
    """Compute the Levenshtein (edit) distance between two sequences.

    Works on any indexable sequences whose elements support ``==`` (strings,
    lists of words, ...). Insertions, deletions and substitutions each cost 1.

    Args:
        seq_a: first sequence; may be empty.
        seq_b: second sequence; may be empty.

    Returns:
        The minimal number of single-element edits turning ``seq_a`` into
        ``seq_b``.
    """
    m, n = len(seq_a), len(seq_b)

    # previous[j] is the distance between the first i-1 elements of seq_a and
    # the first j elements of seq_b. Row 0 is the distance from the empty
    # prefix, i.e. j insertions. The table needs n + 1 columns and m + 1 rows
    # so that the empty prefixes are represented.
    previous = list(range(n + 1))

    for i in range(1, m + 1):
        # Column 0: deleting all i elements of the seq_a prefix.
        current = [i] + [0] * n

        for j in range(1, n + 1):
            cost = 0 if seq_a[i - 1] == seq_b[j - 1] else 1
            current[j] = min(
                previous[j] + 1,  # deletion
                current[j - 1] + 1,  # insertion
                previous[j - 1] + cost,  # change
            )

        previous = current

    return previous[n]


def metric(x, y, type="euclidean"):
    """Distance between two texts under the selected metric.

    Args:
        x (str): first text.
        y (str): second text.
        type (str): metric name. "euclidean" compares word-frequency vectors
            built by text_to_vec(); "levenshtein" compares the preprocessed
            texts as sequences of words.

    Returns:
        The distance between ``x`` and ``y``.

    Raises:
        Exception: for "damerau" (not implemented) and for any other
            unrecognized metric name.
    """
    if type == "euclidean":
        vec_a, vec_b = text_to_vec([x, y])
        difference = vec_a - vec_b

        return np.linalg.norm(difference)

    if type == "levenshtein":
        # Word-level edit distance: tokens are whole words, not characters.
        words_x = preprocess(x).split()
        words_y = preprocess(y).split()

        return levenshtein(words_x, words_y)

    if type == "damerau":
        raise Exception("Not yet implemented")

    raise Exception(f"Unimplemented metric of type {type}")


2. Zaimplementuj przynajmniej 1 sposób oceny jakości klasteryzacji (np. indeks
   Daviesa-Bouldina).

In [4]:
def centroids(clusters, d):
    """Pick a representative text (medoid) for every cluster.

    The representative of a cluster is the member whose summed distance to
    all members of that cluster is smallest; on ties the member encountered
    first wins. A cluster without members gets ``None``.

    Args:
        clusters (dict): maps a cluster label to an iterable of texts
        d (function: str, str -> float): distance function between strings

    Returns:
        dict: maps each cluster label to its representative text (or None)
    """

    def total_distance(candidate, members):
        # Sum of distances from the candidate to every member (itself included).
        return sum(d(candidate, other) for other in members)

    medoids = {}

    for label, members in clusters.items():
        best_text = None
        best_total = float("inf")

        for candidate in members:
            candidate_total = total_distance(candidate, members)

            # Strict comparison keeps the earliest candidate on ties.
            if candidate_total < best_total:
                best_text = candidate
                best_total = candidate_total

        medoids[label] = best_text

    return medoids


def DB_index(clusters, d):
    """Davies-Bouldin index of a clusterization (lower means better separated).

    For each cluster the scatter is the mean distance from its representative
    text (as returned by centroids()) to its members. The index is the
    average, over clusters, of the largest ratio
    ``(scatter_i + scatter_j) / d(centroid_i, centroid_j)`` taken over all
    other clusters ``j``.

    Args:
        clusters (dict): maps a cluster label to a collection of texts
        d (function: str, str -> float): distance function between strings

    Returns:
        float: DB index
    """
    representatives = centroids(clusters, d)

    # Mean distance of every member to the representative of its cluster.
    scatter = {
        label: sum(d(representatives[label], text) for text in members) / len(members)
        for label, members in clusters.items()
    }

    labels = list(clusters)
    total = 0

    for i, label_i in enumerate(labels):
        ratios = [
            (scatter[label_i] + scatter[label_j])
            / d(representatives[label_i], representatives[label_j])
            for j, label_j in enumerate(labels)
            if j != i
        ]

        # The leading 0 is the value used when there is no other cluster.
        total += max([0, *ratios])

    return total / len(clusters)


3. Stwórz stoplistę najczęściej występujących słów i zastosuj ją jako
   pre-processing dla nazw. Algorytmy klasteryzacji powinny działać na dwóch
   wariantach: z pre-processingiem i bez pre-processingu.

In [121]:
from random import randint, sample


def k_means(k, docs, d, iters=2):
    """Implements k-means clustering algorithm

    Texts are first assigned to clusters uniformly at random. Each iteration
    then recomputes the representative text of every cluster with
    centroids() and reassigns every text to the cluster whose representative
    is nearest under ``d``. Clusters are sets, so duplicate texts collapse
    into one entry.

    Args:
        k (int): number of clusters
        docs (list[string]): list of texts
        d (function): metric function
        iters (int): number of recompute/reassign iterations; 0 returns the
            random initial assignment

    Returns:
        dict: dict representing found clusterization (label -> set of texts)

    Raises:
        AssertionError: if ``k`` is not smaller than ``len(docs)``
    """
    assert k < len(docs), "k should be smaller than len(docs)"

    clusters = {i: set() for i in range(k)}
    for text in docs:
        clusters[randint(0, k - 1)].add(text)

    for iteration in range(iters):
        print(f"Iteration: {iteration}")
        print("Calculating  centroids...")
        C = centroids(clusters=clusters, d=d)

        print("Clearing clusters...")
        clusters = {i: set() for i in range(k)}

        print("Reassigning to clusters...")
        for text in docs:
            # A cluster that ended up empty has no representative (None); give
            # it infinite distance so it is only chosen if every cluster is
            # empty.
            nearest = min(
                C,
                key=lambda label: float("inf")
                if C[label] is None
                else d(text, C[label]),
            )
            clusters[nearest].add(text)

    return clusters


In [161]:
# Number of clusters requested from k_means().
k = 200
# The string literal below is a disabled toy corpus: it is a bare expression
# statement, so it does NOT assign anything to `docs`.
'''
docs = [
    "A brown fox was jumped over by white wolf",
    "White wolf jumped over a brown fox",
    "Apple fell on the ground",
    "Apple fell on Isaac Newton's head",
    "Some random message",
    "Message that is random",
    "Hello from future",
]
'''
# NOTE(review): `docs` is not assigned in this cell. The only visible
# assignment is in the cell further down that reads lines.txt, so this cell
# only works if that cell has been executed first (hidden notebook state).
# Distance function: metric() with its `type` argument fixed to "euclidean".
d = partial(metric, type="euclidean")
# NOTE(review): the output recorded for this cell ends with KeyboardInterrupt
# right after "Calculating  centroids..." of iteration 1, i.e. the saved run
# did not finish.
k_means(k, docs, d, iters=2)


Iteration: 0
Calculating  centroids...
Clearing clusters...
Reassigning to clusters...
Iteration: 1
Calculating  centroids...


KeyboardInterrupt: 

4. Wykonaj klasteryzację zawartości załączonego pliku (lines.txt) przy użyciu
   metryk zaimplementowanych w pkt. 1. Każda linia to adres pocztowy firmy,
   różne sposoby zapisu tego samego adresu powinny się znaleźć w jednym
   klastrze.

In [153]:
# Load the raw address lines; each element of `docs` keeps its trailing
# newline, exactly as returned by readlines(). The context manager closes the
# file handle as soon as the lines have been read.
with open("lines.txt", "r") as lines_file:
    docs = lines_file.readlines()

# Lowercased, punctuation-free variant of every line, in the same order.
preprocessed_docs = [preprocess(text) for text in docs]

print(docs)
print(preprocessed_docs)


['/11692589 RD TUNA CANNERS, LTD. PORTION 1004, SIAR NORTH COAST ROAD, P.O.BOX 2113, MADANG, PAPUA NEW GUINEA\n', "''PA INTERIOR'' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611\n", "''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669\n", "''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--\n", "''TOPEX SP. Z O.O.'' SPOLKA KOMANDYTOWA UL. POGRANICZNA 2/4  02-285 WARSZAWA POLAND\n", "'MASTER PLUS CO.,LTD.' 143000,RUSSIA,MO,ODINSOVO, MOJAISKOE, SHOSSE,153G TEL:+7495 7273939\n", '"2TIGERS GROUP LIMITED"  ROOM 504 JINSHAZHOU SHANGSHUI ROAD,  GUANGZHOU 510160\n', '"ALDETRANS" LLC, 105066, MOSCOW, RUSSIA, TOKMAKOV LANE, 11. TEL:+7(495)641-03-89\n', '"A-LIFT",JSC 1 PROSPEKT MARSHALA ZHUKOVA,MOSCOW 123308,RUSSIA  T: +7(495)784-7961\n', '"ALISA" LTD, 1/5 Derbenevskaya str., Moscow, Russia Tel./Fax: (495) 987-13-07 postal code: 115114\

5. Porównaj jakość wyników sposobami zaimplementowanymi w pkt. 2.

In [8]:
# Parse the reference clustering from clusters.txt. Blank lines are ignored
# and every separator line closes the cluster collected so far.
CLUSTER_SEPARATOR = "##########"
# Reference clusters with fewer members than this are left out of the
# evaluation.
MIN_CLUSTER_SIZE = 5

# The context manager closes the file handle once the content has been read.
with open("clusters.txt", "r") as clusters_file:
    lines = [text for text in clusters_file.read().splitlines() if text != ""]

# Number of separator lines in the file.
N = lines.count(CLUSTER_SEPARATOR)

# Keys are the positions of the clusters in the file, so the labels of the
# kept clusters may have gaps where small clusters were dropped.
model_clusters = {}
members = []

i = 0
for text in lines:
    if text == CLUSTER_SEPARATOR:
        if len(members) >= MIN_CLUSTER_SIZE:
            model_clusters[i] = members
        members = []
        i += 1
    else:
        members.append(text)

# Lines after the last separator form one more cluster; without this step a
# file that does not end with a separator could not be parsed.
if len(members) >= MIN_CLUSTER_SIZE:
    model_clusters[i] = members


d = partial(metric, type="euclidean")
DB_index(model_clusters, d)


1.855748134208649