## Laboratorium 4
#### Bartosz Hanc

In [76]:
import numpy as np
from typing import Any, Sequence
from functools import partial


1. Zaimplementuj przynajmniej 3 "metryki" spośród wymienionych: cosinusowa, LCS,
   DICE, euklidesowa, Levenshteina.

In [107]:
def preprocess(text: str) -> str:
    # Your code here: Convert the text to lowercase. Remove all punctuation
    # marks;

    from string import punctuation

    text = text.lower().translate(str.maketrans("", "", punctuation))

    return text


def text_to_vec(docs: list[str]) -> list[list[int]]:
    # Your code here: Convert documents to numerical vectors. Preprocess
    # documents with the preprocess() function. Represent documents as vectors
    # of word frequencies, you will need to extract a vocabulary from all the
    # documents.
    freq_vecs = []

    from functools import reduce

    bag_of_words = {
        word: 0 for word in preprocess(reduce(lambda x, y: x + " " + y, docs)).split()
    }

    for text in docs:
        for word in bag_of_words:
            bag_of_words[word] = 0

        for word in preprocess(text).split():
            bag_of_words[word] += 1

        v = list(bag_of_words.values())
        freq_vecs.append(np.array(v))

    return freq_vecs


def levenshtein(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
    # Your code here:
    # Implement the Levenshtein distance calculation.
    # It should work on any sequences, not only on strings.

    m, n = len(seq_a), len(seq_b)
    d = [[None for _ in range(n)] for _ in range(m)]

    for i in range(m):
        d[i][0] = i

    for j in range(n):
        d[0][j] = j

    for i in range(1, m):
        for j in range(1, n):
            cost = 0 if seq_a[i] == seq_b[j] else 1
            min_cost = min(
                d[i - 1][j] + 1,  # deletion
                d[i][j - 1] + 1,  # insertion
                d[i - 1][j - 1] + cost,  # change
            )
            d[i][j] = min_cost

    return d[m - 1][n - 1]


def metric(x, y, type="euclidean"):
    match type:

        case "euclidean":
            vec_a, vec_b = text_to_vec([x, y])

            return np.linalg.norm(vec_a - vec_b)

        case "levenshtein":
            seq_a = preprocess(x).split()
            seq_b = preprocess(y).split()

            return levenshtein(seq_a, seq_b)

        case "damerau":
            raise Exception("Not yet implemented")

        case _:
            raise Exception(f"Unimplemented metric of type {type}")


2. Zaimplementuj przynajmniej 1 sposoby oceny jakości klasteryzacji (np. indeks
   Daviesa-Bouldina).

In [127]:
def centroids(clusters, d):
    """Calculates centroids of a given clusterization.

    Args:
        clusters (dict):  dict of clusters
        d (function: str, str -> float): distance function between strings
    
    Returns:
        dict: dict of calculated centroids
    """
    centroids = {}

    for cluster, docs in clusters.items():
        centroid, min_dist = None, float("inf")

        for text in docs:
            dist = sum([d(text, other) for other in docs])

            if dist < min_dist:
                centroid, min_dist = text, dist

        centroids[cluster] = centroid

    return centroids


def DB_index(clusters, d):
    """Calculates Davies-Bouldin index of a given clusterization.

    Args:
        clusters (dict): dict of clusters
        d (function: str, str -> float): distance function between strings

    Returns:
        float: DB index
    """
    C, sigma = centroids(clusters, d), {}

    for cluster, docs in clusters.items():
        sigma[cluster] = 0

        for text in docs:
            sigma[cluster] += d(C[cluster], text)

        sigma[cluster] /= len(docs)

    db_index = 0
    
    for i, cl_i in enumerate(clusters):
        max_ratio = 0
        for j, cl_j in enumerate(clusters):
            if i == j:
                continue

            max_ratio = max(
                max_ratio,
                (sigma[cl_i] + sigma[cl_j]) / d(C[cl_i], C[cl_j]),
            )

        db_index += max_ratio

    db_index /= len(clusters)

    return db_index


3. Stwórz stoplistę najczęściej występujących słów i zastosuj ją jako
   pre-processing dla nazw. Algorytmy klasteryzacji powinny działać na dwóch
   wariantach: z pre-processingiem i bez pre-processingu.

4. Wykonaj klasteryzację zawartości załączonego pliku (lines.txt) przy użyciu
   metryk zaimplementowanych w pkt. 1. Każda linia to adres pocztowy firmy,
   różne sposoby zapisu tego samego adresu powinny się znaleźć w jednym
   klastrze.

In [132]:
docs = open("lines.txt", "r").readlines()
preprocessed_docs = []
for text in docs:
    preprocessed_docs.append(preprocess(text))


5. Porównaj jakość wyników sposobami zaimplementowanymi w pkt. 2.

In [131]:
lines = open("clusters.txt", "r").read().splitlines()
lines = list(filter(lambda text: text != "", lines))

N = len(list(filter(lambda text: text == "##########", lines)))
model_clusters = {i: [] for i in range(N)}

i = 0
for text in lines:
    if text == "##########":
        if len(model_clusters[i]) < 5:
            del model_clusters[i]
        i += 1
    else:
        model_clusters[i].append(text)


d = partial(metric, type="levenshtein")
DB_index(model_clusters, d)


{0: '"SAME AS CONSIGNEE"', 1: '1.MCT', 5: 'TO THE ORDER', 22: 'PANTOS LOGISTICS CO.,LTD. O/B OF LGCHEM, LTD.', 31: 'AMETIST LLC TASHKENTSKAYA STREET 26-2 MOSCOW RUSSIA', 32: 'TO ORDER OF SHIPPER THAI UNION MANUFACTURING CO.,LTD. 979/13-16 M FLOOR, S.M.TOWER PHAHOLYOTHIN ROAD, SAMSENNAI, PHAYATHAI BANGKOK 10400 THAILAND', 34: 'ACERINOX POLSKA SP. Z O.O. DANISZEWSKA 23 03-230 WARSZAWA', 44: 'DAMCO POLAND 60 KWIATKOWSKIEGO 81-127 GDYNIA GDYNIA POLAND', 50: 'SINOTRANS ZHEJIANG YONGTONG COVDR8 CHANGCHUN RD2F315000 NINGBO CHINA', 60: '1.OOO SILMAR SPB,198035, RUSSIA , ST. PETERSBURG UL. DVINSKAYA 16/2, CONTACT PERSON:ALEXEY YASHIN TEL: +7812 495 8527', 62: 'TOKMANNI OY ISOLAMMINTIE 1 FI-04600MANTSALA  FINLAND', 66: 'DSV AIR&SEA SP.Z O.O. UL.J.WISNIEWSKIEGO 31 81-183 GDYNIA POLAND PHONE:+48(0)58 621 39 26 --', 67: 'HONOUR LANE SHIPPING LIMITED SHANGHAI BRANCH 12/F,ONE PRIME,NO.360 WUJIN ROAD, SHANGHAI,CHINA  POSTAL CODE: 200071 TEL: (86-21)60723272FAX', 69: '"TC"UNOTRANS"LTD 190020 ST.PETERSB

2.2032962654514963