# Similarity metrics

## Longest common subsequence similarity

In [16]:
# Time complexity: O(M * N), space complexity O(min(M, N))
def lcs_sim(string1, string2):
    n, m = len(string1), len(string2)
    if n < m: return lcs_sim(string2, string1)
    dp = [[0] * (m + 1) for _ in range(2)]

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if string1[i - 1] == string2[j - 1]:
                dp[1][j] = dp[0][j - 1] + 1
            else:
                dp[1][j] = max(dp[0][j], dp[1][j - 1])

        dp[0], dp[1] = dp[1], dp[0]

    return 1 - dp[0][m] / n

## Text similarity using Levenshtein distance

In [17]:
# Time complexity: O(N * M), space complexity: O(N)
def levenshtein_sim(from_str, to_str):
    n, m = len(to_str), len(from_str)
    dp = [[0] * (n + 1) for _ in range(2)]

    for i in range(1, n + 1):
        dp[0][i] = i

    for i in range(1, m + 1):
        dp[1][0] = i
        for j in range(1, n + 1):
            if from_str[i - 1] == to_str[j - 1]:
                dp[1][j] = dp[0][j - 1]
            else:
                dp[1][j] = min(dp[0][j - 1], dp[1][j - 1], dp[0][j]) + 1

        dp[0], dp[1] = dp[1], dp[0]

    return dp[0][n] / max(n, m)

## Sørensen-Dice similarity

In [18]:
from collections import Counter

def dice_sim(string1, string2):
    n, m = len(string1), len(string2)
    intersection = Counter(string1) & Counter(string2)
    return 1 - 2 * sum(intersection.values()) / (n + m)

# Metric for evaluating clustering algorithms

## Davies–Bouldin index

In [19]:
def get_centroid(cluster, metric):
    n = len(cluster)
    dist = [0 for _ in range(n)]
    for i in range(n):
        for j in range(i + 1, n):
            d = metric(cluster[i], cluster[j])
            dist[i] += d
            dist[j] += d
    
    return cluster[dist.index(min(dist))]

def mean_dist(cluster, centroid, metric):
    dist = 0
    for line in cluster:
        dist += metric(line, centroid)
    
    return dist / len(cluster)
    
def davies_bouldin_index(clusters, metric):
    n = len(clusters)
    centroids = [get_centroid(cluster, metric) for cluster in clusters]
    means = [mean_dist(clusters[i], centroids[i], metric) for i in range(len(clusters))]
    
    D = [0 for _  in range(n)]
    for i in range(n):
        for j in range(i + 1, n):
            D[i] = max(D[i], (means[i] + means[j]) / metric(centroids[i], centroids[j]))
    
    return sum(D) / n

# Stoplist

In [83]:
def preprocess_data(text, reject_prc):
    n = len(text)
    all_words = [word for line in text for word in line.split()]
    counter = Counter(all_words)
    
    # reject 1 % of the most common words
    reject_num = int(n * (reject_prc / 100))
    to_reject = [word[0] for word in counter.most_common()[:reject_num]]
    return ["".join([word for word in text[i] if word not in to_reject]) for i in range(len(text))]

## Data preparation

In [21]:
with open("lines.txt", "r", encoding="UTF-8") as file:
    text = [line.rstrip() for line in file.readlines()]

# Clustering

In [109]:
from sklearn.cluster import DBSCAN
import numpy as np


def cluster(text, metric, eps=0.5, preprocess=True, reject_prc=3):
    if preprocess:
        text = preprocess_data(text, reject_prc)
    
    n = len(text)
    X = np.arange(n).reshape(-1, 1)
    clustering = DBSCAN(metric=lambda i, j: metric(text[int(i)], text[int(j)]), eps=eps, min_samples=1).fit_predict(X)
    
    clusters = {}
    for i in range(n):
        if clustering[i] not in clusters:
            clusters[clustering[i]] = []
        
        clusters[clustering[i]].append(text[i])
    
    return clusters

## Clustering evaluation

In [78]:
def evaluate_clusters(text, metric, eps, reject_prc=3, preprocess=True):
    clusters = list(cluster(text, metric, eps, preprocess, reject_prc).values())
    print(f"Number of lines: {len(text)}")
    print(f"Number of clusters: {len(clusters)}")
    print(f"Similarity accepatance rate: {eps}")
    metrics = {dice_sim: "DICE", lcs_sim: "LCS", levenshtein_sim: "Levenshtein"}
    print(f"Metric: {metrics[metric]}")
    print(f"Davies-Bouldin index {davies_bouldin_index(clusters, metric)}")
    print(f"Preprocessed: {preprocess}")

#### Sørensen-Dice similarity

In [101]:
print("-" * 50)
evaluate_clusters(text[:1000], dice_sim, 0.3, reject_prc=10)
print()
evaluate_clusters(text[:1000], dice_sim, 0.3, reject_prc=10, preprocess=False)
print("-" * 50)

--------------------------------------------------
Number of lines: 1000
Number of clusters: 36
Similarity accepatance rate: 0.3
Metric: DICE
Davies-Bouldin index 0.2746851632498913
Preprocessed: True

Number of lines: 1000
Number of clusters: 63
Similarity accepatance rate: 0.3
Metric: DICE
Davies-Bouldin index 0.30538354365368836
Preprocessed: False
--------------------------------------------------


In [129]:
print("-" * 50)
evaluate_clusters(text[:100], dice_sim, 0.3, reject_prc=15)
print()
evaluate_clusters(text[:100], dice_sim, 0.3, reject_prc=15, preprocess=False)
print("-" * 50)

--------------------------------------------------
Number of lines: 100
Number of clusters: 16
Similarity accepatance rate: 0.3
Metric: DICE
Davies-Bouldin index 0.10588835951162523
Preprocessed: True

Number of lines: 100
Number of clusters: 17
Similarity accepatance rate: 0.3
Metric: DICE
Davies-Bouldin index 0.10762148721155555
Preprocessed: False
--------------------------------------------------


#### LCS similarity

In [128]:
print("-" * 50)
evaluate_clusters(text[:100], lcs_sim, 0.65, reject_prc=12)
print()
evaluate_clusters(text[:100], lcs_sim, 0.65, reject_prc=12, preprocess=False)
print("-" * 50)

--------------------------------------------------
Number of lines: 100
Number of clusters: 9
Similarity accepatance rate: 0.65
Metric: LCS
Davies-Bouldin index 0.42042555070715076
Preprocessed: True

Number of lines: 100
Number of clusters: 10
Similarity accepatance rate: 0.65
Metric: LCS
Davies-Bouldin index 0.4015341515540848
Preprocessed: False
--------------------------------------------------


#### Text similarity using Levenshtein distance

In [125]:
print("-" * 50)
evaluate_clusters(text[:100], levenshtein_sim, 0.7, reject_prc=10)
print()
evaluate_clusters(text[:100], levenshtein_sim, 0.7, reject_prc=10, preprocess=False)
print("-" * 50)

--------------------------------------------------
Number of lines: 100
Number of clusters: 26
Similarity accepatance rate: 0.7
Metric: Levenshtein
Davies-Bouldin index 0.4713203212643998
Preprocessed: True

Number of lines: 100
Number of clusters: 26
Similarity accepatance rate: 0.7
Metric: Levenshtein
Davies-Bouldin index 0.4713203212643998
Preprocessed: False
--------------------------------------------------


For first 100 lines of the text Sørensen-Dice similarity has the lowest (the best) DB index, which could imply that it is the best metric, but the value of DB index highly depends on the parameter of acceptance rate, which will be different for every metric to get optimal results.

In [126]:
clusters = cluster(text[:100], levenshtein_sim, 0.7, preprocess=True, reject_prc=10).values()
for line in clusters:
    print(line)
    print("=" * 100)

['/11692589 RD TUNA CANNERS, LTD. PORTION 1004, SIAR NORTH COAST ROAD, P.O.BOX 2113, MADANG, PAPUA NEW GUINEA']
["''PA INTERIOR'' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611", '"A-LIFT",JSC 1 PROSPEKT MARSHALA ZHUKOVA,MOSCOW 123308,RUSSIA  T: +7(495)784-7961', '"ENS" LTD ADDRESS: STAROPETROVSKIYPASSAGE, BLD 7A,  CONSTRUCTION 3 125130, MOSCOW, RUSSIA TEL: (499) 130-7336']
["''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669", "''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--", "''TOPEX SP. Z O.O.'' SPOLKA KOMANDYTOWA UL. POGRANICZNA 2/4  02-285 WARSZAWA POLAND", '"KOBI-LIGHT" SPOLKA KOMANDYTOWA UL.T.BOYA ZELENSKIEGO 25 35-105 RZESZOW,POLAND NIP:813-34-99-669', '"KOBI-LIGHT" SPOLKA KOMANDYTOWA UL.T.BOYA ZELENSKIEGO 25 35-105 RZESZOW,POLAND NIP:813-34-99-699', '"SEVROLL-SYSTEM" SP.Z O.O. PLAC CZERWCA 1976 ROKU NR 1B  02-4

## Improvement ideas

1. Remove not only the most common words from the text, but also the most common words in English and punctuation marks. (stopwords from nltk library could be used)
2. Words could be stemmed by Porter Stemmer 2 to avoid cases such as "RUSSIA" and "RUSSIAN" being seperate words.
3. Change all letters to lowercase, which would improve the accuracy of metrics.