In [1]:
import csv
import pandas as pd
import Levenshtein

In [2]:
def compute_distance(string1, string2):
    """Return the edit distance between two strings.

    Thin wrapper that delegates to ``Levenshtein.distance``; it exists so the
    rest of the notebook has a single place where the metric is chosen.

    Args:
        string1: First string to compare.
        string2: Second string to compare.

    Returns:
        Whatever ``Levenshtein.distance(string1, string2)`` returns.
    """
    edit_distance = Levenshtein.distance(string1, string2)
    return edit_distance

In [3]:
# File locations used by the rest of the notebook.
input_file = "sp_csv.csv"      # CSV that is loaded with pandas below
output_file = "distances.csv"  # CSV that receives the pairwise distance matrix

In [4]:
# Load the input table, decoding the file as UTF-8.
# The preview in the next cell shows the columns "Unnamed: 0", "xml:id" and
# "count"; the cells visible in this notebook only use "xml:id" afterwards.
df = pd.read_csv(input_file, encoding='utf-8')

In [5]:
# Display the loaded table to check its columns and row count.
df

Unnamed: 0,xml:id,count
0,AdnrejGrahWhatmough,1
1,AdnrejČernigoj,1
2,AdnrejŠircelj,1
3,AdolfVidenšek,1
4,AidanCerar,1
...,...,...
2680,ŽigaVrtačič,1
2681,ŽigaZaplotnik,4
2682,Žigon,1
2683,ŽivaBrecelj,1


In [6]:
# Build the full pairwise distance matrix as a list of rows, in the same row
# order as ``df``: distances[a][b] is the distance between the a-th and b-th
# "xml:id" values.
#
# The ids are extracted from the frame once. Nesting ``df.iterrows()`` would
# rebuild a pandas Series for every row on every pass of the outer loop, which
# dominates the run time on a table with a few thousand rows.
xml_ids = df['xml:id'].tolist()
distances = [
    [compute_distance(id_a, id_b) for id_b in xml_ids]
    for id_a in xml_ids
]


In [7]:
# Wrap the nested list in a square DataFrame whose rows and columns are both
# labelled with the "xml:id" values, so cells can be looked up by name.
id_labels = df['xml:id']
distances_df = pd.DataFrame(distances, index=id_labels, columns=id_labels)

In [8]:
# Display the distance matrix; the diagonal is 0 in the preview below.
distances_df

xml:id,AdnrejGrahWhatmough,AdnrejČernigoj,AdnrejŠircelj,AdolfVidenšek,AidanCerar,AjaVrenjak,AjdaCuderman,AjdaErjavecBartolj,AlajžKovačič,AlažKovačič,...,ŽeljkoKralj,ŽeljkoVrbos,ŽenjaLeiler,ŽigaNovak,ŽigaTurk,ŽigaVrtačič,ŽigaZaplotnik,Žigon,ŽivaBrecelj,ŽižaFelice
xml:id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AdnrejGrahWhatmough,0,12,13,17,15,16,15,15,16,17,...,17,17,16,17,17,17,17,18,18,18
AdnrejČernigoj,12,0,6,11,11,11,11,12,12,12,...,12,12,11,14,13,13,13,11,12,12
AdnrejŠircelj,13,6,0,11,10,10,11,11,12,12,...,10,12,10,13,12,13,13,12,9,10
AdolfVidenšek,17,11,11,0,10,8,10,15,12,11,...,12,12,11,12,11,12,12,11,12,12
AidanCerar,15,11,10,10,0,7,6,12,10,9,...,9,10,9,7,7,9,11,8,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ŽigaVrtačič,17,13,13,12,9,8,11,15,8,7,...,10,10,10,6,7,0,8,8,7,8
ŽigaZaplotnik,17,13,13,12,11,11,12,15,11,11,...,12,12,11,7,8,8,0,8,10,9
Žigon,18,11,12,11,8,9,11,17,11,10,...,9,9,9,5,5,8,8,0,9,8
ŽivaBrecelj,18,12,9,12,8,8,10,12,12,11,...,8,10,9,8,8,7,10,9,0,7


In [9]:
# Save the distance matrix to ``output_file``.
# index=False leaves the row labels out of the file, so only the header row
# carries the "xml:id" values; rows are in the same order as the header.
# NOTE(review): confirm that omitting the row labels is intended for whoever
# consumes this file.
distances_df.to_csv(output_file, index=False)

In [10]:
# Confirmation message; it is printed unconditionally when this cell runs.
print("Output file with distances has been created.")

Output file with distances has been created.


In [11]:
# Collect every ordered pair (row label, column label) whose distance is
# greater than 0 and at most MAX_SIMILAR_DISTANCE. Both (a, b) and (b, a) are
# kept on purpose: the grouping cell below lists the neighbours of each string.
MAX_SIMILAR_DISTANCE = 2

# Evaluate the threshold on the whole matrix at once instead of calling
# ``.iloc[i, j]`` for every cell from Python.
distance_values = distances_df.to_numpy()
close_mask = (distance_values > 0) & (distance_values <= MAX_SIMILAR_DISTANCE)

# ``nonzero`` yields positions in row-major order, i.e. the same order the
# nested ``for i ... for j ...`` loops would visit them.
row_positions, col_positions = close_mask.nonzero()

# Column positions are resolved against the column labels (not the row index),
# so the result stays correct even if the two axes ever differ.
row_labels = distances_df.index
col_labels = distances_df.columns
similar_pairs = [
    (row_labels[i], col_labels[j])
    for i, j in zip(row_positions, col_positions)
]

print(f"Similar pairs, distance max {MAX_SIMILAR_DISTANCE}")
for pair in similar_pairs:
    print(pair)
            

Similar pairs, distance max 2
('AdnrejGrahWhatmough', 'AndrejGrahWhatmough')
('AdnrejČernigoj', 'AndarejČernigoj')
('AdnrejČernigoj', 'AndrejČernigoj')
('AdnrejŠircelj', 'AndejŠircelj')
('AdnrejŠircelj', 'AndnrejŠircelj')
('AdnrejŠircelj', 'AndrejŠircelj')
('AdnrejŠircelj', 'AnrejŠircelj')
('AlajžKovačič', 'AlažKovačič')
('AlajžKovačič', 'AljažKovačič')
('AlažKovačič', 'AlajžKovačič')
('AlažKovačič', 'AlešKovačič')
('AlažKovačič', 'AljašKovačič')
('AlažKovačič', 'AljažKovačiič')
('AlažKovačič', 'AljažKovačič')
('AleksandaraPivec', 'AleksandraPivec')
('AleksanderReberšek', 'AlenksanderReberšek')
('AleksanderReberšek', 'AlenskanderReberšek')
('AleksandraPivec', 'AleksandaraPivec')
('AlenPečarič', 'ElenaPečarič')
('AlenjaJeraj', 'AlenkaJeraj')
('AlenkaBrantušek', 'AlenkaBratušek')
('AlenkaBrantušek', 'AlenkaBrautšek')
('AlenkaBrantušek', 'AlenkraBratušek')
('AlenkaBratušek', 'AlenkaBrantušek')
('AlenkaBratušek', 'AlenkaBrautšek')
('AlenkaBratušek', 'AlenkraBratušek')
('AlenkaBrautšek', 'A

In [12]:
# Map each string to the list of strings that were paired with it, preserving
# the order in which the pairs were collected.
similar_groups = {}

for left, right in similar_pairs:
    # Start a new group the first time a string appears on the left-hand side.
    if left not in similar_groups:
        similar_groups[left] = []
    similar_groups[left].append(right)

# Show every group as "<string>: [<similar strings>]".
print("Groups of similar strings with a distance of max 2:")
for name, neighbours in similar_groups.items():
    print(f"{name}: {neighbours}")

Groups of similar strings with a distance of max 2:
AdnrejGrahWhatmough: ['AndrejGrahWhatmough']
AdnrejČernigoj: ['AndarejČernigoj', 'AndrejČernigoj']
AdnrejŠircelj: ['AndejŠircelj', 'AndnrejŠircelj', 'AndrejŠircelj', 'AnrejŠircelj']
AlajžKovačič: ['AlažKovačič', 'AljažKovačič']
AlažKovačič: ['AlajžKovačič', 'AlešKovačič', 'AljašKovačič', 'AljažKovačiič', 'AljažKovačič']
AleksandaraPivec: ['AleksandraPivec']
AleksanderReberšek: ['AlenksanderReberšek', 'AlenskanderReberšek']
AleksandraPivec: ['AleksandaraPivec']
AlenPečarič: ['ElenaPečarič']
AlenjaJeraj: ['AlenkaJeraj']
AlenkaBrantušek: ['AlenkaBratušek', 'AlenkaBrautšek', 'AlenkraBratušek']
AlenkaBratušek: ['AlenkaBrantušek', 'AlenkaBrautšek', 'AlenkraBratušek']
AlenkaBrautšek: ['AlenkaBrantušek', 'AlenkaBratušek']
AlenkaJeraj: ['AlenjaJeraj', 'AlenkaJerja']
AlenkaJerja: ['AlenkaJeraj']
AlenkaKlepac: ['AlenkaKlepec']
AlenkaKlepec: ['AlenkaKlepac']
AlenkraBratušek: ['AlenkaBrantušek', 'AlenkaBratušek']
AlenksanderReberšek: ['AleksanderR

In [13]:
similar_groups_file = 'similar_groups.csv'

# Flatten the similar_groups dictionary: each entry becomes
# [key, similar_1, similar_2, ...].
flattened_groups = []
for key, value in similar_groups.items():
    flattened_groups.append([key] + value)

# Write the groups to CSV with exactly the two columns named in the header.
# Writing each flattened group as its own row would produce rows of varying
# width under a two-column header, which strict CSV readers reject; the
# similar strings are therefore joined into a single field.
# NOTE(review): ';' is assumed not to occur inside the strings themselves —
# confirm, or pick another separator.
SIMILAR_STRINGS_SEPARATOR = ';'
with open(similar_groups_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Unique String', 'Similar Strings'])
    for group in flattened_groups:
        unique_string, similar_strings = group[0], group[1:]
        writer.writerow([unique_string, SIMILAR_STRINGS_SEPARATOR.join(similar_strings)])