In [1]:
import pandas as pd

# Load the table of cognate word pairs; the CSV is decoded as Windows-1251 (cp1251).
data = pd.read_csv('cognates_dataset_clean.csv', encoding='cp1251')
# Display (rows, columns) of the loaded frame.
data.shape

(618260, 6)

In [2]:
# Build a symmetric lookup: word -> list of every word it is paired with in `data`.
# Each row contributes both directions (first -> second and second -> first).
# Duplicates are kept on purpose, exactly as they occur in the table.
#
# The two columns are iterated with zip() instead of DataFrame.iterrows():
# iterrows() builds a full Series per row, which is very slow on a frame of
# this size, while zip() yields the same values in the same order.
cognates_dict = dict()
for curr_first, curr_second in zip(data['first_word'], data['second_word']):
    # setdefault() creates the list on first sight of a key, then appends.
    cognates_dict.setdefault(curr_first, []).append(curr_second)
    cognates_dict.setdefault(curr_second, []).append(curr_first)

In [3]:
# Spot-check one entry: every word paired with 'Гаагский' (repeated pairs are not de-duplicated).
print(cognates_dict['Гаагский'])

['Гаагец', 'Гаагец', 'Гаагцы', 'Гаагцы']


In [4]:
# Snapshot of all distinct words (the dict keys, in insertion order) as an
# indexable list, so that random words can be drawn from it by position.
arr = list(cognates_dict)
print(len(arr))

78263


In [5]:
from random import choice, randint, seed  # randint is kept importable for any cell that still uses it

# --- Sampling configuration -------------------------------------------------
SEED = 42                    # fixed seed so Restart & Run All reproduces the same sample
n_count = 10                 # number of random candidate words drawn per word
# Upper bound on the number of generated pairs. The previous guard was
# `len(non_cognates) <= 618000`, which admits 618001 pairs; the constant states
# that real limit explicitly so the resulting row count is unchanged.
MAX_NON_COGNATES = 618001

seed(SEED)
non_cognates = []

# For every word draw `n_count` random words from `arr` and keep those that are
# neither the word itself nor one of its known cognates.
for word in cognates_dict:
    if len(non_cognates) >= MAX_NON_COGNATES:
        # Cap reached: further draws could never be appended, so stop early
        # instead of looping over the remaining words for nothing.
        break

    # A set gives O(1) membership tests instead of scanning the list per draw.
    known_cognates = set(cognates_dict[word])

    for i in range(n_count):
        random_word = choice(arr)
        if (
            random_word != word
            and random_word not in known_cognates
            and len(non_cognates) < MAX_NON_COGNATES
        ):
            non_cognates.append([word, random_word])

In [6]:
def dist_Levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``s1`` into ``s2``.

    Args:
        s1: first sequence (e.g. a string).
        s2: second sequence (e.g. a string).

    Returns:
        The edit distance as a non-negative int; ``len(s2)`` when ``s1`` is
        empty and ``len(s1)`` when ``s2`` is empty.
    """
    # Row 0 of the DP table: turning an empty prefix of s1 into s2[:col]
    # takes `col` insertions.
    previous = list(range(len(s2) + 1))

    for row, ch1 in enumerate(s1, start=1):
        # Column 0: turning s1[:row] into an empty string takes `row` deletions.
        current = [row]
        for col, ch2 in enumerate(s2, start=1):
            insertion = current[col - 1] + 1
            deletion = previous[col] + 1
            # A matching pair costs nothing, a mismatch costs one substitution.
            substitution = previous[col - 1] + (0 if ch1 == ch2 else 1)
            current.append(min(insertion, deletion, substitution))
        previous = current

    return previous[-1]

In [7]:
def longest_common_substring(s1, s2):
    """Return the length of the longest contiguous run shared by ``s1`` and ``s2``.

    Args:
        s1: first sequence (e.g. a string).
        s2: second sequence (e.g. a string).

    Returns:
        Length of the longest common substring as an int; 0 when the inputs
        share no element or either one is empty.
    """
    longest = 0
    # previous_row[j] = length of the common run ending at the previous
    # element of s1 and at s2[j - 1]; index 0 is a zero sentinel.
    previous_row = [0] * (len(s2) + 1)

    for ch1 in s1:
        current_row = [0]
        for j, ch2 in enumerate(s2, start=1):
            # A match extends the diagonal run; a mismatch resets it to zero.
            run = previous_row[j - 1] + 1 if ch1 == ch2 else 0
            current_row.append(run)
            if run > longest:
                longest = run
        previous_row = current_row

    return longest

In [8]:
def longest_common_subsequence(s1, s2):
    """Return the length of the longest common subsequence of ``s1`` and ``s2``.

    Unlike a substring, a subsequence does not have to be contiguous: its
    elements only need to appear in the same relative order in both inputs.

    Args:
        s1: first sequence (e.g. a string).
        s2: second sequence (e.g. a string).

    Returns:
        Length of the longest common subsequence as an int; 0 when either
        input is empty.
    """
    rows, cols = len(s1), len(s2)
    # table[i][j] = LCS length of s1[:i] and s2[:j]; row 0 and column 0 stay 0.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]

    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if s1[i - 1] == s2[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])

    # Entries never decrease along a row or a column, so the last cell holds
    # the largest value in the whole table.
    return table[rows][cols]

In [9]:
# Peek at the first rows of the cognate table to see its columns and feature values.
data.head()

Unnamed: 0.1,Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs
0,0,А-конто,Аканье,4,1,3
1,1,А-конто,Акать,4,1,3
2,2,А-конто,Акающий,6,1,2
3,3,А-конто,Ашка,5,1,2
4,4,А-конто,Бамовец,6,1,1


In [10]:
import numpy as np

# Turn the sampled [word, random_word] pairs into a two-column frame.
# The pairs go through a NumPy array first, as before, so the values reaching
# the DataFrame are exactly the ones NumPy produces for this list.
columns = ['first_word', 'second_word']
array = np.array(non_cognates)

new_data = pd.DataFrame(data=array, columns=columns)
new_data.head()

Unnamed: 0,first_word,second_word
0,А-конто,Телеэкран
1,А-конто,Коллективизм
2,А-конто,Семерик
3,А-конто,Скрючить
4,А-конто,Эозойский


In [11]:
# Edit distance for every (first_word, second_word) pair.
# Iterating the two columns with zip() yields the same values as a row-wise
# DataFrame.apply(axis=1) but avoids building a Series object for each row.
new_data['Levenshtein'] = [
    dist_Levenshtein(first, second)
    for first, second in zip(new_data['first_word'], new_data['second_word'])
]
new_data.head()

Unnamed: 0,first_word,second_word,Levenshtein
0,А-конто,Телеэкран,9
1,А-конто,Коллективизм,11
2,А-конто,Семерик,7
3,А-конто,Скрючить,7
4,А-конто,Эозойский,8


In [12]:
# Longest common substring length for every (first_word, second_word) pair.
# Iterating the two columns with zip() yields the same values as a row-wise
# DataFrame.apply(axis=1) but avoids building a Series object for each row.
new_data['longest_common_substr'] = [
    longest_common_substring(first, second)
    for first, second in zip(new_data['first_word'], new_data['second_word'])
]
new_data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr
0,А-конто,Телеэкран,9,1
1,А-конто,Коллективизм,11,1
2,А-конто,Семерик,7,1
3,А-конто,Скрючить,7,1
4,А-конто,Эозойский,8,1


In [13]:
# Longest common subsequence length for every (first_word, second_word) pair.
# Iterating the two columns with zip() yields the same values as a row-wise
# DataFrame.apply(axis=1) but avoids building a Series object for each row.
new_data['longest_common_subs'] = [
    longest_common_subsequence(first, second)
    for first, second in zip(new_data['first_word'], new_data['second_word'])
]
new_data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs
0,А-конто,Телеэкран,9,1,2
1,А-конто,Коллективизм,11,1,2
2,А-конто,Семерик,7,1,1
3,А-конто,Скрючить,7,1,2
4,А-конто,Эозойский,8,1,2


In [14]:
# Remove the leftover 'Unnamed: 0' column from the cognate table.
# drop(..., errors='ignore') keeps the cell idempotent: the former in-place
# data.pop('Unnamed: 0') raised KeyError when the cell was executed a second
# time, because the column was already gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')
data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs
0,А-конто,Аканье,4,1,3
1,А-конто,Акать,4,1,3
2,А-конто,Акающий,6,1,2
3,А-конто,Ашка,5,1,2
4,А-конто,Бамовец,6,1,1


In [15]:
# Persist the non-cognate pairs with their three similarity features, encoded as cp1251
# like the input file. NOTE(review): `index` is not passed, so pandas' default applies and
# the row index is presumably written as an extra unnamed first column — confirm that
# downstream readers expect it.
new_data.to_csv('non-cognates_dataset_bigger.csv', encoding='cp1251')

In [16]:
# Final size check: (number of sampled pairs, 5 columns = two words + three similarity features).
new_data.shape

(618001, 5)