# Ещё признаки

In [1]:
from nltk.stem import SnowballStemmer

snowball = SnowballStemmer(language="russian")

In [2]:
def LCS_string(s1, s2):
    s1 = s1.lower()
    s2 = s2.lower()
    s1 = snowball.stem(s1)
    s2 = snowball.stem(s2)
    
    d = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]
    max_val, index_i = 0, 0
    for i in range(len(s1) + 1):
        for j in range(len(s2) + 1):
            if i and j and s1[i - 1] == s2[j - 1]:
                d[i][j] = d[i - 1][j - 1] + 1
            if d[i][j] > max_val:
                max_val = d[i][j]
                index_i = i
                index_j = j
    
    return s1[index_i - max_val : index_i]

In [3]:
print(LCS_string('subsequence', 'subeuencs'))

uenc


In [4]:
def LCS(s1, s2):
    return len(LCS_string(s1, s2))

In [5]:
def levenshtein_distance(s1, s2):
    s1 = s1.lower()
    s2 = s2.lower()
    s1 = snowball.stem(s1)
    s2 = snowball.stem(s2)
    
    d = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]
    for i in range(len(s1) + 1):
        for j in range(len(s2) + 1):
            if i == 0:
                d[i][j] = j
            elif j == 0:
                d[i][j] = i
            else:
                d[i][j] = min(d[i][j - 1] + 1, d[i - 1][j] + 1)
                delta = 0 if s1[i - 1] == s2[j - 1] else 1
                d[i][j] = min(d[i][j], d[i - 1][j - 1] + delta)
                
    return d[len(s1)][len(s2)]

In [6]:
print(levenshtein_distance('Levenshtein', 'Lenvinsten'))

4


In [7]:
def levenshtein_similarity(s1, s2):
    dist = levenshtein_distance(s1, s2)
    if dist:
        return 1 / dist
    else:
        return 1

In [8]:
print(levenshtein_similarity('Levenshtein', 'Lenvinsten'))

0.25


In [9]:
def before_lcs(s1, s2):
    lcs_str = LCS_string(s1, s2)
    ind1 = s1.find(lcs_str)
    ind2 = s2.find(lcs_str)
    
    return abs(ind1 - ind2)

In [10]:
print(before_lcs('subsequence', 'subeuencs'))

2


In [11]:
def after_lcs(s1, s2):
    lcs_str = LCS_string(s1, s2)
    after1 = len(s1) - s1.find(lcs_str) - len(lcs_str)
    after2 = len(s2) - s2.find(lcs_str) - len(lcs_str)
    
    return abs(after1 - after2)

In [12]:
print(after_lcs('subsequence', 'subeuencs'))

0


In [13]:
import spacy

nlp = spacy.load('ru_core_news_lg')

In [14]:
def word_similarity(s1, s2):
    tokens = nlp(s1 + ' ' + s2)
    if tokens[0].has_vector and tokens[1].has_vector:
        return tokens[0].similarity(tokens[1])
    else:
        return None

In [15]:
print(word_similarity('мама', 'мамочка'))

0.7435692


Загрузим датасеты, объединим их, добавим признаки:

In [16]:
import pandas as pd
from sklearn.utils import shuffle

cognates_data = pd.read_csv('cognates_dataset_clean.csv', encoding='cp1251')
non_cognates_data = pd.read_csv('non-cognates_dataset_bigger.csv', encoding='cp1251')
non_cognates_data.head()

Unnamed: 0.1,Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs
0,0,А-конто,Телеэкран,9,1,2
1,1,А-конто,Коллективизм,11,1,2
2,2,А-конто,Семерик,7,1,1
3,3,А-конто,Скрючить,7,1,2
4,4,А-конто,Эозойский,8,1,2


In [17]:
cognates_data.pop('Unnamed: 0')
cognates_data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs
0,А-конто,Аканье,4,1,3
1,А-конто,Акать,4,1,3
2,А-конто,Акающий,6,1,2
3,А-конто,Ашка,5,1,2
4,А-конто,Бамовец,6,1,1


In [18]:
non_cognates_data.pop('Unnamed: 0')
non_cognates_data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs
0,А-конто,Телеэкран,9,1,2
1,А-конто,Коллективизм,11,1,2
2,А-конто,Семерик,7,1,1
3,А-конто,Скрючить,7,1,2
4,А-конто,Эозойский,8,1,2


In [19]:
cognates_data['class'] = 1
non_cognates_data['class'] = 0
non_cognates_data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs,class
0,А-конто,Телеэкран,9,1,2,0
1,А-конто,Коллективизм,11,1,2,0
2,А-конто,Семерик,7,1,1,0
3,А-конто,Скрючить,7,1,2,0
4,А-конто,Эозойский,8,1,2,0


In [20]:
data = pd.concat([cognates_data, non_cognates_data])
data = shuffle(data)
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs,class
0,Оргкомитет,Иеговистский,9,1,4,0
1,Скопировать,Гробокопатель,8,3,6,1
2,Керамзитовый,Несъёмный,9,2,4,0
3,Варьирование,Ксеноновый,10,2,2,0
4,Бывать,Диктовка,8,1,2,0


In [21]:
data = shuffle(data)
data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs,class
1590,Кровососущий,Соплеменный,10,1,2,0
955209,Затюкать,Досушить,6,2,2,0
1062544,Метелить,Затечь,5,2,3,0
607664,Калевать,Натравляться,8,2,4,0
802046,Уваляться,Валивать,7,2,4,1


Добавим признаки:

In [22]:
data['Levenshtein_stemmed'] = data.apply(lambda row: levenshtein_distance(row.first_word, row.second_word), axis=1)
data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs,class,Levenshtein_stemmed
1590,Кровососущий,Соплеменный,10,1,2,0,9
955209,Затюкать,Досушить,6,2,2,0,6
1062544,Метелить,Затечь,5,2,3,0,3
607664,Калевать,Натравляться,8,2,4,0,6
802046,Уваляться,Валивать,7,2,4,1,4


In [23]:
data['LCS_stemmed'] = data.apply(lambda row: LCS(row.first_word, row.second_word), axis=1)
data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs,class,Levenshtein_stemmed,LCS_stemmed
1590,Кровососущий,Соплеменный,10,1,2,0,9,2
955209,Затюкать,Досушить,6,2,2,0,6,0
1062544,Метелить,Затечь,5,2,3,0,3,2
607664,Калевать,Натравляться,8,2,4,0,6,1
802046,Уваляться,Валивать,7,2,4,1,4,3


In [24]:
data['Levenshtein_similarity'] = data.apply(lambda row: levenshtein_similarity(row.first_word, row.second_word), axis=1)
data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs,class,Levenshtein_stemmed,LCS_stemmed,Levenshtein_similarity
1590,Кровососущий,Соплеменный,10,1,2,0,9,2,0.111111
955209,Затюкать,Досушить,6,2,2,0,6,0,0.166667
1062544,Метелить,Затечь,5,2,3,0,3,2,0.333333
607664,Калевать,Натравляться,8,2,4,0,6,1,0.166667
802046,Уваляться,Валивать,7,2,4,1,4,3,0.25


In [25]:
data['before_lcs'] = data.apply(lambda row: before_lcs(row.first_word, row.second_word), axis=1)
data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs,class,Levenshtein_stemmed,LCS_stemmed,Levenshtein_similarity,before_lcs
1590,Кровососущий,Соплеменный,10,1,2,0,9,2,0.111111,6
955209,Затюкать,Досушить,6,2,2,0,6,0,0.166667,0
1062544,Метелить,Затечь,5,2,3,0,3,2,0.333333,0
607664,Калевать,Натравляться,8,2,4,0,6,1,0.166667,0
802046,Уваляться,Валивать,7,2,4,1,4,3,0.25,2


In [26]:
data['after_lcs'] = data.apply(lambda row: after_lcs(row.first_word, row.second_word), axis=1)
data.head()

Unnamed: 0,first_word,second_word,Levenshtein,longest_common_substr,longest_common_subs,class,Levenshtein_stemmed,LCS_stemmed,Levenshtein_similarity,before_lcs,after_lcs
1590,Кровососущий,Соплеменный,10,1,2,0,9,2,0.111111,6,5
955209,Затюкать,Досушить,6,2,2,0,6,0,0.166667,0,0
1062544,Метелить,Затечь,5,2,3,0,3,2,0.333333,0,2
607664,Калевать,Натравляться,8,2,4,0,6,1,0.166667,0,4
802046,Уваляться,Валивать,7,2,4,1,4,3,0.25,2,1


In [27]:
data.to_csv('final_balanced_cognates_dataset.csv', encoding='cp1251')

# Построение модели

In [28]:
y = data['class']
X = data[['Levenshtein_stemmed', 'LCS_stemmed', 'Levenshtein_similarity', 'longest_common_subs', 'before_lcs', 'after_lcs']]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [29]:
X_train.head()

Unnamed: 0,Levenshtein_stemmed,LCS_stemmed,Levenshtein_similarity,longest_common_subs,before_lcs,after_lcs
74861,8,2,0.125,3,2,0
527053,14,5,0.071429,7,4,9
250125,3,6,0.333333,9,0,4
969827,12,1,0.083333,3,1,7
205706,5,7,0.2,7,4,3


In [30]:
y_train.head()

74861     0
527053    1
250125    1
969827    0
205706    1
Name: class, dtype: int64

In [31]:
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score

model = LGBMClassifier()
model.fit(X_train, y_train)

LGBM_prediction = model.predict(X_test)
print(accuracy_score(LGBM_prediction, y_test))

0.9628801268336481


In [32]:
print(confusion_matrix(LGBM_prediction, y_test)) 

[[119429   4922]
 [  4256 118646]]


In [33]:
import pickle

pickle.dump(model, open('clf_light_gbm.sav', 'wb'))

In [34]:
data.shape

(1236261, 11)