In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('./Source/train.csv', index_col='pair_id')
df.head()

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
4,Powermax Rubber Factory,Co. One,0
5,Tress A/S,Longyou Industries Park Zhejiang,0


In [3]:
df.is_duplicate.value_counts(normalize=True)

0    0.992652
1    0.007348
Name: is_duplicate, dtype: float64

#### Проверим сколько названий компаний в столбцах "name_1" и "name_2" содержат кириллические символы

In [4]:
a = ord('а') # кирилическая "А"
rus_alphabet = ''.join([chr(i) for i in range(a,a+6)] + [chr(a+33)] + [chr(i) for i in range(a+6,a+32)])

check = (df.name_1 + df.name_2).apply(lambda x: bool(set(rus_alphabet).intersection(set(x.lower()))))
check.value_counts(normalize=True)

False    0.99507
True     0.00493
dtype: float64

#### распределение целевой переменной при сравнении строк с кириллическими символами:

In [5]:
df[check].is_duplicate.value_counts(normalize=True)

0    0.939283
1    0.060717
Name: is_duplicate, dtype: float64

#### Создадим тестовый датафрейм

In [6]:
np.random.seed(42)
test_index = np.concatenate([
    np.random.choice(df[(~check) & (df.is_duplicate == 0)].index, size=5, replace=False),
    np.random.choice(df[(~check) & (df.is_duplicate == 1)].index, size=5, replace=False),
    np.random.choice(df[(check) & (df.is_duplicate == 0)].index, size=5, replace=False),
    np.random.choice(df[(check) & (df.is_duplicate == 1)].index, size=5, replace=False),
])
test_df = df.loc[test_index]
test_df

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
218087,W.A. International,Rhino Sport & Lesiure,0
493634,Samurai International,Mrf Ltd.,0
250271,Afriseek Trading Co.,Americas Trading Inc.,0
275108,Us Trade Logistics,Stp Ltd.,0
342036,Colas Polska Sp. z o.o.,VARO ENERGY,0
18571,Beijing Oriental Yuhong Waterproof Technology ...,Oriental Yuhong,1
421535,Sika Corporation,Sika (China) Ltd.,1
68133,SOPREMA CASTELLBISBAL,soprema Soprema sas,1
255790,Bridgestone Tire Co.,Pt Bridgestone Tire Indonesia,1
326044,Bridgestone De Costa Rica Sociedad Anoni,Bridgestone Firestone Do Brasil,1


#### Предобработка

In [7]:
import re
import unicodedata
from cleanco import basename
from transliterate.decorators import transliterate_function

def preprocessing(x: str) -> str:
    
    def rus_preprocess(x: str) -> str:
        forms_of_ownership = [
            'ооо',
            'оао',
            'общество с ограниченной ответственностью',
            'открытое акционерное общество',
            'филиал компании'
        ]
        
        for form in forms_of_ownership:
            pattern = re.compile(form)
            x = pattern.sub('', x)
            
        return x
    
    @transliterate_function(language_code='ru', reversed=True)
    def translit(x: str) -> str:
        return x
    
    
    x = x.strip().lower()
    x = basename(x) if not bool(set(rus_alphabet).intersection(set(x.lower()))) else rus_preprocess(x)
    x = unicodedata.normalize('NFKD', x).encode('ASCII', 'ignore').decode() if not bool(set(rus_alphabet).intersection(set(x.lower()))) else translit(x)
    x = basename(x)
    x = re.sub(r'[^\w\s]',' ', x)
    
    return ' '.join([s for s in x.split() if len(s) > 1])
    

test_df.name_1 = test_df.name_1.apply(lambda x: preprocessing(x))
test_df.name_2 = test_df.name_2.apply(lambda x: preprocessing(x))
test_df

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
218087,international,rhino sport lesiure,0
493634,samurai international,mrf,0
250271,afriseek trading,americas trading,0
275108,us trade logistics,stp,0
342036,colas polska,varo energy,0
18571,beijing oriental yuhong waterproof technology ...,oriental yuhong,1
421535,sika,sika china,1
68133,soprema castellbisbal,soprema soprema,1
255790,bridgestone tire,pt bridgestone tire indonesia,1
326044,bridgestone de costa rica sociedad anoni,bridgestone firestone do brasil,1


#### Генерация доп. фичей

In [47]:
import difflib # встроенная библиотека

import abydos.distance as abd
# from thefuzz import fuzz as fuzz_ # даёт результат идентичен библиотеке fuzzywuzzy
from fuzzywuzzy import fuzz
from strsimpy.cosine import Cosine
import Levenshtein, distance, jellyfish, textdistance


import warnings

warnings.filterwarnings("ignore")


def tdc_n(seq1, seq2, n):
    cosine = Cosine(n)
    p1 = cosine.get_profile(seq1)
    p2 = cosine.get_profile(seq2)
    return cosine.similarity_profiles(p1, p2)


feature_df = test_df.copy()

# Levenshtein
feature_df.insert(2, 'l', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: Levenshtein.ratio(*x.split('|'))))

# Discounted_Levenshtein
feature_df.insert(3, 'dl', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: abd.DiscountedLevenshtein().sim(*x.split('|'))))

# String Subsequence Kernel Similarity
feature_df.insert(4, 'ssk', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: abd.SSK().sim(*x.split('|'))))

# Token Sort Ratio
feature_df.insert(5, 'ts', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: fuzz.token_sort_ratio(*x.split('|')) / 100))

# Token Set Ratio
feature_df.insert(6, 'tsr', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: fuzz.token_set_ratio(*x.split('|')) / 100))

# Partial Ratio
feature_df.insert(7, 'pr', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: fuzz.partial_ratio(*x.split('|')) / 100))

# Weighted Ratio
feature_df.insert(8, 'wr', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: fuzz.WRatio(*x.split('|')) / 100))

# Sequence Matcher Ratio
feature_df.insert(9, 'smr', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: difflib.SequenceMatcher(None, *x.split('|')).ratio()))

# Sorensen Distance
feature_df.insert(10, 'sd', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: 1 - distance.sorensen(*x.split('|'))))

# Jaccard Distance
feature_df.insert(11, 'jd', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: 1 - distance.jaccard(*x.split('|'))))

# Jaro Similarity
feature_df.insert(12, 'js', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: jellyfish.jaro_similarity(*x.split('|'))))

# Jaro-Winkler Similarity
feature_df.insert(13, 'jws', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: jellyfish.jaro_winkler_similarity(*x.split('|'))))

# Match Rating Approach
feature_df.insert(14, 'mra', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: int(False if jellyfish.match_rating_comparison(*x.split('|')) is None else jellyfish.match_rating_comparison(*x.split('|')))))

# Text Distance Cosine
feature_df.insert(15, 'tdc', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: 1-textdistance.Cosine(qval=2).distance(*x.split('|'))))

# Text Distance Cosine n=1
feature_df.insert(16, 'tdc_1', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: tdc_n(*x.split('|'), 1)))

# Text Distance Cosine n=2
feature_df.insert(17, 'tdc_2', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: tdc_n(*x.split('|'), 2)))

# Text Distance Cosine n=3
feature_df.insert(18, 'tdc_3', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: tdc_n(*x.split('|'), 3)))

# Bag
feature_df.insert(19, 'bag', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: abd.Bag().sim(*x.split('|'))))

# Monge-Elkan similarity
feature_df.insert(20, 'mes', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: abd.MongeElkan(symmetric=False).sim(*x.split('|'))))

# Monge-Elkan symmetric similarity
feature_df.insert(21, 'mess', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: abd.MongeElkan(symmetric=True).sim(*x.split('|'))))

# Rouge-W similarity
feature_df.insert(22, 'rws', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: abd.RougeW().sim(*x.split('|'))))

# Positional Q-Gram Dice coefficient
feature_df.insert(23, 'pqgdc', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: abd.PositionalQGramDice().sim(*x.split('|'))))

# Positional Q-Gram Jaccard coefficient
feature_df.insert(24, 'pqgjc', (test_df.name_1 + '|' + test_df.name_2).apply(lambda x: abd.PositionalQGramJaccard().sim(*x.split('|'))))


feature_df.insert(25, 'mean', feature_df[feature_df.columns[2:-2]].mean(axis=1))

In [48]:
feature_df.iloc[:, 2:]

Unnamed: 0_level_0,l,dl,ssk,ts,tsr,pr,wr,smr,sd,jd,...,tdc_2,tdc_3,bag,mes,mess,rws,pqgdc,pqgjc,mean,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
218087,0.3125,0.206238,0.239913,0.31,0.31,0.38,0.31,0.3125,0.7,0.538462,...,0.062994,0.0,0.421053,0.535714,0.505357,0.264591,0.0,0.0,0.319891,0
493634,0.166667,0.141625,0.050768,0.17,0.17,0.67,0.6,0.166667,0.266667,0.153846,...,0.0,0.0,0.095238,0.181818,0.340909,0.068243,0.0,0.0,0.212022,0
250271,0.75,0.575018,0.677916,0.75,0.75,0.75,0.75,0.75,0.833333,0.714286,...,0.533333,0.428571,0.8125,0.794118,0.794118,0.522913,0.588235,0.416667,0.716182,0
275108,0.190476,0.163754,0.127054,0.19,0.19,0.67,0.6,0.190476,0.25,0.142857,...,0.171499,0.0,0.111111,0.289474,0.457237,0.148895,0.0,0.0,0.262571,0
342036,0.173913,0.105605,0.039308,0.17,0.17,0.19,0.17,0.173913,0.352941,0.214286,...,0.0,0.0,0.25,0.423077,0.357372,0.118002,0.0,0.0,0.181365,0
18571,0.379747,0.276001,0.577591,0.38,1.0,1.0,0.9,0.379747,0.666667,0.5,...,0.482617,0.450694,0.234375,0.561538,0.749519,0.245456,0.0,0.0,0.512985,1
421535,0.571429,0.506313,0.57908,0.57,1.0,1.0,0.9,0.571429,0.666667,0.5,...,0.57735,0.5,0.4,1.0,0.818182,0.772737,0.5,0.333333,0.658742,1
68133,0.611111,0.58516,0.480096,0.61,1.0,0.67,0.95,0.5,0.761905,0.615385,...,0.570088,0.526201,0.52381,0.613636,0.775568,0.403017,0.421053,0.266667,0.649938,1
255790,0.711111,0.573719,0.819281,0.71,1.0,1.0,0.9,0.711111,0.916667,0.846154,...,0.81763,0.744438,0.551724,0.941176,0.878922,0.927981,0.0,0.0,0.734933,1
326044,0.591549,0.539051,0.72108,0.59,0.59,0.65,0.59,0.478873,0.888889,0.8,...,0.55995,0.415619,0.675,0.707317,0.759909,0.312329,0.356164,0.216667,0.644075,1


In [49]:
feature_df.iloc[:, 2:].corr()

Unnamed: 0,l,dl,ssk,ts,tsr,pr,wr,smr,sd,jd,...,tdc_2,tdc_3,bag,mes,mess,rws,pqgdc,pqgjc,mean,is_duplicate
l,1.0,0.977139,0.933205,0.995841,0.806232,0.66259,0.716321,0.979384,0.835538,0.86393,...,0.930182,0.92367,0.871097,0.890509,0.908053,0.851741,0.859946,0.857111,0.976413,0.724399
dl,0.977139,1.0,0.901221,0.973315,0.779123,0.653133,0.723466,0.94513,0.799762,0.826391,...,0.915804,0.904072,0.84169,0.856795,0.898064,0.815351,0.878766,0.875228,0.962735,0.749344
ssk,0.933205,0.901221,1.0,0.933525,0.841248,0.768848,0.758303,0.911397,0.904147,0.920042,...,0.95042,0.916431,0.790598,0.900934,0.964708,0.787744,0.742722,0.730997,0.970078,0.801551
ts,0.995841,0.973315,0.933525,1.0,0.80382,0.653149,0.707699,0.975998,0.850679,0.876781,...,0.928071,0.919613,0.878408,0.889184,0.910304,0.846158,0.857482,0.85606,0.975425,0.717384
tsr,0.806232,0.779123,0.841248,0.80382,1.0,0.797248,0.865867,0.785241,0.732143,0.732544,...,0.88504,0.90797,0.551954,0.831765,0.90877,0.795598,0.600844,0.587985,0.867207,0.802968
pr,0.66259,0.653133,0.768848,0.653149,0.797248,1.0,0.950116,0.679506,0.525166,0.540894,...,0.839598,0.808209,0.309892,0.66198,0.820782,0.673635,0.544601,0.536468,0.776678,0.765648
wr,0.716321,0.723466,0.758303,0.707699,0.865867,0.950116,1.0,0.704555,0.549691,0.56476,...,0.866231,0.85168,0.370426,0.649349,0.846386,0.678347,0.613853,0.598607,0.815662,0.787349
smr,0.979384,0.94513,0.911397,0.975998,0.785241,0.679506,0.704555,1.0,0.786922,0.814372,...,0.91884,0.91484,0.863452,0.897366,0.883561,0.889294,0.843135,0.855206,0.954547,0.654565
sd,0.835538,0.799762,0.904147,0.850679,0.732143,0.525166,0.549691,0.786922,1.0,0.991576,...,0.788564,0.753457,0.824717,0.834922,0.879019,0.653667,0.618144,0.596934,0.858875,0.663315
jd,0.86393,0.826391,0.920042,0.876781,0.732544,0.540894,0.56476,0.814372,0.991576,1.0,...,0.818155,0.788484,0.840768,0.839091,0.883167,0.693943,0.6249,0.608876,0.877538,0.668911
