In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training CSV, using its `pair_id` column as the row index,
# and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='./Source/train.csv',
    index_col='pair_id',
)
df.head()

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
4,Powermax Rubber Factory,Co. One,0
5,Tress A/S,Longyou Industries Park Zhejiang,0


In [3]:
# Relative frequency of each `is_duplicate` value over all rows.
df['is_duplicate'].value_counts(normalize=True)

0    0.992652
1    0.007348
Name: is_duplicate, dtype: float64

#### Проверим, сколько названий компаний в столбцах "name_1" и "name_2" содержат кириллические символы

In [4]:
# Lowercase Russian alphabet as one string: 'а'..'е', then 'ё' (code point
# a + 33, outside the contiguous run), then 'ж'..'я'.
a = ord('а')  # lowercase Cyrillic "а"
rus_alphabet = ''.join([chr(i) for i in range(a, a + 6)] + [chr(a + 33)] + [chr(i) for i in range(a + 6, a + 32)])

# Build the lookup set once instead of once per row.
rus_letters = frozenset(rus_alphabet)

# True for every row whose concatenated name_1 + name_2 contains at least one
# letter of `rus_alphabet` (case-insensitive).
check = (df.name_1 + df.name_2).apply(lambda x: not rus_letters.isdisjoint(x.lower()))
check.value_counts(normalize=True)

False    0.99507
True     0.00493
dtype: float64

#### Распределение целевой переменной при сравнении строк с кириллическими символами:

In [5]:
# Relative frequency of each `is_duplicate` value among the rows flagged by `check`.
df.loc[check, 'is_duplicate'].value_counts(normalize=True)

0    0.939283
1    0.060717
Name: is_duplicate, dtype: float64

#### Создадим тестовый датафрейм

In [6]:
# Disabled cell: with a fixed seed it would draw, without replacement, 5 random
# row labels for each of the four combinations of `check` (True/False) and
# `is_duplicate` (0/1), and build a 20-row `test_df` from them.
# np.random.seed(42)
# test_index = np.concatenate([
#     np.random.choice(df[(~check) & (df.is_duplicate == 0)].index, size=5, replace=False),
#     np.random.choice(df[(~check) & (df.is_duplicate == 1)].index, size=5, replace=False),
#     np.random.choice(df[(check) & (df.is_duplicate == 0)].index, size=5, replace=False),
#     np.random.choice(df[(check) & (df.is_duplicate == 1)].index, size=5, replace=False),
# ])
# test_df = df.loc[test_index]
# test_df

#### Предобработка

In [7]:
import re
import unicodedata
from cleanco import basename
from transliterate.decorators import transliterate_function

# Russian ownership-form markers removed from names that contain Cyrillic text.
# Compiled once, and anchored on word boundaries so that a marker such as
# "ооо" is only removed when it stands alone, never from inside a longer word.
_RUS_OWNERSHIP_PATTERNS = [
    re.compile(r'\b' + re.escape(form) + r'\b')
    for form in (
        'ооо',
        'оао',
        'общество с ограниченной ответственностью',
        'открытое акционерное общество',
        'филиал компании',
    )
]

# Placeholder returned when nothing is left of a name after cleaning.
_EMPTY_NAME = '-' * 5


def _has_cyrillic(text: str) -> bool:
    """Return True if `text` contains at least one letter of `rus_alphabet`."""
    return not set(rus_alphabet).isdisjoint(text.lower())


def _strip_rus_ownership(text: str) -> str:
    """Remove every stand-alone Russian ownership-form marker from `text`."""
    for pattern in _RUS_OWNERSHIP_PATTERNS:
        text = pattern.sub('', text)
    return text


def preprocessing(x: str) -> str:
    """Normalise a company name into lowercase, space-separated tokens.

    Steps:
      1. Trim and lowercase the input.
      2. Names with Russian letters: drop the Russian ownership-form markers.
         Other names: pass through cleanco's `basename`.
      3. If Russian letters are still present, transliterate via the
         `transliterate` package (language 'ru', reversed=True); otherwise
         NFKD-decompose and drop every non-ASCII character.
      4. Run `basename` once more on the result.
      5. Replace every character that is neither a word character nor
         whitespace with a space and collapse runs of whitespace.

    Args:
        x: Raw company name. Non-string values (e.g. NaN for a missing CSV
           cell) are treated like an empty name.

    Returns:
        The cleaned name, or '-----' when no token survives the cleaning.
    """
    # Missing values would otherwise crash on `.strip()`; treat them as empty.
    if not isinstance(x, str):
        return _EMPTY_NAME

    @transliterate_function(language_code='ru', reversed=True)
    def translit(x: str) -> str:
        return x

    x = x.strip().lower()
    x = _strip_rus_ownership(x) if _has_cyrillic(x) else basename(x)

    # Re-check: stripping the ownership markers may have removed the only
    # Russian letters the name contained.
    if _has_cyrillic(x):
        x = translit(x)
    else:
        x = unicodedata.normalize('NFKD', x).encode('ASCII', 'ignore').decode()

    x = basename(x)
    x = re.sub(r'[^\w\s]', ' ', x)

    tokens = x.split()
    return ' '.join(tokens) if tokens else _EMPTY_NAME
    
# Build a new frame whose two name columns hold the cleaned names;
# the raw `df` is left untouched.
test_df = df.assign(
    name_1=df.name_1.apply(preprocessing),
    name_2=df.name_2.apply(preprocessing),
)
test_df

Unnamed: 0_level_0,name_1,name_2,is_duplicate
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,iko industries,enormous industrial trade,0
2,apcotex industries,technocraft industries india,0
3,rishichem distributors,dsa,0
4,powermax rubber factory,co one,0
5,tress,longyou industries park zhejiang,0
...,...,...,...
497815,bit mat products,the goodyear tire and rubber,0
497816,bnd trading,zhong shan yue liang economy trade imp exp,0
497817,xeikon industrial co ltd of dongguan city,yi cheng trading co ltd of dongguan city,0
497818,shanghai kechuan trading,shanghai m g stationery,0


#### Генерация доп. фичей

In [None]:
import difflib # встроенная библиотека

import abydos.distance as abd
# from thefuzz import fuzz as fuzz_ # даёт результат идентичный библиотеке fuzzywuzzy
from fuzzywuzzy import fuzz
from strsimpy.cosine import Cosine
import Levenshtein, distance, jellyfish, textdistance

import warnings

# Install a catch-all "ignore" filter: no warning of any category is shown
# from this point on.
warnings.filterwarnings("ignore")


def tdc_n(seq1, seq2, n):
    """Cosine similarity of two strings via strsimpy's `Cosine(n)`.

    A profile is built for each sequence with `get_profile`, and the
    similarity of the two profiles is returned.
    """
    scorer = Cosine(n)
    profiles = [scorer.get_profile(seq) for seq in (seq1, seq2)]
    return scorer.similarity_profiles(*profiles)


# Similarity features are collected on a separate deep copy of the cleaned frame.
feature_df = test_df.copy(deep=True)

In [10]:
# Pair up the cleaned names once, instead of joining them with '|' and
# splitting them again for every feature and every row.
name_pairs = list(zip(test_df.name_1, test_df.name_2))

# Scorer objects are created once rather than once per row.
discounted_levenshtein = abd.DiscountedLevenshtein()
subsequence_kernel = abd.SSK()

# Column name -> function of (name_1, name_2), in insertion order.
features_part_1 = {
    # Levenshtein
    'l': Levenshtein.ratio,
    # Discounted_Levenshtein
    'dl': discounted_levenshtein.sim,
    # String Subsequence Kernel Similarity
    'ssk': subsequence_kernel.sim,
    # Token Sort Ratio
    'ts': lambda a, b: fuzz.token_sort_ratio(a, b) / 100,
    # Token Set Ratio
    'tsr': lambda a, b: fuzz.token_set_ratio(a, b) / 100,
    # Partial Ratio
    'pr': lambda a, b: fuzz.partial_ratio(a, b) / 100,
    # Weighted Ratio
    'wr': lambda a, b: fuzz.WRatio(a, b) / 100,
    # Sequence Matcher Ratio
    'smr': lambda a, b: difflib.SequenceMatcher(None, a, b).ratio(),
}

# Drop leftovers from a previous execution so the cell can be re-run
# (`insert` raises if the column already exists).
feature_df = feature_df.drop(columns=list(features_part_1), errors='ignore')

# Columns go to positions 2..9, right after name_1 / name_2.
for position, (column, score) in enumerate(features_part_1.items(), start=2):
    feature_df.insert(position, column, [score(a, b) for a, b in name_pairs])

In [11]:
# Checkpoint: persist the features computed so far.
feature_df.to_csv(path_or_buf='./Source/feature_1.csv')

In [12]:
# Pair up the cleaned names once, instead of joining them with '|' and
# splitting them again for every feature and every row.
name_pairs = list(zip(test_df.name_1, test_df.name_2))

# Scorer object created once rather than once per row.
bigram_cosine = textdistance.Cosine(qval=2)


def _match_rating(a, b):
    """Match Rating Approach verdict as 0/1; a `None` verdict counts as 0."""
    # Evaluate the comparison once (it was previously computed twice per row).
    verdict = jellyfish.match_rating_comparison(a, b)
    return 0 if verdict is None else int(verdict)


# Column name -> function of (name_1, name_2), in insertion order.
features_part_2 = {
    # Sorensen Distance (turned into a similarity)
    'sd': lambda a, b: 1 - distance.sorensen(a, b),
    # Jaccard Distance (turned into a similarity)
    'jd': lambda a, b: 1 - distance.jaccard(a, b),
    # Jaro Similarity
    'js': jellyfish.jaro_similarity,
    # Jaro-Winkler Similarity
    'jws': jellyfish.jaro_winkler_similarity,
    # Match Rating Approach
    'mra': _match_rating,
    # Text Distance Cosine
    'tdc': lambda a, b: 1 - bigram_cosine.distance(a, b),
    # Text Distance Cosine n=1
    'tdc_1': lambda a, b: tdc_n(a, b, 1),
    # Text Distance Cosine n=2
    'tdc_2': lambda a, b: tdc_n(a, b, 2),
}

# Drop leftovers from a previous execution so the cell can be re-run
# (`insert` raises if the column already exists).
feature_df = feature_df.drop(columns=list(features_part_2), errors='ignore')

# Columns go to positions 10..17, after the first feature group.
for position, (column, score) in enumerate(features_part_2.items(), start=10):
    feature_df.insert(position, column, [score(a, b) for a, b in name_pairs])

In [13]:
# Checkpoint: persist the features computed so far.
feature_df.to_csv(path_or_buf='./Source/feature_2.csv')

In [15]:
# Pair up the cleaned names once, instead of joining them with '|' and
# splitting them again for every feature and every row.
name_pairs = list(zip(test_df.name_1, test_df.name_2))

# Column name -> function of (name_1, name_2), in insertion order.
# Each abydos scorer is instantiated once here rather than once per row.
features_part_3 = {
    # Bag
    'bag': abd.Bag().sim,
    # Monge-Elkan similarity
    'mes': abd.MongeElkan(symmetric=False).sim,
    # Monge-Elkan symmetric similarity
    'mess': abd.MongeElkan(symmetric=True).sim,
    # Rouge-W similarity
    'rws': abd.RougeW().sim,
    # Positional Q-Gram Dice coefficient
    'pqgdc': abd.PositionalQGramDice().sim,
    # Positional Q-Gram Jaccard coefficient
    'pqgjc': abd.PositionalQGramJaccard().sim,
}

# Drop leftovers from a previous execution so the cell can be re-run
# (`insert` raises if the column already exists).
feature_df = feature_df.drop(columns=list(features_part_3), errors='ignore')

# Columns go to positions 18..23, after the second feature group.
for position, (column, score) in enumerate(features_part_3.items(), start=18):
    feature_df.insert(position, column, [score(a, b) for a, b in name_pairs])

In [16]:
# Checkpoint after the third feature group. Written to its own file so that
# the `feature_2.csv` checkpoint saved earlier is not overwritten.
feature_df.to_csv('./Source/feature_3.csv')

In [17]:
# Row-wise mean over every column from position 2 up to (but excluding) the
# last one, inserted at position 24.
feature_df.insert(
    loc=24,
    column='mean',
    value=feature_df.iloc[:, 2:-1].mean(axis=1),
)
# feature_df.insert(26, 'weighted_mean', feature_df[feature_df.columns[2:-2]].apply(lambda row: weighted_mean(row, weights)))

In [18]:
# Pairwise correlation of every column from position 2 onwards
# (i.e. everything after the two name columns).
feature_df[feature_df.columns[2:]].corr()

Unnamed: 0,l,dl,ssk,ts,tsr,pr,wr,smr,sd,jd,...,tdc_1,tdc_2,bag,mes,mess,rws,pqgdc,pqgjc,mean,is_duplicate
l,1.0,0.930228,0.894573,0.935335,0.88177,0.477329,0.624772,0.972621,0.824291,0.827531,...,0.808234,0.875484,0.874712,0.736587,0.905618,0.572279,0.415123,0.415167,0.955858,0.104937
dl,0.930228,1.0,0.822208,0.857937,0.807564,0.501617,0.601424,0.917259,0.708184,0.723301,...,0.713717,0.838443,0.769119,0.68052,0.836104,0.574796,0.542356,0.543181,0.914643,0.184686
ssk,0.894573,0.822208,1.0,0.863069,0.827824,0.44807,0.600033,0.859381,0.860262,0.867558,...,0.930352,0.866835,0.828948,0.753359,0.926523,0.479808,0.345636,0.343749,0.924015,0.096755
ts,0.935335,0.857937,0.863069,1.0,0.898533,0.424004,0.599309,0.911442,0.818799,0.820476,...,0.79924,0.849986,0.858845,0.718849,0.88381,0.542739,0.39322,0.394799,0.921872,0.107682
tsr,0.88177,0.807564,0.827824,0.898533,1.0,0.508408,0.745511,0.873158,0.758566,0.756003,...,0.753891,0.870161,0.737737,0.71956,0.883945,0.584962,0.344431,0.34478,0.90939,0.140547
pr,0.477329,0.501617,0.44807,0.424004,0.508408,1.0,0.808059,0.529649,0.240181,0.253259,...,0.375142,0.602777,0.135283,0.429395,0.527627,0.572233,0.224022,0.228989,0.563779,0.104987
wr,0.624772,0.601424,0.600033,0.599309,0.745511,0.808059,1.0,0.657695,0.455041,0.450822,...,0.541434,0.722423,0.319022,0.563514,0.692139,0.57217,0.236923,0.236848,0.70955,0.105384
smr,0.972621,0.917259,0.859381,0.911442,0.873158,0.529649,0.657695,1.0,0.768471,0.775855,...,0.758031,0.889361,0.81646,0.73362,0.889648,0.619276,0.417325,0.417384,0.943542,0.105378
sd,0.824291,0.708184,0.860262,0.818799,0.758566,0.240181,0.455041,0.768471,1.0,0.989923,...,0.871347,0.721154,0.867349,0.707122,0.87086,0.353611,0.278273,0.271447,0.840326,0.07193
jd,0.827531,0.723301,0.867558,0.820476,0.756003,0.253259,0.450822,0.775855,0.989923,1.0,...,0.853502,0.735957,0.870859,0.703775,0.866338,0.368603,0.297912,0.294024,0.847228,0.083018


In [19]:
# Persist the complete feature table.
feature_df.to_csv(path_or_buf='./Source/feature_all.csv')