<a href="https://colab.research.google.com/github/Falconwatch/cybersec_ht/blob/main/Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q fuzzywuzzy
!pip install -q python-Levenshtein

import lightgbm as lgbm
import joblib
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import fuzz
import rapidfuzz

In [4]:
# Read the three Excel workbooks (labelled examples plus the two supplier
# hotel lists) in one pass; the order of the names on the left matches the
# order of the file names on the right.
examples, cust_1, cust_2 = (
    pd.read_excel(workbook)
    for workbook in ("examples.xlsx", "cust_1.xlsx", "cust_2.xlsx")
)

In [39]:
class HotelComparator():
  """
  Matches hotel records coming from two suppliers.

  Candidate pairs are built from hotels that share a country code, each pair
  is described by per-field similarity features, and a pre-trained classifier
  (loaded with joblib) decides which pairs are the same hotel.
  """

  def __init__(self, path, threshold=0.0002):
    """
    param path: path to the saved model file
    param threshold: a pair is reported as a match when the predicted
                     probability of the positive class is strictly greater
                     than this value
    """
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load model files from a trusted source.
    self.__clf = joblib.load(path)
    self.__feature_fields = ['hotel_name', 'city_name', 'hotel_address', 'star_rating', 'postal_code']
    self.__th = threshold

  def similarity_ratio_func(self, df, field):
    """
    Adds the column "<field>_similarity" to df (in place), computed row by
    row from the columns "c1.<field>" and "c2.<field>".

    param df: pandas dataframe holding both "c1.<field>" and "c2.<field>"
    param field: base field name; names containing "postal_code" or
                 "star_rating" get dedicated comparisons, anything else is
                 compared as free text

    The value is None/NaN when either side is missing.
    """
    def string_similarity(s1, s2):
      # WRatio is on a 0..100 scale; rescale to 0..1.
      return rapidfuzz.fuzz.WRatio(s1, s2)/100

    def postalcode_similarity(s1, s2):
      longest_string = max(len(s1), len(s2))
      if longest_string == 0:
        # Two empty codes carry no information (and would divide by zero
        # below), so treat the pair like a missing value.
        return None
      # Right-pad the shorter code with "0" so both have the same length.
      s1 = s1.ljust(longest_string, "0")
      s2 = s2.ljust(longest_string, "0")
      # Reversed so that index i carries weight 10**i: a match on the first
      # character weighs the most, a match on the last one the least.
      t = [a == b for a, b in zip(s1, s2)][::-1]
      max_possible_similarity = sum(10**i for i in range(longest_string))
      current_similarity = sum(hit*10**i for i, hit in enumerate(t))
      difference = max_possible_similarity - current_similarity
      difference_ratio = difference/max_possible_similarity
      return 1-difference_ratio

    def rating_star_diff(s1, s2):
      # Absolute difference of the two ratings; None when either value
      # cannot be parsed as a number.
      try:
        return abs(float(s1) - float(s2))
      except (TypeError, ValueError):
        return None

    def universal_wrapper(x1, x2, func):
      # Missing values short-circuit to None; everything else is compared
      # as an upper-cased string.
      if pd.isna(x1) or pd.isna(x2):
        return None
      x1 = str(x1).upper()
      x2 = str(x2).upper()
      return func(x1, x2)

    name1 = "c1." + field
    name2 = "c2." + field
    name3 = field + "_similarity"

    if "postal_code" in field:
      compare = postalcode_similarity
    elif "star_rating" in field:
      compare = rating_star_diff
    else:
      compare = string_similarity

    # Row-wise apply is kept on purpose so the values handed to the
    # comparison are exactly the ones pandas produces for each row.
    df[name3] = df[[name1, name2]].apply(lambda x: universal_wrapper(x[name1], x[name2], compare),
                                          axis=1)

  def generate_features(self, df):
    """
    Adds one "<field>_similarity" column to df (in place) for every feature
    field configured in the constructor.
    """
    for ff in tqdm(self.__feature_fields):
      self.similarity_ratio_func(df, ff)


  def compare_hotels_lists(self, hotels_1, hotels_2,
                           country_code_name_1 = "c1.country_code",
                           country_code_name_2 = "c2.country_code",):
    """
    param hotels_1: pandas dataframe with the hotel list of supplier 1
    param hotels_2: pandas dataframe with the hotel list of supplier 2
    param country_code_name_1: name of the country code column in hotels_1
    param country_code_name_2: name of the country code column in hotels_2

    return: pandas dataframe with the columns "c1.key" and "c2.key", one row
            per pair classified as a match. The index is not unique: it
            holds the row positions inside each per-country merge.
    """
    key_columns = ["c1.key", "c2.key"]

    hotels_1 = hotels_1.rename({country_code_name_1:"country_code"}, axis=1)
    hotels_2 = hotels_2.rename({country_code_name_2:"country_code"}, axis=1)
    countries = hotels_1["country_code"].unique()

    # Only hotels from the same country are paired, one country at a time.
    country_datas = list()
    for country in countries:
      h1 = hotels_1[hotels_1["country_code"] == country]
      h2 = hotels_2[hotels_2["country_code"] == country]
      total = h1.reset_index(drop=True).merge(h2.reset_index(drop=True),
                                              on =["country_code"])
      country_datas.append(total)

    if not country_datas:
      # hotels_1 is empty: pd.concat would raise on an empty list.
      return pd.DataFrame(columns=key_columns)

    full_data = pd.concat(country_datas)
    if full_data.empty:
      # No candidate pairs at all, so there is nothing to score.
      return full_data[key_columns]

    features = [f+"_similarity" for f in self.__feature_fields]
    self.generate_features(full_data)
    probas = self.__clf.predict_proba(full_data[features])[:,1]
    predictions = (probas > self.__th).astype(int)

    full_data["match"] = predictions
    result = full_data.loc[full_data["match"]==1, key_columns]

    return result


In [40]:
# Build the comparator from the saved classifier, then match the complete
# hotel lists of both suppliers.
hc = HotelComparator(path="hotel_clf.pkl")
result = hc.compare_hotels_lists(hotels_1=cust_1, hotels_2=cust_2)

In [70]:
# Quick run on 1000 random hotels per supplier. The fixed random_state makes
# the drawn samples, and therefore the matches, reproducible between runs.
result = hc.compare_hotels_lists(cust_1.sample(1000, random_state=42),
                                 cust_2.sample(1000, random_state=42))

100%|██████████| 5/5 [00:04<00:00,  1.00it/s]


In [71]:
# Display the matched (c1.key, c2.key) pairs currently bound to `result`.
result

Unnamed: 0,c1.key,c2.key
225,21E3E54802061954C7A13B48FC3BCC17,996625DC5CDFB456A1C7F33A66828BE6
568,7FDFC0A41C9E9DF8E88B8DAD7F1F16A0,31013D3FF35365DAA07797D8230F0952
584,73A713296F84E07912E470FA96739385,54F918FAAAD3C2DE3B4C8E11A5903FB0
653,73A713296F84E07912E470FA96739385,792B310EEB756D66B139C9E20C8C4527
1143,F7ABA086E6FD21A4466AF133DFF08C59,B6BC422115F3A1FD115C95C1EAED0458
...,...,...
19,5D41E44DDB171C4FC9E847892FE06FF8,55CE1EDD39055403D407AD809B105EFB
54,68FBC3913DD1843AF9A80C15E9E3236F,242A8C36A678C95894E2BD8C2522D1D7
1,1F68BEDEBCE214885D91AD552939B86E,EBBE808BD77157D6042B6669BA112923
2,6AA7A1B93F035EE2F49E0892EEC67097,4B701D316760D1BA3455558315FAC449
