In [3]:
# !pip install python-Levenshtein

In [4]:
import numpy as np
import pandas as pd
from itertools import combinations
from Levenshtein import distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

The problem is to match the user's free-form input against a pre-determined list of banks. For example, user input 'bawag bank' should be matched to 'BAWAG Group AG'.

In [5]:
# List of banks to compare
banks =   ['Sberbank Europe AG',
          'BAWAG Group AG',
          'Raiffeisenbankengruppe OÖ Verbund eGen',
          'Raiffeisen Bank International AG',
          'Volksbanken Verbund',
          'Erste Group Bank AG',
          'KBC Groep',
          'Investeringsmaatschappij Argenta',
          'Belfius Bank',
          'AXA Bank Belgium',
          'The Bank of New York Mellon SA/NV',
          'First Investment Bank AD',
          'RCB Bank Ltd',
          'Bank of Cyprus Holdings Public Limited Company',
          'Hellenic Bank Public Company Limited',
          'DekaBank Deutsche Girozentrale',
          'Erwerbsgesellschaft der S-Finanzgruppe mbH & Co. KG',
          'UBS Europe SE',
          'DEUTSCHE APOTHEKER- UND ÄRZTEBANK EG',
          'Volkswagen Bank Gesellschaft mit beschränkter Haftung',
          'Münchener Hypothekenbank eG',
          'DZ BANK AG Deutsche Zentral-Genossenschaftsbank, Frankfurt am Main',
          'HASPA Finanzholding',
          'State Street Europe Holdings Germany S.a.r.l. & Co. KG',
          'J.P. Morgan AG',
          'DEUTSCHE BANK AKTIENGESELLSCHAFT',
          'COMMERZBANK Aktiengesellschaft',
          'Landesbank Baden-Württemberg',
          'Landesbank Hessen-Thüringen Girozentrale',
          'Norddeutsche Landesbank - Girozentrale -',
          'Deutsche Pfandbriefbank AG',
          'Aareal Bank AG',
          'Hamburg Commercial Bank AG',
          'Bayerische Landesbank',
          'Jyske Bank A/S',
          'Sydbank A/S',
          'Nykredit Realkredit A/S',
          'Danske Bank A/S',
          'Luminor Holding AS',
          'Abanca Corporacion Bancaria S.A.',
          'Banco Santander S.A.',
          'Ibercaja Banco S.A.',
          'Kutxabank S.A',
          'Unicaja Banco S.A.',
          'CaixaBank S.A.',
          'Banco de Crédito Social Cooperativo',
          'Banco Bilbao Vizcaya Argentaria S.A.',
          'Banco de Sabadell S.A.',
          'Bankinter S.A.',
          'Kuntarahoitus Oyj',
          'Nordea Bank Abp',
          'OP Osuuskunta',
          'SFIL',
          'RCI Banque',
          'Confédération Nationale du Crédit Mutuel',
          'La Banque Postale',
          'Bpifrance',
          "C.R.H. - Caisse de refinancement de l'habitat",
          'HSBC Continental Europe',
          'Groupe BPCE',
          'Groupe Crédit Agricole',
          'Société générale',
          'BNP Paribas',
          'ALPHA SERVICES AND HOLDINGS S.A.',
          'National Bank of Greece S.A.',
          'Eurobank Ergasias Services and Holdings S.A.',
          'Piraeus Financial Holdings',
          'OTP-csoport',
          'Magyar Bankholding',
          'Barclays Bank Ireland plc',
          'Citibank Holdings Ireland Limited',
          'AIB Group plc',
          'Bank of Ireland Group plc',
          'Ulster Bank Ireland Designated Activity Company',
          'Bank of America Europe Designated Activity Company',
          'Íslandsbanki hf.',
          'Landsbankinn hf.',
          'Arion banki hf',
          'Intesa Sanpaolo S.p.A.',
          'Gruppo Bancario Finecobank  ',
          'UniCredit S.p.A.',
          'Gruppo Bancario Mediolanum  ',
          'Credito Emiliano Holding S.p.A.',
          'Banco BPM SpA',
          'Banca Popolare di Sondrio, Società Cooperativa per Azioni',
          'Banca Monte dei Paschi di Siena S.p.A.',
          'CASSA CENTRALE BANCA',
          'ICCREA BANCA S.P.A.',
          'Mediobanca - Banca di Credito Finanziario S.p.A.',
          'Akcine bendrove Šiauliu bankas',
          'Precision Capital S.A.',
          'RBC Investor Services Bank S.A.',
          'J.P. Morgan Bank Luxembourg S.A.',
          'Banque Internationale à Luxembourg',
          'Banque et Caisse d´Epargne de l´Etat, Luxembourg',
          'Akciju sabiedriba "Citadele banka"',
          'MDB Group Limited',
          'Bank of Valletta Plc',
          'HSBC Bank Malta p.l.c.',
          'BNG Bank N.V.',
          'ING Groep N.V.',
          'LP Group B.V.',
          'de Volksbank N.V.',
          'ABN AMRO Bank N.V.',
          'Coöperatieve Rabobank U.A.',
          'Nederlandse Waterschapsbank N.V.',
          'Bank Polska Kasa Opieki S.A.',
          'Powszechna Kasa Oszczednosci Bank Polski S.A.',
          'LSF Nani Investments S.à r.l.',
          'Banco Comercial Português SA',
          'Caixa Geral de Depósitos SA',
          'Banca Transilvania',
          'Länförsäkringar Bank AB (publ)',
          'Kommuninvest - group',
          'Skandinaviska Enskilda Banken - group',
          'SBAB Bank AB - group',
          'Swedbank - group',
          'Svenska Handelsbanken - group',
          'Biser Topco S.à r.l.',
          'Nova Ljubljanska Banka d.d. Ljubljana']

In [6]:
# Examples of search strings
s1 = 'Bawag bank' # other options: 'Bawag bank', 'Erste', 'Raiffaisen bank'


In [7]:
# A naive search method which you need to improve
from difflib import SequenceMatcher
res = []
for token in banks:
  res.append([s1, token, SequenceMatcher(None, s1, token).ratio()])

df2 = pd.DataFrame(res, columns=['Bank 1', 'Bank 2', 'Score'])
# The outcome is not great, for this search query 'BAWAG Group AG' should have highest similarity
df2.sort_values(by=['Score'], ascending=[False]).head()

Unnamed: 0,Bank 1,Bank 2,Score
8,Bawag bank,Belfius Bank,0.454545
12,Bawag bank,RCB Bank Ltd,0.454545
33,Bawag bank,Bayerische Landesbank,0.451613
42,Bawag bank,Kutxabank S.A,0.434783
99,Bawag bank,BNG Bank N.V.,0.434783


In [8]:
 #The desired combination has a low score
idx = df2['Bank 2'].isin(['BAWAG Group AG'])

df2[idx].sort_values(by=['Score'], ascending=[False]).head()

Unnamed: 0,Bank 1,Bank 2,Score
1,Bawag bank,BAWAG Group AG,0.166667


# **APPROACH 1: Levenshtein distance**


In [9]:
def match_banks_levenshtein(user_input, bank_list):
    """
    Function for Levenshtein similarity approach
    """

    def preprocess_text(text):
        """
        Function to preprocess text
        """
        return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

    def levenshtein_similarity(s1, s2):
        """
        Function to calculate similarity score between two strings using Levenshtein distance
        """
        max_len = max(len(s1), len(s2))
        if max_len == 0:
            return 0
        return 1 - (distance(s1, s2) / max_len)

    user_input = preprocess_text(user_input)
    scores = []
    for bank in bank_list:
        bank_normalized = preprocess_text(bank)
        score = levenshtein_similarity(user_input, bank_normalized)
        scores.append((bank, score))
    result_df = pd.DataFrame(scores, columns=['Bank', 'Similarity_Score'])
    result_df = result_df.sort_values(by='Similarity_Score', ascending=False)

    return result_df

user_input = 'Bawag bank'
result_levenshtein = match_banks_levenshtein(user_input, banks)
print("Levenshtein Similarity:")
result_levenshtein.head()

Levenshtein Similarity:


Unnamed: 0,Bank,Similarity_Score
8,Belfius Bank,0.5
31,Aareal Bank AG,0.5
99,BNG Bank N.V.,0.454545
103,ABN AMRO Bank N.V.,0.4375
1,BAWAG Group AG,0.428571


As we can see that 'BAWAG Group AG' as better similarity score now using levenshtein distance approach but still it is not giving it the highest score. So now we're going to use cosine similarity.

# **APPROACH 2: Cosine Similarity**

In [10]:
def match_banks_cosine(user_input, bank_list):
    """
    Function for cosine similarity approach
    """

    def preprocess_text(text):
        """
        Function to preprocess text
        """
        return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

    user_input = preprocess_text(user_input)
    bank_list = [preprocess_text(bank) for bank in bank_list]
    all_texts = [user_input] + bank_list
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    matches = list(zip(bank_list, cosine_similarities))
    result_df = pd.DataFrame(matches, columns=['Bank', 'Similarity_Score'])
    result_df = result_df.sort_values(by='Similarity_Score', ascending=False)
    return result_df

result_cosine = match_banks_cosine(user_input, banks)
print("\nCosine Similarity:")
result_cosine.head()


Cosine Similarity:


Unnamed: 0,Bank,Similarity_Score
1,bawag group ag,0.628456
8,belfius bank,0.177352
25,deutsche bank aktiengesellschaft,0.150588
31,aareal bank ag,0.150342
99,bng bank nv,0.144128


Here we can see that 'bawag group ag' has the highest similarity score.