In [1]:
import numpy as np
import pandas as pd 
import requests
import pickle
import sys
import tensorflow as tf
import tensorflow_hub as hub

from bs4 import BeautifulSoup
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance, euclidean_distance

sys.setrecursionlimit(6000)

In [None]:
starting_urls = ['https://support.n26.com/de-de',
                 'https://support.n26.com/de-at',
                 'https://support.n26.com/en-at',
                 'https://support.n26.com/en-de',
                 'https://support.n26.com/en-it',
                 'https://support.n26.com/it-it',
                 'https://support.n26.com/en-eu',
                 'https://support.n26.com/en-fr',
                 'https://support.n26.com/fr-fr',
                 'https://support.n26.com/en-es',
                 'https://support.n26.com/es-es',
                 'https://support.n26.com/en-us',
                 'https://support.n26.com/en-gb'
                ]

# Scraping the FAQ data from the helpdesk of N26 across all markets (40%)


### Crawl (20%)

In [None]:
def crawl_website(input_urls, root= 'https://support.n26.com', **kwargs):
    # Crawls a website from multiple starting positions (could be the same)
    # and retrieves all urls that start with the root field
    
    #Parameters:
    #    input_urls (list): A list of urls to start the crawl.
    #    root       (str) : We will only keep urls that start with this root

    #Returns:
    #    output_urls(list[(url_1,market_1),(url_2,market_2),..]): All applicable 
    #    urls and markets .      
    
    # get html info with BS4
    out_urls = []
    
    # we don't need to get the urls from 'https://n26.com/en-gb', since the Q/A are already on that page
    for url in input_urls[:-1]:
        req = requests.get(url).content
        soup = BeautifulSoup(req, 'html.parser')

        # find elements and filter to get only the ones needed 
        all_urls = [root + element.get('href')
                    for element in soup.find_all('a')
                    if not element.get('href').startswith('http')
                    and len(element.get('href')) > 7]
        
        print(url, len(all_urls))

        for urls in all_urls:
            req = requests.get(urls).content
            soup = BeautifulSoup(req, 'html.parser')
            # find elements and filter to get only the ones needed 
            out_urls += [root + element.get('href')
                         for element in soup.find_all('a')
                         if element is not None 
                         and len(element.get('href')) > 7
                         and not element.get('href').startswith('http')]

    output_urls = [(url, url[24:29]) for url in set(out_urls)]
    
    output_urls += [(input_urls[-1], input_urls[-5:])]
    
    return output_urls


urls_per_market = crawl_website(starting_urls)

https://support.n26.com/de-de 35
https://support.n26.com/de-at 35
https://support.n26.com/en-at 35
https://support.n26.com/en-de 35
https://support.n26.com/en-it 32
https://support.n26.com/it-it 32
https://support.n26.com/en-eu 32
https://support.n26.com/en-fr 32
https://support.n26.com/fr-fr 32
https://support.n26.com/en-es 31
https://support.n26.com/es-es 31
https://support.n26.com/en-us 24


In [None]:
urls_df = pd.DataFrame(urls_per_market, columns=['url', 'market'])

urls_df.to_csv('../data/urls_per_market_n26_felipe.csv')

# If stuck, use the below csv 
# urls_per_market = pd.read_csv('../data/urls_per_market_n26.csv').values.tolist()

In [None]:
len(urls_per_market)

1723

In [None]:
urls_per_market = pd.read_csv('../data/urls_per_market_n26_felipe.csv', index_col=0).values.tolist()

### Scrape (10%)

In [None]:
def scrape_urls(urls_and_markets, **kwargs):
    # Scrapes urls
    
    #Parameters:
    #    urls_and_markets (list[(url_1,market_1),(url_2,market_2),..]): A list 
    #    of urls to scrape alongside their markets.
    
    #Returns:
    #    parsed_data (list[(url_1,market_1,html_1),(url_2,market_2,html_2),..]):
    #    All applicable urls, markets and scraped content
    
    parsed_data = []
    errors = []
    c = len(urls_and_markets)
    
    for url, market in urls_and_markets:
        c -= 1
        
        try:
            req = requests.get(url).content
            parsed_data += [(url, market, BeautifulSoup(req, 'html.parser'))]
        except:
            errors += [url]
            print('error')
            
        print(c)
        
    return parsed_data

    
soups_parsed = scrape_urls(urls_per_market)

1722
1721
1720
1719
1718
1717
1716
1715
1714
1713
1712
1711
1710
1709
1708
1707
1706
1705
1704
1703
1702
1701
1700
1699
1698
1697
1696
1695
1694
1693
1692
1691
1690
1689
1688
1687
1686
1685
1684
1683
1682
1681
1680
1679
1678
1677
1676
1675
1674
1673
1672
1671
1670
1669
1668
1667
1666
1665
1664
1663
1662
1661
1660
1659
1658
1657
1656
1655
1654
1653
1652
1651
1650
1649
1648
1647
1646
1645
1644
1643
1642
1641
1640
1639
1638
1637
1636
1635
1634
1633
1632
1631
1630
1629
1628
1627
1626
1625
1624
1623
1622
1621
1620
1619
1618
1617
1616
1615
1614
1613
1612
1611
1610
1609
1608
1607
1606
1605
1604
1603
1602
1601
1600
1599
1598
1597
1596
1595
1594
1593
1592
1591
1590
1589
1588
1587
1586
1585
1584
1583
1582
1581
1580
1579
1578
1577
1576
1575
1574
1573
1572
1571
1570
1569
1568
1567
1566
1565
1564
1563
1562
1561
1560
1559
1558
1557
1556
1555
1554
1553
1552
1551
1550
1549
1548
1547
1546
1545
1544
1543
1542
1541
1540
1539
1538
1537
1536
1535
1534
1533
1532
1531
1530
1529
1528
1527
1526
1525
1524
1523


In [None]:
len(soups_parsed)

1723

In [None]:
# saving in pickle to preserve BS4 type

# with open('../data/scraped_n26_felipe.pkl', 'wb') as file:
#   pickle.dump(soups_parsed, file)

# soup_df = pd.DataFrame(soups_parsed, columns=['url', 'market', 'soup'])

# soup_df.to_csv('../data/scraped_n26_felipe.csv')

In [6]:
with open('../data/scraped_n26_felipe.pkl', 'rb') as file:
    soups_parsed = pickle.load(file)
    
len(soups_parsed)

1723

### Extract fields (10%)

In [7]:
def extract_qnas(urls_and_markets, **kwargs):
    # This function indentifies and extracts the question and
    # answer pairs from a scraped content
    
    #Parameters:
    #    parsed_data list[(url_1,market_1,html_1),(url_2,market_2,html_2),..]): 
    #    urls, markets and scraped content.
    
    #Returns:
    #    extracted_data (list[
    #                      (market_1,url_1,title_1,content_1),
    #                      (market_2,url_2,title_2,content_2)])
    #    This contains the extracted fields that we care about
    
    extracted_data = []
    
    for url, market, soup in urls_and_markets:
        everything = soup.find('div', {'id':'main'})
        
        if everything is not None:
            
            question = everything.find('h1').text.lower().replace('\\', ' ').replace('xa0', ' ')
            
            answer = ' '.join([element.text.lower().replace("\'", "’").replace('\xa0', ' ')
                               for element in everything.h1.next_siblings
                               if not element.get_text(' ', strip=True).endswith('?')])

            extracted_data += [(url, market, question, answer)]
        
    return extracted_data 


extracted_data = extract_qnas(soups_parsed)


# If stuck, use the below csv
# extracted_n26 = pd.read_csv('../data/extracted_n26.csv')
# extracted_data = extracted_n26.values.tolist()

In [8]:
len(extracted_data)

1722

In [10]:
extracted_data[:2]

[('https://support.n26.com/it-it/app-e-opzioni/spazi/cos-e-il-potere-legale',
  'it-it',
  "spazi condivisi - cos'è il potere legale?",
  'il modello legale utilizzato per spazi condivisi si chiama potere legale (kontovollmacht in tedesco).  i partecipanti di uno spazio condiviso non hanno pari proprietà del conto come un conto condiviso nel senso bancario tradizionale. dopo aver ricevuto il potere legale i partecipanti ottengono il diritto di agire a nome dell’amministratore, per tutto ciò che riguarda il trasferimento dei fondi entro lo spazio condiviso.  la persona che autorizza l’altro ad agire per suo conto è denominata "amministratore", questa persona è il proprietario legale del conto. l’amministratore deve essere un titolare di un conto premium n26, ad esempio n26 you, n26 metal e n26 business you. ciò significa che chiunque abbia creato lo spazio condiviso ne è il suo proprietario legale. l’amministratore di uno spazio condiviso:  - e’ il proprietario di tutti i fondi presenti

In [None]:
# Saving
# extract_df = pd.DataFrame(extracted_data, columns=['url', 'market', 'questions', 'answers'])
# extract_df.to_csv('../data/extracted_n26_felipe.csv')

In [12]:
extract_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1397 entries, 0 to 1720
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   url        1397 non-null   object
 1   market     1397 non-null   object
 2   questions  1397 non-null   object
 3   answers    1397 non-null   object
dtypes: object(4)
memory usage: 54.6+ KB


# Finding closest matches (60%)

In [5]:
# Load embeddings from web
# module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# model = hub.load(module_url)
# print ("module %s loaded" % module_url)

KeyboardInterrupt: 

In [2]:
# Load from local, if downloaded

model = hub.load('../universal-sentence-encoder_4')
print ("module %s loaded" % model)

def embed(input):
    return model(input)

module <tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject object at 0x7fcb4819a850> loaded


In [113]:
extracted_n26 = pd.read_csv('../data/extracted_n26_felipe.csv', index_col=0)
extracted_n26.head()

Unnamed: 0,url,market,questions,answers
0,https://support.n26.com/it-it/app-e-opzioni/sp...,it-it,spazi condivisi - cos'è il potere legale?,il modello legale utilizzato per spazi condivi...
1,https://support.n26.com/it-it/sicurezza/passwo...,it-it,come modificare il mio pin di conferma?,puoi modificare il tuo pin di conferma in qual...
2,https://support.n26.com/en-eu/security/passwor...,en-eu,why didn't i receive my pairing code via sms?,a pairing code is required to connect your sma...
3,https://support.n26.com/en-it/memberships-and-...,en-it,how to cancel my n26 you or n26 metal membership?,n26 you(new tab) or n26 metal(new tab) contrac...
4,https://support.n26.com/it-it/conti-e-sottoscr...,it-it,come funziona n26 business metal?,"ℹ️ vale per austria, belgio, danimarca, estoni..."


In [71]:
def match_qas(qas_across_markets):
    # This function assigns an faq_id to the input data, thus grouping question
    # answer pairs across languages. A locale is the language of a market
    
    #Parameters:
    #    qas_across_markets (list[
    #                      (market_1,url_1,title_1,content_1),
    #                      (market_2,url_2,title_2,content_2)])
    
    
    #Returns:
    #    matched_data (list[
    #                      (faq_id_1,locale_1,market_1,title_1,content_1),
    #                      (faq_id_1,locale_2,market_2,title_2,content_2)])
    
    # set up dataframe
    qas_across_markets.columns = ['url', 'market', 'title', 'content']
    qas_across_markets['locale'] = qas_across_markets['market'].str[:2]
    qas_across_markets['faq_ids'] = [0] * len(qas_across_markets)
    qas_across_markets = qas_across_markets.drop_duplicates().dropna().reset_index(drop=True)

    # create matrix with similarities
    embeddings = embed(qas_across_markets['title'])
    similarity = np.inner(embeddings, embeddings)

    # compare each sentence in english with the sentences in other languages,
    # setting group according to similarity greater than 0.4, and the greatest
    # similarity with english within each language
    languages = ['de', 'fr', 'es', 'it']
    group = 0

    for en_index, en_row in qas_across_markets[qas_across_markets['locale'] == 'en'].iterrows():
        group += 1
        qas_across_markets['faq_ids'][en_index] = group

        for lang in languages:
            lang_rows = qas_across_markets[qas_across_markets['locale'] == lang].index

            sim_tups = [(similarity[en_index, lang_index], lang_index)
                        for lang_index in lang_rows 
                          if similarity[en_index, lang_index] > 0.4]

            if len(sim_tups) > 0:
                for sim, ind in sim_tups:
                    if sim == max(sim_tups)[0]:
                        qas_across_markets['faq_ids'][ind] = group 


    matched_data = (list(qas_across_markets['faq_ids']),
                    list(qas_across_markets['locale']),
                    list(qas_across_markets['market']),
                    list(qas_across_markets['title']),
                    list(qas_across_markets['content']))
    
    return matched_data
    
final_results = match_qas(extracted_n26.copy())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qas_across_markets['faq_ids'][en_index] = group
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qas_across_markets['faq_ids'][ind] = group


In [400]:
qas_across_markets = extracted_n26.copy()
qas_across_markets.columns = ['url', 'market', 'title', 'content']
qas_across_markets['locale'] = qas_across_markets['market'].str[:2]
qas_across_markets['faq_ids'] = [0] * len(qas_across_markets)
qas_across_markets = qas_across_markets.drop_duplicates('title').dropna().reset_index(drop=True)

# create matrix with similarities, taking into account title and content
embeddings_t = embed(qas_across_markets['title'])
embeddings_c = embed(qas_across_markets['content'])
embeddings = (embeddings_t**2 + embeddings_c)
simil = np.inner(embeddings, embeddings)

# compare each sentence in english with the sentences in other languages,
# setting group according to similarity greater than 0.01, and the greatest
# similarity with english within each language
# faq_id will be zero for those with similarity >= 0.01
languages = ['de', 'fr', 'es', 'it']
group = 0

for en_index, en_row in qas_across_markets[qas_across_markets['locale'] == 'en'].iterrows():
    group += 1
    qas_across_markets['faq_ids'].loc[en_index] = group

    for lang in languages:
        lang_rows = qas_across_markets[qas_across_markets['locale'] == lang].index
        
        
        
        sim_tups = [(simil[en_index, lang_index], lang_index)
                    for lang_index in lang_rows] 
                      #if simil[en_index, lang_index] > 0.0001]

        if len(sim_tups) > 0:
            for sim, ind in sim_tups:
                if sim == max(sim_tups)[0]:
                    qas_across_markets['faq_ids'][ind] = group 


matched_data = (list(qas_across_markets['faq_ids']),
                list(qas_across_markets['locale']),
                list(qas_across_markets['market']),
                list(qas_across_markets['title']),
                list(qas_across_markets['content']))

# treat the faq_ids 0

for ind in qas_across_markets[(qas_across_markets['faq_ids'] == 0) & (qas_across_markets['locale'] != 'en')].index:
    np.argsort(simil[ind])[::-1][0]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qas_across_markets['faq_ids'][ind] = group


In [413]:
embeddings_t = embed(qas_across_markets['title'])
embeddings_c = embed(qas_across_markets['content'])
embeddings = (embeddings_t**2 + embeddings_c)
simil = np.inner(embeddings, embeddings)

In [414]:
simil[7][368], simil[368][7], simil[368][178]

(0.7512899, 0.7512899, 0.45824006)

In [415]:
np.argsort(simil[368])[::-1][1:5]

array([  7, 362, 513, 127])

In [416]:
[i for i in np.argsort(simil[368])[::-1] if qas_across_markets.iloc[i]['locale'] == 'en'][:5]

[7, 174, 37, 99, 572]

In [383]:
cosine_distance(embeddings[7], embeddings[368])

0.4628046607888717

In [384]:
qas_across_markets.loc[7]

url        https://support.n26.com/en-eu/payments-transfe...
market                                                 en-eu
title                  why was i charged more than expected?
content    all transactions take place in real-time. we’l...
locale                                                    en
faq_ids                                                    3
Name: 7, dtype: object

In [385]:
qas_across_markets.loc[368]

url        https://support.n26.com/fr-fr/paiements-vireme...
market                                                 fr-fr
title      pourquoi ai-je été débité d’un montant qui ne ...
content    n26 fonctionne en temps réel. nous vous inform...
locale                                                    fr
faq_ids                                                  173
Name: 368, dtype: object

In [386]:
qas_across_markets.loc[178]

url        https://support.n26.com/en-us/payments-transfe...
market                                                 en-us
title      will my direct debit go through if i am low on...
content    if your account has insufficient funds when a ...
locale                                                    en
faq_ids                                                   77
Name: 178, dtype: object

In [387]:
qas_across_markets.head(20)#loc[1340]

Unnamed: 0,url,market,title,content,locale,faq_ids
0,https://support.n26.com/it-it/app-e-opzioni/sp...,it-it,spazi condivisi - cos'è il potere legale?,il modello legale utilizzato per spazi condivi...,it,0
1,https://support.n26.com/it-it/sicurezza/passwo...,it-it,come modificare il mio pin di conferma?,puoi modificare il tuo pin di conferma in qual...,it,0
2,https://support.n26.com/en-eu/security/passwor...,en-eu,why didn't i receive my pairing code via sms?,a pairing code is required to connect your sma...,en,1
3,https://support.n26.com/en-it/memberships-and-...,en-it,how to cancel my n26 you or n26 metal membership?,n26 you(new tab) or n26 metal(new tab) contrac...,en,2
4,https://support.n26.com/it-it/conti-e-sottoscr...,it-it,come funziona n26 business metal?,"ℹ️ vale per austria, belgio, danimarca, estoni...",it,0
5,https://support.n26.com/de-de/app-und-produkte...,de-de,warum sind einige produkte und dienstleistunge...,wir arbeiten mit vielen partnern zusammen. bes...,de,0
6,https://support.n26.com/it-it/app-e-opzioni/mo...,it-it,come utilizzare moneybeam?,moneybeam consente di inviare denaro ai contat...,it,0
7,https://support.n26.com/en-eu/payments-transfe...,en-eu,why was i charged more than expected?,all transactions take place in real-time. we’l...,en,3
8,https://support.n26.com/de-de/konto-und-person...,de-de,kann ich in meinem land ein n26 konto eröffnen?,wir bieten unsere konten in folgenden ländern ...,de,0
9,https://support.n26.com/de-de/app-und-produkte...,de-de,wie kontaktiere ich n26?,"wenn du uns erreichen möchtest, kannst du eine...",de,0


In [391]:
qas_across_markets[(qas_across_markets['faq_ids'] == 3)]#[:].loc[1]

Unnamed: 0,url,market,title,content,locale,faq_ids
7,https://support.n26.com/en-eu/payments-transfe...,en-eu,why was i charged more than expected?,all transactions take place in real-time. we’l...,en,3


In [389]:
# Dump file in a csv called n26_with_faq_ids

df = pd.DataFrame(np.array(final_results).T, columns=['faq_id', 'lang', 'market', 'question', 'answer'])
# df.to_csv('../data/n26_with_faq_ids.csv')
df.head()

Unnamed: 0,faq_id,lang,market,question,answer
0,0,it,it-it,spazi condivisi - cos'è il potere legale?,il modello legale utilizzato per spazi condivi...
1,0,it,it-it,come modificare il mio pin di conferma?,puoi modificare il tuo pin di conferma in qual...
2,1,en,en-eu,why didn't i receive my pairing code via sms?,a pairing code is required to connect your sma...
3,2,en,en-it,how to cancel my n26 you or n26 metal membership?,n26 you(new tab) or n26 metal(new tab) contrac...
4,0,it,it-it,come funziona n26 business metal?,"ℹ️ vale per austria, belgio, danimarca, estoni..."


In [390]:
df[df['faq_id'] == '0']

Unnamed: 0,faq_id,lang,market,question,answer
0,0,it,it-it,spazi condivisi - cos'è il potere legale?,il modello legale utilizzato per spazi condivi...
1,0,it,it-it,come modificare il mio pin di conferma?,puoi modificare il tuo pin di conferma in qual...
4,0,it,it-it,come funziona n26 business metal?,"ℹ️ vale per austria, belgio, danimarca, estoni..."
5,0,de,de-de,warum sind einige produkte und dienstleistunge...,wir arbeiten mit vielen partnern zusammen. bes...
6,0,it,it-it,come utilizzare moneybeam?,moneybeam consente di inviare denaro ai contat...
...,...,...,...,...,...
1380,0,it,it-it,quali sono i vantaggi di n26 metal?,"ℹ️disponibile in germania, austria, francia, s..."
1387,0,it,it-it,come scaricare i miei dati personali?,il general data protection regulation (gdpr) è...
1388,0,it,it-it,come passare a n26 you?,"ℹ️ disponibile in austria, belgio, danimarca, ..."
1390,0,es,es-es,¿quién puede obtener una basiskonto?,puedes abrir una cuenta n26 gratuita desde nue...


In [52]:
df[df['faq_id'] == '768']

Unnamed: 0,faq_id,lang,market,question,answer
934,768,fr,fr-fr,comment contester une transaction ?,cet article vous guidera afin d’effectuer une ...
1366,768,en,en-fr,how to dispute a transaction?,is there a card transaction on your n26 monthl...


In [53]:
df[df['faq_id'] == '679']

Unnamed: 0,faq_id,lang,market,question,answer
205,679,fr,fr-fr,comment inviter un ami à rejoindre n26 ?,vous pouvez commencer à inviter des amis dès q...
1216,679,en,en-it,how to earn money by inviting friends to n26?,you can start inviting friends as soon as you ...


# Clustering by Language

In [None]:
def match_lang(qas_across_markets):
    # This function assigns an faq_id to the input data, thus grouping question
    # answer pairs across languages. A locale is the language of a market
    
    #Parameters:
    #    qas_across_markets (list[
    #                      (market_1,url_1,title_1,content_1),
    #                      (market_2,url_2,title_2,content_2)])
    
    
    #Returns:
    #    matched_data (list[
    #                      (faq_id_1,locale_1,market_1,title_1,content_1),
    #                      (faq_id_1,locale_2,market_2,title_2,content_2)])
    
    matched_data = []

    qas_across_markets.columns = ['url', 'market', 'title', 'content']
    qas_across_markets['locale'] = qas_across_markets['market'].str[:2]
    # added groups according to url to check clusters latter on
    qas_across_markets['groups'] = qas_across_markets['url'].apply(lambda x: x.split('/')[4].replace('-', ' ') if len(x.split('/')) > 4 else '')

    #sorting by locale, so embeddings will remain in same order
    qas_across_markets.sort_values('locale', inplace=True)

    embeddings = []

    for lang in sorted(qas_across_markets['locale'].unique()):
        temp = qas_across_markets[qas_across_markets['locale'] == lang]['title']
        embeded = list(embed(temp))
        embeddings += embeded   

    # using kmeans to cluster closest embeddings (10 clusters to represent the 10 questions segments)    
    kmeans = KMeansClusterer(10, distance=cosine_distance, repeats=20, avoid_empty_clusters=True)
    clusters = kmeans.cluster(embeddings, assign_clusters=True)
    qas_across_markets['faq_ids'] = clusters

    matched_data = (list(qas_across_markets['faq_ids']),
                    list(qas_across_markets['locale']),
                    list(qas_across_markets['market']),
                    list(qas_across_markets['title']),
                    list(qas_across_markets['content']),
                    list(qas_across_markets['groups']))
    
    return matched_data