In [1]:
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher    # https://docs.python.org/3/library/difflib.html#difflib.get_close_matches
# from cleanco import cleanco   # would ne useful

# Unternehmensliste Interflex

In [233]:
ulist = pd.read_excel("Unternehmensliste_Interflex.xlsx")
ulist.head()

Unnamed: 0,Firmenname,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID
0,Bistum Essen,Deutschland,45127,Essen,Zwölfling 16,,801695.0
1,KBS Kleider Bauer Betriebs-GmbH,Österreich,2380,Perchtoldsdorf,Zwingenstr. 5,,850801.0
2,Elektro Ing-Plan GmbH Dresden,Deutschland,1187,Dresden,Zwickauer Straße 88,,851014.0
3,Netzdesign-Vobornik,Deutschland,71083,Herrenberg,Zwickauer Strasse 41,,
4,NILES-SIMMONS Industrieanlagen GmbH,Deutschland,9117,Chemnitz,Zwickauer Straße 355,DE140853999,802541.0


In [235]:
ulist.duplicated().sum()

15

In [236]:
ulist.nunique()

Firmenname            14193
Land                     73
Postleitzahl           4946
Ort                    4239
Straße und Hausnr.    12702
USt.-IdNr.             4427
ID                    10852
dtype: int64

In [237]:
ulist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14550 entries, 0 to 14549
Data columns (total 7 columns):
Firmenname            14550 non-null object
Land                  13477 non-null object
Postleitzahl          14238 non-null object
Ort                   13902 non-null object
Straße und Hausnr.    14111 non-null object
USt.-IdNr.            4849 non-null object
ID                    10875 non-null object
dtypes: object(7)
memory usage: 795.8+ KB


In [238]:
ulist = (
    ulist
        .assign(
            Land=ulist['Land'].astype('category'),
            # Postleitzahl=ulist['Postleitzahl'].astype('Int64'),
            # ID=ulist['ID'].astype('Int64')   # not working
        )   # .info()
)

In [239]:
ulist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14550 entries, 0 to 14549
Data columns (total 7 columns):
Firmenname            14550 non-null object
Land                  13477 non-null category
Postleitzahl          14238 non-null object
Ort                   13902 non-null object
Straße und Hausnr.    14111 non-null object
USt.-IdNr.            4849 non-null object
ID                    10875 non-null object
dtypes: category(1), object(6)
memory usage: 699.4+ KB


**Aim is to remove suffix and rest** ==> E.g: Sensient Imaging Technologies GmbH Chemiepark Bitterfeld-Wolfen

In [240]:
ulist.insert(loc=1, column='Firmenname_clean', value=ulist["Firmenname"].str.lower())

# ulist["Firmenname_clean"] = ulist["Firmenname"].str.lower()   # faster
# ulist[ulist['Firmenname'].str.endswith(tuple(suffix_list))] some finishes with gmbh some -gmbh and some even continues

In [241]:
ulist = ulist.assign(Idtrack=lambda ulist: range(1, len(ulist)+1))
ulist_original = ulist.copy()

In [243]:
# Suffix list for company names   (ordering is important! Will be fixed)
suffix_list = ['gmbh', 'gmbh&co.', 'ggmbh', 'gmbh+co.', 'kg-gmbh', 'gmbh&co', 'ltd.', 'se',
      'gmbh&co.kg', 'gmbh&cokg', 'gmbh.', 'gmbh,', 'gmbh&c', '(gmbh',
               'company', 'incorporated', 'corporation', 'corp.', 'corp', 'inc',
      '& co.', '& co',  'inc.', 's.p.a.', 'n.v.', 'a.g.', 'ag', 'nuf', 's.a.', 's.f.',
      'oao', 'co.', 'co',
              'soc.col.', 'stg', 'd.n.o.', 'ltda.', 'v.o.s.', 'a spol.',
      u've\xc5\x99. obch. spol.', 'kgaa', 'o.e.', 's.f.', 's.n.c.', 's.a.p.a.', 'j.t.d.',
      'v.o.f.', 'sp.j.', 'og', 'sd', ' i/s', 'ay', 'snc', 'oe', 'bt.', 's.s.', 'mb',
      'ans', 'da', 'o.d.', 'hb', 'pt',
              'unltd', 'ultd', 'sal', 'unlimited', 'saog', 'saoc', 'aj',
      'yoaj', 'oaj', 'akc. spol.', 'a.s.',
              'esv', 'gie', 'kv.', 'qk',
              'pty. ltd.', 'pty ltd', 'ltd', 'l.t.d.', 'bvba', 'd.o.o.', 'ltda', 'gmbh',
      'g.m.b.h', 'kft.', 'kht.', 'zrt.', 'ehf.', 's.a.r.l.', 'd.o.o.e.l.', 's. de r.l.',
      'b.v.', 'tapui',
      'sp. z.o.o.', 'sp. z o.o.', 'spółka z o.o.',
      's.r.l.', 's.l.', 's.l.n.e.', 'ood', 'oy', 'rt.',
      'teo', 'uab', 'scs', 'sprl', 'limited', 'bhd.', 'sdn. bhd.', 'sdn bhd', 'as',
      'lda.', 'tov', 'pp',
              'pllc', 'llc', 'l.l.c.', 'plc.', 'plc', 'hf.', 'oyj',
      'a.e.', 'nyrt.', 'p.l.c.', 'sh.a.', 's.a.', 's.r.l.', 'srl.', 'srl', 'aat', '3at', 'd.d.',
      's.r.o.', 'spol. s r.o.', 's.m.b.a.', 'smba', 'sarl', 'nv', 'sa', 'aps',
      'a/s', 'p/s', 'sae', 'sasu', 'eurl', 'ae', 'cpt', 'as', 'ab', 'asa', 'ooo', 'dat',
      'vat', 'zat', 'mchj', 'a.d.',
              'lllp', 'l.l.l.p.',
              'llp', 'l.l.p.', 'sp.p.', 's.c.a.', 's.c.s.',
              'gmbh & co. kg', 'lp', 'l.p.', 's.c.s.',
      's.c.p.a', 'comm.v', 'k.d.', 'k.d.a.', 's. en c.', 'e.e.', 's.a.s.', 's. en c.',
      'c.v.', 's.k.a.', 'sp.k.', 's.cra.', 'ky', 'scs', 'kg', 'kd', 'k/s', 'ee', 'secs',
      'kda', 'ks', 'kb','kt',
              'sicav',
              'nl',
              'vzw', 'ses.', 'gte.',
              'private', 'pte', 'xk',
              'p.c.', 'vof', 'snc',
              'pllc', 'p.l.l.c.',
              'e.u.', 's.p.', 't:mi', 'tmi', 'e.v.', 'e.c.', 'et', 'obrt',
      'fie', 'ij', 'fop', 'xt']

# Source of company suffices https://github.com/psolin/cleanco/blob/master/cleanco/termdata.py

In [244]:
#suffix_list += ['-{}'.format(x) for x in suffix_list]   # add dash symbol in front of suffices
#city_names =    # Bistum Essen ==> Bistum?

In [248]:
def suffix_remover(companyname):    # Running multiple times (2 is good) removes suffix effectively
    for i in suffix_list:
        #if i in re.split(' |-', companyname):   # doesn't work properly
        if i in companyname.split():    # tokenization
            return companyname[:companyname.index(i)-1]
        elif i in companyname.split('-'):
            return companyname[:companyname.index(i)-1]
    return companyname

In [249]:
ulist['Firmenname_clean'] = ulist['Firmenname_clean'].apply(suffix_remover)
ulist['Firmenname_clean'].head()

0                      bistum essen
1        kbs kleider bauer betriebs
2                  elektro ing-plan
3               netzdesign-vobornik
4    niles-simmons industrieanlagen
Name: Firmenname_clean, dtype: object

In [250]:
print('Remaining suffix as gmbh (most common):', len(ulist[ulist['Firmenname_clean'].str.contains('gmbh')]))

Remaining suffix as gmbh (most common): 39


In [251]:
ulist[ulist['Firmenname_clean'].str.contains('gmbh')]   # entries with typo

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
311,operational servicesGmbH & Co. KG,operational servicesgmbh &,Deutschland,1307,,Wintergartenstraße 4,,141099.0,312
554,Technimark-Eisbär GmbHKunststoff- und Metallve...,technimark-eisbär gmbhkunststoff- und metallve...,Deutschland,52477,,Werner-von-Siemens-Straße 7-9,,120800.0,555
1393,Ing. Hans Erler GmbHElektroanlagen,ing. hans erler gmbhelektroanlagen,Deutschland,90471,,Thomas-Mann-Str. 61 a,,130872.0,1394
1963,Paul Gerhardt Diakonie ServicesGmbH Service Ce...,paul gerhardt diakonie servicesgmbh service ce...,Deutschland,13589,Berlin,Stadtrandstraße 555,DE136622410,803026.0,1964
2057,Baden-Württembergische SpielbaGmbH & Co. KG,baden-württembergische spielbagmbh &,Deutschland,70567,,Spielbank Stuttgart,,181200.0,2058
2377,Media Saturn BeteiligungsgmbH Österreich,media saturn beteiligungsgmbh österreich,Österreich,2334,Vösendorf,SCS-Bürocenter /B2,ATU19417101,290461.0,2378
2851,GM Getränketechnik & MaschinenGmbH Gera,gm getränketechnik & maschinengmbh gera,Deutschland,7551,,Salzstraße 11,,140580.0,2852
3354,Robert Bosch GmbHWerk Ansbach,robert bosch gmbhwerk ansbach,Deutschland,91522,,Robert-Bosch-Str. 1,,131292.0,3355
3559,AECHTER & SOHN GMBHHerr Partsch,aechter & sohn gmbhherr partsch,Deutschland,1219,,Reicker Straße 38a,,140494.0,3560
3765,varys. Gesellschaft für Software und Abrechnun...,varys. gesellschaft für software und abrechnun...,Deutschland,7745,Jena,Prüssingstraße 35,DE209337070,802572.0,3766


In [213]:
#[x for x in ulist['Firmenname'] if any(i in x for i in suffix_list)]

# https://stackoverflow.com/questions/4843158/how-to-check-if-a-string-is-a-substring-of-items-in-a-list-of-strings

# def remafterellipsis(text):
#     for i in suffix_list:
#         wher = text.find(i)
#         if wher != -1:
#             #print(i)
#             #print(text[:wher])
#             return text[:wher]
#     return text
# 
# ulist['Firmenname'].apply(remafterellipsis)

In [259]:
# pipeline
# ulist.duplicated().sum()   # remove duplicated rows in the dataframe
ulist["Firmenname_clean"] = ulist["Firmenname_clean"].str.lower()   # faster
ulist["Firmenname_clean"] = ulist["Firmenname_clean"].str.replace('\W', '')      # special characters are removed
ulist["Land"] = ulist["Land"].str.upper()
ulist["Land"] = ulist["Land"].str.replace('\W', '').astype('category')
ulist["Ort"] = ulist["Ort"].str.lower()
ulist["Ort"] = ulist["Ort"].str.replace('\W', '')
ulist['USt.-IdNr.'] = ulist['USt.-IdNr.'].str.replace(" ","")        # white space removed
ulist['ID'] = ulist['ID'].astype('str').str.replace('[^0-9]', '')    # ID with only numbers
# ulist['ID'] = ulist['ID'].astype('str').replace(r'\D+', '', regex=True)
ulist['ID'] = ulist['ID'].replace(r'^\s*$', np.NaN, regex=True)      # to keep missing values instead of empty strings

In [260]:
# a = list(set(ulist[ulist.loc[:,['Firmenname_clean', 'Land', 'Ort']].duplicated()].index) - set(ulist[ulist.loc[:,['Firmenname_clean', 'Land', 'Postleitzahl', 'Ort']].duplicated()].index))
# a.sort()
# ulist.loc[a]

In [261]:
# moved NA of 'USt.-IdNr.' and 'ID' columns to bottom of df so that rows with more NA's will be removed because they are at the bottom.
sorted_duplicates = ulist.sort_values(by=["USt.-IdNr.", "ID"], na_position='last')      # move NAs to the bottom of the df
ulist = sorted_duplicates.drop_duplicates(subset=["Firmenname_clean", "Land", "Ort"],
                                     keep="first").sort_index().reset_index(drop=True)  # drop duplicates

In [263]:
ulist[ulist["Firmenname_clean"].str.startswith('rweiss')]   # Actually both belongs to R.WEISS Packaging GmbH & Co. KG

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
24,R. Weiss Maschinenbau GmbH,rweissmaschinenbau,DEUTSCHLAND,74564,crailsheim,ZUR FLÜGELAU 28-30,DE190976983,181817,26
25,R. Weiss Automation GmbH & Co. KG,rweissautomation,DEUTSCHLAND,74564,crailsheim,ZUR FLÜGELAU 28,,803402,28


In [264]:
len(ulist)   # it was 14090 without removing suffices

13988

In [265]:
middle = ['stiftung', 'holding']    # can be found in the middle of a company name  .str.contains() + maybe city names?

In [266]:
# sintmaartenskliniek   # city different but everything else same

In [267]:
ulist[ulist['Firmenname_clean'].str.contains('gmbh')]

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
97,QSG mbH Genthin,qsgmbhgenthin,DEUTSCHLAND,39307,genthin,Ziegeleistr. 56,DE139333681,150616,103
155,GfV Gesellschaft für Vermögensverwaltung mbH,gfvgesellschaftfürvermögensverwaltungmbh,DEUTSCHLAND,45096,essen,Zentraler Rechnungseingang,,804621,162
182,MSG mbH Bereich Management,msgmbhbereichmanagement,DEUTSCHLAND,9130,chemnitz,Zeisigwaldstraße 101,,160399,190
299,operational servicesGmbH & Co. KG,operationalservicesgmbh,DEUTSCHLAND,1307,,Wintergartenstraße 4,,141099,312
536,Technimark-Eisbär GmbHKunststoff- und Metallve...,technimarkeisbärgmbhkunststoffundmetallverarb,DEUTSCHLAND,52477,,Werner-von-Siemens-Straße 7-9,,120800,555
...,...,...,...,...,...,...,...,...,...
11896,ZEV Zwickauer Energievers.GmbH,zevzwickauerenergieversgmbh,DEUTSCHLAND,8056,zwickau,Bahnhofstraße 4,DE141379256,802588,12385
12377,ALTMANN & BÖHNING GmbHNiederlassung Dresden,altmannböhninggmbhniederlassungdresden,DEUTSCHLAND,1458,,An den Schindertannen 4,,141001,12883
12839,"Erzgebirgsklinikum gGmbH, Haus Zschopau",erzgebirgsklinikumggmbhhauszschopau,DEUTSCHLAND,9405,zschopau,Alte Marienberger Str. 52,,850387,13358
12965,Sigma Grundstücks- und Verwaltungs GmbH-vertre...,sigmagrundstücksundverwaltungsgmbhvertretendur...,DEUTSCHLAND,82027,grünwald,Albert Immobilienverw.Ges.mbH,,171092,13494


Next step is grouping duplicated entries with a unique company name. Then applying same methods to companies from other data sets. First, removing suffices and then searching duplicated companies in company name, country and city columns. My goal is to finding duplicates so that I can group them under an unique company name. After that, I will append remaining unique company names to a final dataframe or a dictionary. Later, I can come back and group the duplicated ones for each unique company and create a many-to-one mapping function.

At the end, inputing a company name will return a unique company name (+information too like country, city, revenue?) based on grouping same companies with varying names and a rule-based approach.

For example:
Input (Volkswagen) ==> Volkswagen AG   based on suffix (adding suffix and searching)

Input (Volkswagen Aktiengesellschaft) ==> Volkswagen AG    (based on grouping same companies)

Input (VW) ==> Volkswagen AG   based on a rule

In [268]:
ulist[ulist['Firmenname'].str.contains('Volkswagen')]

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
212,Volkswagen Group United kingdom Limited,volkswagengroupunitedkingdom,GROSSBRITANNIEN,MK14 5AN,miltonkeynes,Yeomans Drive,,805690.0,221
741,Volkswagen Automobile Stuttgart GmbH,volkswagenautomobilestuttgart,DEUTSCHLAND,70188,stuttgart,Wangener Str. 66,,,768
969,Volkswagen Original Teile Logistik GmbH & Co. KG,volkswagenoriginalteilelogistik,DEUTSCHLAND,34225,baunatal,Vertriebszentrum West,DE230960046,803202.0,1004
1060,Volkswagen Original Teile LogiSüdwest / Franke...,volkswagenoriginalteilelogisüdwestfranken,DEUTSCHLAND,97084,,Unterer Kirchbergweg 65,,170970.0,1097
1087,Volkswagen Infotainment GmbH,volkswageninfotainment,DEUTSCHLAND,44799,bochum,Universitätsstraße 140,DE295504619,803983.0,1126
2478,Volkswagen Business Services GmbH I-SEC; Frau ...,volkswagenbusinessservices,DEUTSCHLAND,38112,braunschweig,Schmalbachstraße 1,DE171252317,160936.0,2567
2480,Volkswagen Financial Services Digital Solution...,volkswagenfinancialservicesdigitalsolutions,DEUTSCHLAND,38112,braunschweig,Schmalbachstraße 1,DE260043656,800056.0,2569
3738,Volkswagen Immobilien GmbH,volkswagenimmobilien,DEUTSCHLAND,38440,wolfsburg,Poststr. 28,,,3883
6840,Volkswagen Osnabrück GmbH,volkswagenosnabrück,DEUTSCHLAND,49084,osnabrück,KARMANNSTRAßE 1,,803445.0,7095
7659,Volkswagen Financial Services AG,volks,DEUTSCHLAND,38112,braunschweig,IH-IMC,DE811115544,803041.0,7942


# My questions so far
1. This dataset includes mostly German companies, while other datasets have no German companies. So, there will be mostly one-to-one mapping from this dataset. Is it normal or are there companies from other datasets to be matched with this dataset even though they are in different countries?
2. Country names are in German in this dataset. Should I convert them to English?
3. Are USt.-IdNr. and ID columns in this dataset important? Or they will be dropped?
4. What should many-to-one function return along with a company name? Country, city etc?
5. Which companies to group many to one? (For example Volkswagen case.)

In [269]:
ulist

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
0,Bistum Essen,bistumessen,DEUTSCHLAND,45127,essen,Zwölfling 16,,801695,1
1,KBS Kleider Bauer Betriebs-GmbH,kbskleiderbauerbetriebs,ÖSTERREICH,2380,perchtoldsdorf,Zwingenstr. 5,,850801,2
2,Elektro Ing-Plan GmbH Dresden,elektroingplan,DEUTSCHLAND,1187,dresden,Zwickauer Straße 88,,851014,3
3,Netzdesign-Vobornik,netzdesignvobornik,DEUTSCHLAND,71083,herrenberg,Zwickauer Strasse 41,,,4
4,NILES-SIMMONS Industrieanlagen GmbH,nilessimmonsindustrieanlagen,DEUTSCHLAND,9117,chemnitz,Zwickauer Straße 355,DE140853999,802541,5
...,...,...,...,...,...,...,...,...,...
13983,UIT GmbH,uit,,,,,,,14546
13984,WALTER GROUP Service GmbH,waltergroupservice,,,,,,851535,14547
13985,Webhelp Standort: Bundesagentur für Arbeit,webhelpstandortbundesagenturfürarbeit,,,,,,851514,14548
13986,Webhelp Standort: Erfurt + HM,webhelpstandorterfurthm,,,,,,851513,14549
