In [505]:
import re
from difflib import SequenceMatcher    # https://docs.python.org/3/library/difflib.html#difflib.get_close_matches

import numpy as np
import pandas as pd

# Unternehmensliste Interflex

In [524]:
# Load the Interflex company list; the relative path is resolved against the
# notebook's current working directory.
ulist = pd.read_excel("Unternehmensliste_Interflex.xlsx")
ulist.head()   # preview the first rows

Unnamed: 0,Firmenname,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID
0,Bistum Essen,Deutschland,45127,Essen,Zwölfling 16,,801695.0
1,KBS Kleider Bauer Betriebs-GmbH,Österreich,2380,Perchtoldsdorf,Zwingenstr. 5,,850801.0
2,Elektro Ing-Plan GmbH Dresden,Deutschland,1187,Dresden,Zwickauer Straße 88,,851014.0
3,Netzdesign-Vobornik,Deutschland,71083,Herrenberg,Zwickauer Strasse 41,,
4,NILES-SIMMONS Industrieanlagen GmbH,Deutschland,9117,Chemnitz,Zwickauer Straße 355,DE140853999,802541.0


In [525]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
ulist.duplicated().sum()

15

In [526]:
# Number of distinct non-null values per column.
ulist.nunique()

Firmenname            14193
Land                     73
Postleitzahl           4946
Ort                    4239
Straße und Hausnr.    12702
USt.-IdNr.             4427
ID                    10852
dtype: int64

In [527]:
# Column dtypes, non-null counts and memory footprint of the frame.
ulist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14550 entries, 0 to 14549
Data columns (total 7 columns):
Firmenname            14550 non-null object
Land                  13477 non-null object
Postleitzahl          14238 non-null object
Ort                   13902 non-null object
Straße und Hausnr.    14111 non-null object
USt.-IdNr.            4849 non-null object
ID                    10875 non-null object
dtypes: object(7)
memory usage: 795.8+ KB


In [534]:
# Store the country column as a categorical (few distinct values, many rows).
# Postleitzahl and ID stay as object for now: the nullable 'Int64' casts are
# disabled, and the ID cast was noted as not working.
ulist = ulist.astype({'Land': 'category'})

In [532]:
# Re-check dtypes and non-null counts.
# NOTE(review): the saved output below was produced by In [532], i.e. before the
# category cast in In [534]; it still lists Land as object and reports 14090 rows
# instead of 14550, so it appears to come from an out-of-order run. Re-run with
# Restart & Run All to refresh it.
ulist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14090 entries, 0 to 14089
Data columns (total 7 columns):
Firmenname            14090 non-null object
Land                  13026 non-null object
Postleitzahl          13782 non-null object
Ort                   13448 non-null object
Straße und Hausnr.    13661 non-null object
USt.-IdNr.            4655 non-null object
ID                    10485 non-null object
dtypes: object(7)
memory usage: 770.7+ KB


In [540]:
# Normalisation pipeline: bring the columns used for matching / duplicate
# detection into a canonical form.
#
# Patterns are raw strings and every regex replacement passes regex=True
# explicitly: since pandas 2.0 `Series.str.replace` treats the pattern as a
# literal by default, which would silently leave the special characters in.

# Company name: lower-case, then drop everything that is not a word character
# (spaces, dots, ampersands, hyphens, ...).
ulist["Firmenname"] = ulist["Firmenname"].str.lower().str.replace(r'\W', '', regex=True)

# Country: upper-case, drop non-word characters, store as categorical.
ulist["Land"] = ulist["Land"].str.upper().str.replace(r'\W', '', regex=True).astype('category')

# City: same treatment as the company name.
ulist["Ort"] = ulist["Ort"].str.lower().str.replace(r'\W', '', regex=True)

# VAT number: only remove blanks (literal replacement, no regex needed).
ulist['USt.-IdNr.'] = ulist['USt.-IdNr.'].str.replace(" ", "", regex=False)

# ID: keep digits only. astype('str') turns missing values into 'nan', which the
# digit filter reduces to '', so empty strings are mapped back to NaN afterwards.
# NOTE(review): if any ID is stored as a float (e.g. 801695.0), astype('str')
# yields '801695.0' and the digit filter produces '8016950' -- confirm that the
# IDs are read as strings/integers.
ulist['ID'] = (
    ulist['ID']
        .astype('str')
        .str.replace(r'[^0-9]', '', regex=True)
        .replace(r'^\s*$', np.nan, regex=True)   # np.NaN alias was removed in NumPy 2.0
)

In [521]:
# a = list(set(ulist[ulist.loc[:,['Firmenname', 'Land', 'Ort']].duplicated()].index) - set(ulist[ulist.loc[:,['Firmenname', 'Land', 'Postleitzahl', 'Ort']].duplicated()].index))
# a.sort()
# ulist.loc[a]

In [541]:
# Sort so that rows lacking a 'USt.-IdNr.' (and, as tie-breaker, an 'ID') end up
# at the bottom. With keep="first", the row kept for each
# (Firmenname, Land, Ort) group is therefore the one that sorts first, i.e. a row
# that has these identifiers is preferred over one where they are missing.
sorted_duplicates = ulist.sort_values(["USt.-IdNr.", "ID"], na_position="last")

# Drop the duplicates, then restore the original row order and renumber the index.
ulist = (
    sorted_duplicates
    .drop_duplicates(subset=["Firmenname", "Land", "Ort"], keep="first")
    .sort_index()
    .reset_index(drop=True)
)

In [543]:
# Author's note: both rows below actually belong to R.WEISS Packaging GmbH & Co. KG,
# although their normalised names differ.
ulist[ulist["Firmenname"].str.startswith('rweiss')]

Unnamed: 0,Firmenname,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID
24,rweissmaschinenbaugmbh,DEUTSCHLAND,74564,crailsheim,ZUR FLÜGELAU 28-30,DE190976983,181817
25,rweissautomationgmbhcokg,DEUTSCHLAND,74564,crailsheim,ZUR FLÜGELAU 28,,803402


In [569]:
# Legal-form markers to look for in the normalised (lower-cased, punctuation-free)
# company names.
# Caveat: a company with the 'e.K' suffix is hard to catch at this point, e.g.
# "Elektro-Netzwerk Ramsauer e.K" vs. "Bezirksamt Wandsbek" both end in 'ek'.
# Each entry needs its own comma: adjacent string literals are concatenated by
# Python, so a missing comma silently merges two markers into one.
suffix = ['gmbh', 'gmbhco', 'gmbhcokg', 'cokg', 'ag', 'mbh', 'ek']   # at the end of a company name -> .str.endswith()
middle = ['gmbh', 'stiftung', 'holding']    # can be found in the middle of a company name -> .str.contains()   (+ city names)

I might remove the suffixes from the company names before converting the names to lower case and removing special characters, because once spaces and punctuation are removed it is difficult to tell whether the ending of a name is a legal-form suffix or just part of a word. For example, this happens when searching for company names ending with 'e.K'.

In [568]:
# List every normalised name ending in 'ek' to show why the 'e.K' suffix is
# ambiguous after the clean-up.
ulist[ulist['Firmenname'].str.endswith('ek')]

Unnamed: 0,Firmenname,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID
97,elektronetzwerkramsauerek,DEUTSCHLAND,84149,velden,Ziegeleistraße 20,DE128911717,171228.0
177,trigionbrandenbeveiligingstechniek,NIEDERLANDE,1014 BM,amsterdam,Zekeringstraat 1,,331059.0
1581,challengeinternetworkingcomputerconsultingek,DEUTSCHLAND,82131,stockdorf,Südstr. 8,,804331.0
1889,tischlereiberndvolquardsenek,DEUTSCHLAND,25866,husumrosendahl,Stampmöhlenkamp 3,,452509.0
1982,amtsgerichthamburgbarmbek,DEUTSCHLAND,22083,hamburg,Spohrstraße 6,,
2493,bezirksamtwandsbek,DEUTSCHLAND,22041,hamburg,Schloßstraße 8 g,,804064.0
2709,amtsgerichthamburgwandsbek,DEUTSCHLAND,22041,hamburg,Schädlerstraße 28,,804643.0
2811,luxlogistikek,,66450,bexbach,Saarpfalz Park 109,,
3953,koninklijkebibliotheek,NIEDERLANDE,2509 LK,sgravenhage,Postbus 90407,,1250989.0
4810,hofmannbestattungeninhsandrairlmeierek,DEUTSCHLAND,35305,grünberg,Neupforte 18,,850907.0


In [1]:
#ulist[ulist['Firmenname'].str.endswith(tuple(suffix))]

In [2]:
#ulist[ulist['Firmenname'].str.contains('stiftung')]

In [None]:
# Next step is including companies from other data sets

# My questions so far
1. This dataset includes mostly German companies, while the other datasets contain no German companies, so the mapping from this dataset will be mostly one-to-one. Is that expected, or are there companies in the other datasets that should be matched with companies in this dataset even though they are in different countries?
2. Country names are in German in this dataset. Should I convert them to English?
3. Are the USt.-IdNr. and ID columns in this dataset important, or will they be dropped?
4. What should the many-to-one function return along with the company name: country, city, etc.?