In [1]:
import pandas as pd
import numpy as np
import yaml
import re
from difflib import SequenceMatcher    # https://docs.python.org/3/library/difflib.html#difflib.get_close_matches
# !pip install cleanco
# import cleanco   # would be useful

# Unternehmensliste Interflex

## 1. Dataset info and preprocessing

In [216]:
# Load the raw Interflex company list; the path is relative to the current working directory.
ulist = pd.read_excel("Unternehmensliste_Interflex.xlsx")
ulist.head()

Unnamed: 0,Firmenname,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID
0,Bistum Essen,Deutschland,45127,Essen,Zwölfling 16,,801695.0
1,KBS Kleider Bauer Betriebs-GmbH,Österreich,2380,Perchtoldsdorf,Zwingenstr. 5,,850801.0
2,Elektro Ing-Plan GmbH Dresden,Deutschland,1187,Dresden,Zwickauer Straße 88,,851014.0
3,Netzdesign-Vobornik,Deutschland,71083,Herrenberg,Zwickauer Strasse 41,,
4,NILES-SIMMONS Industrieanlagen GmbH,Deutschland,9117,Chemnitz,Zwickauer Straße 355,DE140853999,802541.0


In [217]:
# pipeline: rename the German headers to English and normalise the text columns
ulist = ulist.rename(columns={'Firmenname': 'Company_name', 'Land': 'Country', 'Postleitzahl': 'Zipcode', 'Ort': 'City', 'Straße und Hausnr.': 'Address'})

# Patterns are raw strings with regex=True spelled out: '\W' inside a plain string is an
# invalid escape sequence, and pandas >= 2.0 treats str.replace patterns as literal text
# unless regex=True is passed (the cleaning would then silently do nothing).
ulist["Country"] = ulist["Country"].str.upper()
ulist["Country"] = ulist["Country"].str.replace(r'\W', '', regex=True).astype('category')
ulist["City"] = ulist["City"].str.lower()
ulist["City"] = ulist["City"].str.replace(r'\W', '', regex=True)
ulist['USt.-IdNr.'] = ulist['USt.-IdNr.'].str.replace(" ", "", regex=False)        # white space removed
# NOTE(review): astype('str') renders a float such as 801695.0 as '801695.0'; stripping the
# non-digits would then give '8016950'. Confirm how ID is typed after read_excel.
ulist['ID'] = ulist['ID'].astype('str').str.replace(r'[^0-9]', '', regex=True)    # ID with only numbers
# ulist['ID'] = ulist['ID'].astype('str').replace(r'\D+', '', regex=True)
# Missing IDs became the string 'nan' above and are now empty strings; turn them back into NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
ulist['ID'] = ulist['ID'].replace(r'^\s*$', np.nan, regex=True)      # to keep missing values instead of empty strings

ulist.insert(loc=1, column='Companyname_clean', value=ulist["Company_name"].str.lower())   # New Company name column to standardize

In [218]:
#from pathlib import Path
#yaml_dict = yaml.safe_load(Path("countries_by_namede.yml.txt", encoding='utf8').read_text())

In [219]:
with open('countries_by_namede.yml.txt', 'rt', encoding='utf8') as file:
    # safe_load instead of yaml.load(file): calling yaml.load without an explicit Loader is
    # deprecated since PyYAML 5.1 and raises TypeError on PyYAML 6; safe_load also refuses
    # to construct arbitrary Python objects from the file.
    yaml_dict = yaml.safe_load(file)

# https://stackoverflow.com/questions/58340498/reading-yaml-file-in-python-with-accents-and-special-charactets

country_dict = yaml_dict.get('de').get('countries')

# https://gist.githubusercontent.com/pex/3153011/raw/859b1b8c1d3ff5c93b5e5f2aa02694ee404141d7/countries_by_name.de.yml
# Invert the mapping (value -> key) and upper-case both sides so it can be used as a lookup
# on the upper-cased Country column. Presumably the file maps English -> German names, which
# makes the result German -> English; verify against the YAML file.
country_dict = {y: x for x, y in country_dict.items()}
country_dict = {k.upper():v.upper() for k,v in country_dict.items()}

  


In [220]:
# Manual additions for Country values the YAML lookup does not cover: concatenated
# "English+German" spellings found in the data plus a few alternative German names.
# 'SOUTHKOREAKOREASÜD' previously mapped to the truncated 'SOUTHKORE'; like every other
# concatenated key it now maps to the English half of the key.
country_dict.update({'MALTAMALTA':'MALTA', 'PUERTORICOPUERTORICO': 'PUERTORICO', 'THAILANDTHAILAND': 'THAILAND', 'ISRAELISRAEL': 'ISRAEL',
                     'JORDANJORDANIEN': 'JORDAN', 'MALAYSIAMALAYSIA': 'MALAYSIA', 'GREECEGRIECHENLAND': 'GREECE', 'MACEDONIAMAZEDONIEN': 'MACEDONIA',
                     'KUWAITKUWAIT': 'KUWAIT', 'SINGAPORESINGAPUR': 'SINGAPORE', 'QATARKATAR': 'QATAR', 'KAZAKHSTANKASACHSTAN': 'KAZAKHSTAN',
                    'JAPANJAPAN': 'JAPAN', 'SOUTHKOREAKOREASÜD': 'SOUTHKOREA', 'VIETNAMVIETNAM': 'VIETNAM', 'SERBIASERBIEN': 'SERBIA',
                    'PHILIPPINESPHILIPPINEN': 'PHILIPPINES', 'NEWZEALANDNEUSEELAND': 'NEWZEALAND', 'RUSSLAND': 'RUSSIANFEDERATION', 'UA': 'UNITEDARAB_EMIRATES',
                     'TSCHECHIEN': 'CZECHREPUBLIC', 'VEREINIGTEEMIRATE': 'UNITEDARAB_EMIRATES', 'KÖNIGREICHSAUDIARABIEN': 'SAUDIARABIA', 'ESTLAND': 'ESTONIA'})

In [221]:
# list(set(ulist['Country'].unique()) - set(country_dict.keys()))
# sorted(list(set(country_dict.keys())- set(ulist['Country'].unique())))
# Look the countries up once and reuse the result for both counts and for the row filter.
country_mapped = ulist['Country'].map(country_dict)
country_mapped_with_fallback = country_mapped.fillna(ulist['Country'])
print('Number of missing values in country column before=', country_mapped_with_fallback.isna().sum())
print('Number of missing values in country column after mapping Eng-De country names=', country_mapped.isna().sum())
# Rows that have a country value but no entry in country_dict.
ulist.loc[ulist['Country'].isna() != country_mapped.isna()]


Number of missing values in country column before= 1073
Number of missing values in country column after mapping Eng-De country names= 1085


Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
994,BMW Sverige Aktiebolag,bmw sverige aktiebolag,SWEDEN,191 27,sollentuna,Vetenskapsvägen 10,SE556313500201,171249.0
2158,OKG Aktiebolag,okg aktiebolag,SWEDEN,57283,oskarshamm,Simpevarp,SE556063372801,170964.0
5242,QUALITY INTERIORS & AUTOMATION SPECIALIST,quality interiors & automation specialist,TURKSCAICOSISLANDS,TKCA 1ZZ,turkscaicosislands,"MORRIS PLAZA, SUITE #8",,850539.0
5896,ABB Power Grids Sweden AB,abb power grids sweden ab,SWEDEN,77180,ludvika,Lyviksvägen 3,,
6114,PT. Schaeffler Bearings Indonesia,pt. schaeffler bearings indonesia,INDONESIA,12920,jakarta,Lippo Kuningan 19th Floor Unit A & F,,
9570,Aviator,aviator,SWEDEN,19046,stockholm,Generatorgatan 11,,165160.0
10020,Brose Sweden AB,brose sweden ab,SWEDEN,42337,,Flygfältsgatan 4,,135228.0
11352,Boehringer Ingelheim Colombia,boehringer ingelheim colombia,COLOMBIA,,bogotadc,Carrera 11 N 84-09 Piso 5 Torre Sur,860000753-8,804487.0
13580,Bombardier Transportation Sweden AB,bombardier transportation sweden ab,SWEDEN,SE-721 73,västeras,Accounts Payable Västeras,,452518.0
13910,Schaeffler Philippines Inc,schaeffler philippines inc,PHILIPPINES,1229,makaticity,221 Salcedo Street,,851866.0


Country names of remaining 12 rows are already in English!

In [222]:
# Translate via country_dict; values without a dictionary entry keep their original spelling.
ulist['Country'] = ulist['Country'].map(country_dict).fillna(ulist['Country'])

In [223]:
ulist.duplicated().sum()   # number of rows that are exact duplicates of an earlier row (16 in the recorded run)

16

In [224]:
ulist.nunique()    # distinct values per column; Country has few distinct values, so it can be stored as a category

Company_name         14193
Companyname_clean    14154
Country                 67
Zipcode               4946
City                  3748
Address              12702
USt.-IdNr.            4242
ID                   10850
dtype: int64

In [225]:
ulist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14550 entries, 0 to 14549
Data columns (total 8 columns):
Company_name         14550 non-null object
Companyname_clean    14550 non-null object
Country              13477 non-null object
Zipcode              14238 non-null object
City                 13902 non-null object
Address              14111 non-null object
USt.-IdNr.           4849 non-null object
ID                   10873 non-null object
dtypes: object(8)
memory usage: 909.5+ KB


In [226]:
# Store Country as a categorical; a dtype mapping converts only that column.
# Conversions the author left disabled:
#   Zipcode=ulist['Zipcode'].astype('Int64')
#   ID=ulist['ID'].astype('Int64')   # not working
ulist = ulist.astype({'Country': 'category'})   # .info()

In [227]:
ulist.info()   # memory usage decreased by categorizing country (Country) column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14550 entries, 0 to 14549
Data columns (total 8 columns):
Company_name         14550 non-null object
Companyname_clean    14550 non-null object
Country              13477 non-null category
Zipcode              14238 non-null object
City                 13902 non-null object
Address              14111 non-null object
USt.-IdNr.           4849 non-null object
ID                   10873 non-null object
dtypes: category(1), object(7)
memory usage: 813.1+ KB


**Aim is to remove suffix and rest** ==> E.g: Sensient Imaging Technologies GmbH Chemiepark Bitterfeld-Wolfen

In [228]:
# ulist[ulist['Company_name'].str.endswith(tuple(suffix_list))] some finishes with gmbh some -gmbh and some even continues

In [229]:
# Idtrack is a 1-based running row number (presumably used to trace rows back after cleaning).
ulist = ulist.assign(Idtrack=range(1, len(ulist) + 1))
# Snapshot of the frame taken before the company names are standardised.
ulist_original = ulist.copy()

## 1.1 Dataset Standardization
**Prefix, Middle, Suffix removal. Column Standardization**

In [230]:
# Suffix list for company names   (ordering is important! Will be fixed)
# suffix_remover walks this list front to back and stops at the first entry it finds, so
# earlier entries win. Consequences worth knowing:
#   * several entries are listed more than once (e.g. 'gmbh', 's.a.', 'as'); only the first
#     occurrence can ever take effect.
#   * entries that contain a space (e.g. 'gmbh & co. kg', 'pty ltd') can never equal a single
#     whitespace-delimited token of a company name.
# The Czech entry used to be the mis-decoded u've\xc5\x99. obch. spol.' (UTF-8 bytes read as
# Latin-1); it is now the intended 'veř. obch. spol.'.
suffix_list = ['gmbh', 'gmbh&co.', 'ggmbh', 'gmbh+co.', 'kg-gmbh', 'gmbh&co', 'ltd.', 'se',
      'gmbh&co.kg', 'gmbh&cokg', 'gmbh.', 'gmbh,', 'gmbh&c', '(gmbh', 'mbh',
               'company', 'incorporated', 'corporation', 'corp.', 'corp', 'inc',
      '& co.', '& co',  'inc.', 's.p.a.', 'n.v.', 'a.g.', 'ag', 'nuf', 's.a.', 's.f.',
      'oao', 'co.', 'co',
              'soc.col.', 'stg', 'd.n.o.', 'ltda.', 'v.o.s.', 'a spol.',
      'veř. obch. spol.', 'kgaa', 'o.e.', 's.f.', 's.n.c.', 's.a.p.a.', 'j.t.d.',
      'v.o.f.', 'sp.j.', 'og', 'sd', ' i/s', 'ay', 'snc', 'oe', 'bt.', 's.s.', 'mb',
      'ans', 'da', 'o.d.', 'hb', 'pt',
              'unltd', 'ultd', 'sal', 'unlimited', 'saog', 'saoc', 'aj',
      'yoaj', 'oaj', 'akc. spol.', 'a.s.',
              'esv', 'gie', 'kv.', 'qk',
              'pty. ltd.', 'pty ltd', 'ltd', 'l.t.d.', 'bvba', 'd.o.o.', 'ltda', 'gmbh',
      'g.m.b.h', 'kft.', 'kht.', 'zrt.', 'ehf.', 's.a.r.l.', 'd.o.o.e.l.', 's. de r.l.',
      'b.v.', 'tapui',
      'sp. z.o.o.', 'sp. z o.o.', 'spółka z o.o.',
      's.r.l.', 's.l.', 's.l.n.e.', 'ood', 'oy', 'rt.',
      'teo', 'uab', 'scs', 'sprl', 'limited', 'bhd.', 'sdn. bhd.', 'sdn bhd', 'as',
      'lda.', 'tov', 'pp',
              'pllc', 'llc', 'l.l.c.', 'plc.', 'plc', 'hf.', 'oyj',
      'a.e.', 'nyrt.', 'p.l.c.', 'sh.a.', 's.a.', 's.r.l.', 'srl.', 'srl', 'aat', '3at', 'd.d.',
      's.r.o.', 'spol. s r.o.', 's.m.b.a.', 'smba', 'sarl', 'nv', 'sa', 'aps',
      'a/s', 'p/s', 'sae', 'sasu', 'eurl', 'ae', 'cpt', 'as', 'ab', 'asa', 'ooo', 'dat',
      'vat', 'zat', 'mchj', 'a.d.',
              'lllp', 'l.l.l.p.',
              'llp', 'l.l.p.', 'sp.p.', 's.c.a.', 's.c.s.',
              'gmbh & co. kg', 'lp', 'l.p.', 's.c.s.',
      's.c.p.a', 'comm.v', 'k.d.', 'k.d.a.', 's. en c.', 'e.e.', 's.a.s.', 's. en c.',
      'c.v.', 's.k.a.', 'sp.k.', 's.cra.', 'ky', 'scs', 'kg', 'kd', 'k/s', 'ee', 'secs',
      'kda', 'ks', 'kb','kt',
              'sicav',
              'nl',
              'vzw', 'ses.', 'gte.',
              'private', 'pte', 'xk',
              'p.c.', 'vof', 'snc',
              'pllc', 'p.l.l.c.',
              'e.u.', 's.p.', 't:mi', 'tmi', 'e.v.', 'e.c.', 'et', 'obrt',
      'fie', 'ij', 'fop', 'xt']

# Source of company suffices https://github.com/psolin/cleanco/blob/master/cleanco/termdata.py

In [231]:
def suffix_remover(companyname, suffixes=None):    # Running multiple times (2 is good) removes suffix effectively
    """Strip the first matching legal-form token from a lower-cased company name.

    The candidates are tried in list order and the function returns on the first
    one that matches, so only one token is handled per call:

    * candidate is the first word          -> everything after it is returned
      ('as watson' -> 'watson');
    * candidate is any other word          -> everything before its first
      occurrence is returned ('x ag y' -> 'x');
    * candidate is a whole '-'-separated part -> the parts before it are
      returned, joined with single spaces ('a b-gmbh' -> 'a b').

    Parameters
    ----------
    companyname : str
        Company name, expected to be lower-cased already.
    suffixes : iterable of str, optional
        Candidates to look for, in priority order. Defaults to the
        module-level ``suffix_list``.

    Returns
    -------
    str
        The shortened name, or ``companyname`` unchanged when nothing matches
        or when the name contains no words at all.
    """
    if suffixes is None:
        suffixes = suffix_list

    # Tokenise once instead of re-splitting for every candidate.
    words = companyname.split()
    if not words:
        # Empty / whitespace-only names have no first word; indexing [0] would raise IndexError.
        return companyname
    hyphen_parts = companyname.split('-')

    for suffix in suffixes:
        if suffix == words[0]:
            # Legal form used as a prefix: keep the rest of the name.
            return ' '.join(words[1:])
        if suffix in words:
            # Token match: keep what precedes the first occurrence.
            return ' '.join(words[:words.index(suffix)])
        if suffix in hyphen_parts:
            # Hyphen-attached legal form such as 'betriebs-gmbh'.
            return ' '.join(hyphen_parts[:hyphen_parts.index(suffix)])
    return companyname

# One pass over the column; each call strips at most one legal-form token per name.
ulist['Companyname_clean'] = ulist['Companyname_clean'].apply(suffix_remover)
ulist['Companyname_clean'].head(10)

0                      bistum essen
1        kbs kleider bauer betriebs
2                  elektro ing-plan
3               netzdesign-vobornik
4    niles-simmons industrieanlagen
5                    gemac chemnitz
6                            emdion
7         fiege logistik stiftung &
8              william prym holding
9                      leoni kerpen
Name: Companyname_clean, dtype: object

In [232]:
# Bug fixed
# Demonstration of why the suffix is located by token and not by character position:
# 'ag' also occurs inside 'volkswagen', so slicing at the first character match cuts the name there.
companyname='volkswagen financial services ag'
i='ag'
# Old approach: character index of the first 'ag' substring.
if i in companyname.split():
    print(companyname[:companyname.index(i)-1])

# Current approach: index of the 'ag' token in the word list.
if i in companyname.split():
    print(' '.join(companyname.split()[:companyname.split().index(i)]))

volks
volkswagen financial services


In [233]:
# Prefix updated
# Demonstration of the prefix rule: when a legal-form token is the first word of the name,
# the rest of the name is kept; otherwise everything before the token is kept.
companyname='as watson'
pref=['as', 'gmbh', 'ds']

for i in pref:
    if i == companyname.split()[0]:
        print(' '.join(companyname.split()[companyname.split().index(i)+1:]))
    elif i in companyname.split():
        print(' '.join(companyname.split()[:companyname.split().index(i)]))

watson


In [234]:
# Sanity check: rows whose cleaned name was stripped down to an empty string.
ulist[ulist['Companyname_clean'] == '']

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID,Idtrack


In [235]:
ulist[ulist['Company_name'] == 'AS Watson']

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID,Idtrack
4195,AS Watson,watson,NETHERLANDS,6666 LT,heteren,Poort van Midden Gelderland Rood 24,,331187,4196
4929,AS Watson,watson,NETHERLANDS,3972,darenswoude,Nijborg 17,,803522,4930


## Remaining suffix problem

In [236]:
print('Remaining suffix as gmbh (most common):', len(ulist[ulist['Companyname_clean'].str.contains('gmbh')]))

Remaining suffix as gmbh (most common): 41


In [237]:
ulist.iloc[564]

Company_name         BOSCH SICHERHEITSSYSTEME GMB H
Companyname_clean    bosch sicherheitssysteme gmb h
Country                                     GERMANY
Zipcode                                       85630
City                                      grasbrunn
Address                  WERNER-VON-SIEMENS-RING 10
USt.-IdNr.                              DE813474672
ID                                           902003
Idtrack                                         565
Name: 564, dtype: object

In [238]:
# ulist[ulist['Companyname_clean'].str.contains('gmbh')]   # entries with typo
# gmbh remove

# [x.replace('gmbh', '') for x in ulist['Companyname_clean'] if any(tag in x for tag in remove_list)]   # doesn't work

In [239]:
# Raw pattern + explicit regex=True: pandas >= 2.0 treats str.replace patterns as literal text by default,
# which would leave the names untouched.
ulist["Companyname_clean"] = ulist["Companyname_clean"].str.replace(r'\W', '', regex=True)      # special characters are removed, white space etc.

In [240]:
# Remove some prefixes, infixes and suffixes from all company names (clear typos: operational servicesGmbH & Co. KG)
remove_list = ['gmbh']   # stopwords approach, what about stiftung, holding etc.

def pre_middle_suffix_remove(companyname, terms=None):   # applied after special characters (+ white spaces) are removed
    """Delete every occurrence of each stop term from a company name.

    Works on plain substrings, so a term is removed wherever it occurs in the
    name (start, middle or end).

    Parameters
    ----------
    companyname : str
        Company name to clean.
    terms : iterable of str, optional
        Substrings to delete. Defaults to the module-level ``remove_list``.

    Returns
    -------
    str
        The name with all listed terms removed; unchanged if none occurs.
    """
    if terms is None:
        terms = remove_list
    # Apply every term. The earlier version returned after the first term that
    # matched, so additional entries in the list were never removed.
    for term in terms:
        companyname = companyname.replace(term, '')
    return companyname

# Substring clean-up over the whole column.
ulist['Companyname_clean'] = ulist['Companyname_clean'].apply(pre_middle_suffix_remove)

In [241]:
# List of gmbh contained entries: [x for x in ulist['Companyname_clean'] if any(tag in x for tag in remove_list)]
print('Original Companyname:', ulist.loc[554, 'Company_name'])   # An Example with GmbH
print('Cleaned Companyname:', ulist.loc[554, 'Companyname_clean'])
print('Number of gmbh left:', len(ulist[ulist['Companyname_clean'].str.contains('gmbh')]))

Original Companyname: Technimark-Eisbär GmbHKunststoff- und Metallverarb.
Cleaned Companyname: technimarkeisbärkunststoffundmetallverarb
Number of gmbh left: 0


In [242]:
# a = list(set(ulist[ulist.loc[:,['Companyname_clean', 'Country', 'City']].duplicated()].index) - set(ulist[ulist.loc[:,['Companyname_clean', 'Country', 'Zipcode', 'City']].duplicated()].index))
# a.sort()
# ulist.loc[a]

## 2. Duplicates and Grouping them together

In [243]:
# Sort so that rows lacking 'USt.-IdNr.' (and then 'ID') come last; keep="first" therefore
# prefers, within each duplicate group, a row that has these identifiers filled in.
sorted_duplicates = ulist.sort_values(by=["USt.-IdNr.", "ID"], na_position='last')      # move NAs to the bottom of the df
# Two rows count as the same company when cleaned name, country and city all agree.
ulist2 = (
    sorted_duplicates
    .drop_duplicates(subset=["Companyname_clean", "Country", "City"], keep="first")   # drop duplicates
    .sort_index()                # back to the original row order
    .reset_index(drop=True)
)

In [262]:
len(ulist)

14550

In [245]:
#a = list(set(sorted_duplicates.index) - set(sorted_duplicates.drop_duplicates(subset=["Companyname_clean", "Country", "City"],
#                                     keep="first").index))
# sorted_duplicates.iloc[a]

In [246]:
# Group sizes, largest first. In the recorded run the first 515 groups have more than one entry;
# the slice takes 516 rows so the first single-entry group is visible as the cut-off.
ulist.groupby(['Companyname_clean', 'Country', 'City']).size().sort_values(ascending=False)[:516]  # 515 companies have multiple entries

Companyname_clean          Country      City         
freieundhansestadthamburg  GERMANY      hamburg          6
landeshauptstadtstuttgart  GERMANY      stuttgart        5
fraport                    GERMANY      frankfurt        5
hiltideutschland           GERMANY      kaufering        5
interflexdatensysteme      GERMANY      stuttgart        4
                                                        ..
uniquesicherheitsdienste   SWITZERLAND  zürich           2
omoda                      NETHERLANDS  zierikzee        2
airbusdeutschland          GERMANY      hamburg          2
rossmannonline             GERMANY      großburgwedel    2
ewaldgelatine              GERMANY      badsobernheim    1
Length: 516, dtype: int64

In [247]:
# manytoone_dict=ulist.groupby(['Companyname_clean', 'Country', 'City'], dropna=False).apply(lambda x: x['Company_name'].tolist()).to_dict()
#dropna=False doesn't work in my pandas version. So, another approach==>

In [248]:
# NA values labeled as -
# '-' has to be a known category before fillna("-") can be used on the categorical column.
# The guard keeps the cell re-runnable: add_categories raises ValueError if '-' already exists.
if '-' not in ulist['Country'].cat.categories:
    ulist['Country'] = ulist['Country'].cat.add_categories('-')
# Map (cleaned name, country, city) -> list of the original company names in that group.
# Missing city/country are replaced by '-' so those rows are not dropped by groupby.
manytoone_dict = (
    ulist
    .assign(City=lambda ulista: ulista.City.fillna("-"))
    .assign(Country=lambda ulista: ulista.Country.fillna("-"))
    .groupby(['Companyname_clean', 'Country', 'City'])
    .apply(lambda x: x['Company_name'].tolist())
    .to_dict()
)

In [261]:
# First 5 items in the dictionary. Not sure if it is suitable for the end function
dict(list(manytoone_dict.items())[:5])

{('1001artikelmedical', 'GERMANY', 'riesa'): ['1001 ARTIKEL MEDICAL GmbH'],
 ('11880solutions', 'GERMANY', 'rostock'): ['11 88 0 Solutions AG'],
 ('118811dienummer', 'AUSTRIA', 'wien'): ['118811 Die Nummer GmbH '],
 ('11telecommunication', 'GERMANY', 'montabaur'): ['1&1 Telecommunication SE'],
 ('11telecommunicaton',
  'GERMANY',
  'montabaur'): ['1 & 1 Telecommunicaton AG Montabaur Zentraler Rechnungseingang']}

The next step is to group duplicated entries under a unique company name, and then to apply the same methods to companies from the other data sets: first remove the suffixes, then search for duplicated companies using the company name, country and city columns. My goal is to find duplicates so that I can group them under a unique company name. After that, I will append the remaining unique company names to a final dataframe or dictionary. Later, I can come back, group the duplicated entries for each unique company, and create a many-to-one mapping function.

In the end, inputting a company name should return a unique company name (plus information such as country, city, revenue?), based on grouping the varying names of the same company together and on a rule-based approach.

For example:
Input (Volkswagen) ==> Volkswagen AG   based on suffix (adding suffix and searching)

Input (Volkswagen Aktiengesellschaft) ==> Volkswagen AG    (based on grouping same companies)

Input (VW) ==> Volkswagen AG   based on a rule

In [27]:
ulist[ulist["Companyname_clean"].str.startswith('rweiss')]   # Actually both belongs to R.WEISS Packaging GmbH & Co. KG

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
24,R. Weiss Maschinenbau GmbH,rweissmaschinenbau,DEUTSCHLAND,74564,crailsheim,ZUR FLÜGELAU 28-30,DE190976983,181817,26
25,R. Weiss Automation GmbH & Co. KG,rweissautomation,DEUTSCHLAND,74564,crailsheim,ZUR FLÜGELAU 28,,803402,28


In [28]:
# sintmaartenskliniek   # city different but everything else same

In [29]:
# ulist.iloc[7659]   # general example

In [30]:
ulist[ulist['Company_name'].str.contains('Volkswagen')]    # volksw'ag'en 'ag' case "ag" now catches correct ag (suffix one)

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
212,Volkswagen Group United kingdom Limited,volkswagengroupunitedkingdom,GROSSBRITANNIEN,MK14 5AN,miltonkeynes,Yeomans Drive,,805690.0,221
741,Volkswagen Automobile Stuttgart GmbH,volkswagenautomobilestuttgart,DEUTSCHLAND,70188,stuttgart,Wangener Str. 66,,,768
969,Volkswagen Original Teile Logistik GmbH & Co. KG,volkswagenoriginalteilelogistik,DEUTSCHLAND,34225,baunatal,Vertriebszentrum West,DE230960046,803202.0,1004
1060,Volkswagen Original Teile LogiSüdwest / Franke...,volkswagenoriginalteilelogisüdwestfranken,DEUTSCHLAND,97084,,Unterer Kirchbergweg 65,,170970.0,1097
1087,Volkswagen Infotainment GmbH,volkswageninfotainment,DEUTSCHLAND,44799,bochum,Universitätsstraße 140,DE295504619,803983.0,1126
2476,Volkswagen Business Services GmbH I-SEC; Frau ...,volkswagenbusinessservices,DEUTSCHLAND,38112,braunschweig,Schmalbachstraße 1,DE171252317,160936.0,2567
2478,Volkswagen Financial Services Digital Solution...,volkswagenfinancialservicesdigitalsolutions,DEUTSCHLAND,38112,braunschweig,Schmalbachstraße 1,DE260043656,800056.0,2569
3736,Volkswagen Immobilien GmbH,volkswagenimmobilien,DEUTSCHLAND,38440,wolfsburg,Poststr. 28,,,3883
6833,Volkswagen Osnabrück GmbH,volkswagenosnabrück,DEUTSCHLAND,49084,osnabrück,KARMANNSTRAßE 1,,803445.0,7095
7652,Volkswagen Financial Services AG,volkswagenfinancialservices,DEUTSCHLAND,38112,braunschweig,IH-IMC,DE811115544,803041.0,7942


**Think/ask Aktiengesellschaft == AG rule based or grouped same companies**

In [31]:
ulist.head()

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
0,Bistum Essen,bistumessen,DEUTSCHLAND,45127,essen,Zwölfling 16,,801695.0,1
1,KBS Kleider Bauer Betriebs-GmbH,kbskleiderbauerbetriebs,ÖSTERREICH,2380,perchtoldsdorf,Zwingenstr. 5,,850801.0,2
2,Elektro Ing-Plan GmbH Dresden,elektroingplan,DEUTSCHLAND,1187,dresden,Zwickauer Straße 88,,851014.0,3
3,Netzdesign-Vobornik,netzdesignvobornik,DEUTSCHLAND,71083,herrenberg,Zwickauer Strasse 41,,,4
4,NILES-SIMMONS Industrieanlagen GmbH,nilessimmonsindustrieanlagen,DEUTSCHLAND,9117,chemnitz,Zwickauer Straße 355,DE140853999,802541.0,5


In [32]:
# input 2-5
# output 1st column add

# Next Steps
1. Working on prefixes:
      Some names start with a legal-form token (if a suffix is the first word of a string, keep the rest of that string: [companyname.split().index(i)+1:]). Done!
2. Converting the column names and the Land (Country) column to English. Done!
3. Creating a dictionary to map many to one (groupby or duplicates). Working on it... (Before that, I want to fill in missing city-country pairs so that identical companies can be grouped more effectively.)
4. A function that takes an input and returns a unique company name and its info
5. Other datasets

In [255]:
# https://raw.githubusercontent.com/shivammathur/countrycity/main/data/geo.json

# Questions
1. Aktiengesellschaft == AG: how can I catch it? Should I add Aktiengesellschaft to the list of substrings to be removed, or would a special rule be better in this case? (Maybe I can create a dictionary for cases like AG <==> Aktiengesellschaft, but it might be difficult to generalize to other acronym-definition pairs.)
2. Many countries are missing. This is a problem for finding identical companies in the dataset. Should I try to fill them in as accurately as possible?
3. Missing values in the country and city columns make it difficult to group identical companies. How should I approach these missing entries?