In [1]:
import pandas as pd
import numpy as np
import yaml
import json
import re
from difflib import SequenceMatcher    # https://docs.python.org/3/library/difflib.html#difflib.get_close_matches
# !pip install cleanco
# import cleanco   # would be useful

# Unternehmensliste Interflex

## 1. Dataset info and preprocessing

In [691]:
# Load the raw company list and translate the German column headers to English.
column_translations = {
    'Firmenname': 'Company_name',
    'Land': 'Country',
    'Postleitzahl': 'Zipcode',
    'Ort': 'City',
    'Straße und Hausnr.': 'Address',
}
ulist = pd.read_excel("Unternehmensliste_Interflex.xlsx").rename(columns=column_translations)
ulist.head()

Unnamed: 0,Company_name,Country,Zipcode,City,Address,USt.-IdNr.,ID
0,Bistum Essen,Deutschland,45127,Essen,Zwölfling 16,,801695.0
1,KBS Kleider Bauer Betriebs-GmbH,Österreich,2380,Perchtoldsdorf,Zwingenstr. 5,,850801.0
2,Elektro Ing-Plan GmbH Dresden,Deutschland,1187,Dresden,Zwickauer Straße 88,,851014.0
3,Netzdesign-Vobornik,Deutschland,71083,Herrenberg,Zwickauer Strasse 41,,
4,NILES-SIMMONS Industrieanlagen GmbH,Deutschland,9117,Chemnitz,Zwickauer Straße 355,DE140853999,802541.0


**Dataset Info**

In [692]:
ulist.duplicated().sum()   # 15 rows are exact duplicates of an earlier row in the raw data

15

In [693]:
ulist.nunique()    # distinct values per column; Country has only 73, so it can be stored as a category

Company_name    14193
Country            73
Zipcode          4946
City             4239
Address         12702
USt.-IdNr.       4427
ID              10852
dtype: int64

In [694]:
ulist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14550 entries, 0 to 14549
Data columns (total 7 columns):
Company_name    14550 non-null object
Country         13477 non-null object
Zipcode         14238 non-null object
City            13902 non-null object
Address         14111 non-null object
USt.-IdNr.      4849 non-null object
ID              10875 non-null object
dtypes: object(7)
memory usage: 795.8+ KB


In [695]:
# Store Country as a categorical column to save memory.
# (Casting Zipcode / ID to the nullable 'Int64' dtype was tried as well; the ID cast was noted as not working.)
ulist = ulist.astype({'Country': 'category'})

In [696]:
ulist.info()   # memory usage decreased by categorizing country (Country) column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14550 entries, 0 to 14549
Data columns (total 7 columns):
Company_name    14550 non-null object
Country         13477 non-null category
Zipcode         14238 non-null object
City            13902 non-null object
Address         14111 non-null object
USt.-IdNr.      4849 non-null object
ID              10875 non-null object
dtypes: category(1), object(6)
memory usage: 699.4+ KB


In [697]:
# pipeline
# Text normalisation pipeline.
# Every pattern is a raw string and every Series.str.replace call states regex=True/False
# explicitly: pandas >= 2.0 treats the pattern as a literal by default, which would silently
# turn the '\W' clean-ups into no-ops. np.nan replaces np.NaN, which NumPy 2.0 removed.
ulist = ulist.astype({"Country": str, "City": str})   # missing values become the literal string 'nan'
ulist["Country"] = ulist["Country"].str.upper()       # ... and 'NAN' for Country
ulist["Country"] = ulist["Country"].str.replace(r'\W', '', regex=True).astype('category')
ulist["City"] = ulist["City"].str.lower()
# Special characters become white space so the field can be tokenized later,
# e.g. "hinterbergerstraße 13 4400 steyr österreich".
ulist["City"] = ulist["City"].str.replace(r'\W', ' ', regex=True)
# A field that is empty or consists only of white space counts as missing.
ulist['Zipcode'] = ulist['Zipcode'].replace(r'^\s*$', np.nan, regex=True)
ulist['Address'] = ulist['Address'].replace(r'^\s*$', np.nan, regex=True)
ulist['Address'] = ulist['Address'].str.strip()
ulist['USt.-IdNr.'] = ulist['USt.-IdNr.'].str.replace(" ", "", regex=False)        # white space removed
ulist['ID'] = ulist['ID'].astype('str').str.replace(r'[^0-9]', '', regex=True)     # keep digits only
ulist['ID'] = ulist['ID'].replace(r'^\s*$', np.nan, regex=True)      # keep missing values as NaN instead of empty strings

# Working copy of the company name that the later cells standardize step by step.
ulist.insert(loc=1, column='Companyname_clean', value=ulist["Company_name"].str.lower())

### 1.1 Translating country names from German to English

In [698]:
# Build the German -> English country-name mapper.
# Data source: https://gist.githubusercontent.com/pex/3153011/raw/859b1b8c1d3ff5c93b5e5f2aa02694ee404141d7/countries_by_name.de.yml
with open('countries_by_namede.yml.txt', 'rt', encoding='utf8') as file:
    # safe_load instead of load: yaml.load without an explicit Loader can construct arbitrary
    # Python objects and raises TypeError on PyYAML >= 6. This file only holds plain mappings.
    yaml_dict = yaml.safe_load(file)

# https://stackoverflow.com/questions/58340498/reading-yaml-file-in-python-with-accents-and-special-charactets

country_dict = yaml_dict['de']['countries']

# Invert the table (presumably English -> German in the file) so that the German name is the key,
# then normalise both sides the same way as the Country column: upper case, no spaces, no hyphens.
country_dict = {y: x for x, y in country_dict.items()}
country_dict = {k.upper(): v.upper() for k, v in country_dict.items()}
country_dict = {k.replace(' ', '').replace('-', ''): v.replace(' ', '').replace('-', '') for k, v in country_dict.items()}

# Additions for spellings that occur in the dataset but not in the YAML table.
country_dict.update({
    'MALTAMALTA': 'MALTA',
    'PUERTORICOPUERTORICO': 'PUERTORICO',
    'THAILANDTHAILAND': 'THAILAND',
    'ISRAELISRAEL': 'ISRAEL',
    'JORDANJORDANIEN': 'JORDAN',
    'MALAYSIAMALAYSIA': 'MALAYSIA',
    'GREECEGRIECHENLAND': 'GREECE',
    'MACEDONIAMAZEDONIEN': 'MACEDONIA',
    'KUWAITKUWAIT': 'KUWAIT',
    'SINGAPORESINGAPUR': 'SINGAPORE',
    'QATARKATAR': 'QATAR',
    'KAZAKHSTANKASACHSTAN': 'KAZAKHSTAN',
    'JAPANJAPAN': 'JAPAN',
    'SOUTHKOREAKOREASÜD': 'SOUTHKOREA',   # was 'SOUTHKORE' (truncated)
    'VIETNAMVIETNAM': 'VIETNAM',
    'SERBIASERBIEN': 'SERBIA',
    'PHILIPPINESPHILIPPINEN': 'PHILIPPINES',
    'NEWZEALANDNEUSEELAND': 'NEWZEALAND',
    'RUSSLAND': 'RUSSIANFEDERATION',
    'UA': 'UNITEDARAB_EMIRATES',
    'TSCHECHIEN': 'CZECHREPUBLIC',
    'VEREINIGTEEMIRATE': 'UNITEDARAB_EMIRATES',
    'KÖNIGREICHSAUDIARABIEN': 'SAUDIARABIA',
    'ESTLAND': 'ESTONIA',
})

  


In [699]:
# list(set(ulist['Country'].unique()) - set(country_dict.keys()))
# sorted(list(set(country_dict.keys())- set(ulist['Country'].unique())))
# Compare "country was missing" with "country has no entry in the mapper"; the rows where the
# two flags disagree are the ones whose country name is not covered by country_dict.
country_is_missing = ulist['Country'] == 'NAN'
country_is_unmapped = ulist['Country'].map(country_dict).isna()
print('Number of missing values in country column before=', int(country_is_missing.sum()))
print('Number of missing values in country column after mapping Eng-De country names=', country_is_unmapped.sum())
ulist.loc[country_is_missing != country_is_unmapped]

Number of missing values in country column before= 1073
Number of missing values in country column after mapping Eng-De country names= 1085


Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
994,BMW Sverige Aktiebolag,bmw sverige aktiebolag,SWEDEN,191 27,sollentuna,Vetenskapsvägen 10,SE556313500201,171249.0
2158,OKG Aktiebolag,okg aktiebolag,SWEDEN,57283,oskarshamm,Simpevarp,SE556063372801,170964.0
5242,QUALITY INTERIORS & AUTOMATION SPECIALIST,quality interiors & automation specialist,TURKSCAICOSISLANDS,TKCA 1ZZ,turks caicos islands,"MORRIS PLAZA, SUITE #8",,850539.0
5896,ABB Power Grids Sweden AB,abb power grids sweden ab,SWEDEN,77180,ludvika,Lyviksvägen 3,,
6114,PT. Schaeffler Bearings Indonesia,pt. schaeffler bearings indonesia,INDONESIA,12920,jakarta,Lippo Kuningan 19th Floor Unit A & F,,
9570,Aviator,aviator,SWEDEN,19046,stockholm,Generatorgatan 11,,165160.0
10020,Brose Sweden AB,brose sweden ab,SWEDEN,42337,,Flygfältsgatan 4,,135228.0
11352,Boehringer Ingelheim Colombia,boehringer ingelheim colombia,COLOMBIA,,bogota d c,Carrera 11 N 84-09 Piso 5 Torre Sur,860000753-8,804487.0
13580,Bombardier Transportation Sweden AB,bombardier transportation sweden ab,SWEDEN,SE-721 73,västeras,Accounts Payable Västeras,,452518.0
13910,Schaeffler Philippines Inc,schaeffler philippines inc,PHILIPPINES,1229,makati city,221 Salcedo Street,,851866.0


Country names of remaining 12 rows are already in English!

In [700]:
ulist['Country'] = ulist['Country'].map(country_dict).fillna(ulist['Country'])   # De-Eng country name mapper

### 1.2 Filling Missing Country names

In [701]:
# Missing values are still the sentinel strings 'NAN' (Country) and 'nan' (City) at this point.
country_missing = ulist['Country'] == 'NAN'
city_missing = ulist['City'] == 'nan'
print('Country missing but city is given=', int((country_missing & ~city_missing).sum()))
print('City is missing but country is given=', int((~country_missing & city_missing).sum()))
print('Both Country and city missing=', int((country_missing & city_missing).sum()))

Country missing but city is given= 903
City is missing but country is given= 478
Both Country and city missing= 170


Idea is:
1. Fixing city names by searching 'city' in long strings. For example city of Eqos Energie is hinterbergerstraße 13 4400 steyr österreich ==> steyr
2. If country is missing and city is actually a country name ==> city=country
3. If country missing but city given ==> use mapping dictionary
4. If country still missing but city is given ==> Most common country grouped by city names
5. If city is missing but country is given ==> Take country name as city or (fill with most common city grouped by country)
6. If both Country and city missing ==> ? Take from above entry or ignore the entry?

In [702]:
# World-cities reference table: https://simplemaps.com/data/world-cities
country_citydf = pd.read_csv('worldcities.csv', usecols=['country', 'city_ascii'])

# {COUNTRY: [city, ...]} with the same normalisation as the dataset columns:
# country names upper case, city names lower case, spaces and hyphens removed on both sides.
# groupby always yields a list of cities per country, so no scalar special case is needed.
country_city_dict = {
    country.upper().replace(' ', '').replace('-', ''): [
        city.lower().replace(' ', '').replace('-', '') for city in group["city_ascii"].tolist()
    ]
    for country, group in country_citydf.groupby("country")
}

geo_countries = list(country_city_dict.keys())
geo_cities = [city for cities in country_city_dict.values() for city in cities]   # flat list over all countries

### Cases

In [703]:
ulist_cases = ulist[(ulist['Country']=='NAN') & (ulist['City']!='nan')]    # all the entries with missing country
ulist_cases

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
41,SERAG-WIESSNER GmbH & Co. KG,serag-wiessner gmbh & co. kg,NAN,95119,naila,Zum Kugelfang 8-12,,
74,Schreinerei Löhle GmbH,schreinerei löhle gmbh,NAN,78567,fridingen,Zollerstr. 2,,851897
99,LAIPPLE / BRINKMANN GMBH,laipple / brinkmann gmbh,NAN,73099,adelberg,Ziegelhau 13,,850189
110,"Fraport Slovenija, d.o.o","fraport slovenija, d.o.o",NAN,4210,brnik aerodrom,Zg. Brnik 130a,SI12574856,804393
112,Amtsgericht Ulm,amtsgericht ulm,NAN,89073,ulm,Zeughausgasse 14,,
...,...,...,...,...,...,...,...,...
14360,d & b Bau GmbH,d & b bau gmbh,NAN,,neustadt wstr neu isenburg,,,852329
14361,Stadt Rheine / Feuerwehr,stadt rheine / feuerwehr,NAN,,rheine,,,850429
14363,VIESGO INFRAESTRUCTURAS ENERGETICAS S.L,viesgo infraestructuras energeticas s.l,NAN,,spain,,,850166
14364,Eqos,eqos,NAN,,steyr,,,850713


In [704]:
# One worked example per repair case.

# Case 1: the City field holds a whole address instead of a city name.
eqos_rows = ulist[ulist['Companyname_clean'].str.startswith('eqos')]
print('Case 1. City name error:\n\n', eqos_rows)
long_city_count = len([tokens for tokens in ulist['City'].str.split() if len(tokens)>=3])
print('\n{} entries are longer than 2 tokens'.format(long_city_count))

# Case 2: the City field holds a country name (spain ==> SPAIN).
city_is_country = ulist_cases['City'].str.upper().isin(geo_countries)
print('\n\nCase 2. City name is actually a country name\n', ulist_cases[city_is_country].tail(5))

# Case 3: the city is listed in the world-cities table (naila ==> GERMANY).
city_is_known = ulist_cases['City'].isin(geo_cities)
print('\n\nCase 3. Find country from city name dictionary:\n', ulist_cases[city_is_known].head())

# Case 4: take the country that occurs most often for the same city name.
# Other cities worth checking: adelberg, senningerberg, moskau, valencia, haag.
valencia_countries = ulist[ulist['City']=='valencia']['Country'].value_counts().sort_values(ascending=False)
print('\n\nCase 4. Map most common country grouped by city name\n', valencia_countries)
moskau_countries = ulist[ulist['City']=='moskau']['Country'].value_counts().sort_values(ascending=False)
print('\nExample 2 Moskau:', moskau_countries)

Case 1. City name error:

        Company_name Companyname_clean Country Zipcode  \
14351  Eqos Energie      eqos energie     NAN     NaN   
14364          Eqos              eqos     NAN     NaN   

                                                City Address USt.-IdNr.  \
14351  hinterbergerstraße 13  4400 steyr  österreich     NaN        NaN   
14364                                          steyr     NaN        NaN   

           ID  
14351  850821  
14364  850713  

523 entries are longer than 2 tokens


Case 2. City name is actually a country name
                                      Company_name  \
13692            ams-OSRAM Asia Pacific Pte. Ltd.   
13791  TTS Tooltechnic Systems North  America, LP   
13858      Boehringer Ingelheim Singapore Pte Ltd   
13989            Schaeffler (Singapore) Pte. Ltd.   
14363     VIESGO INFRAESTRUCTURAS ENERGETICAS S.L   

                                Companyname_clean Country   Zipcode  \
13692            ams-osram asia pacific pte. ltd.  

In [705]:
[k for k, v in country_city_dict.items() if 'valencia' in v]

['COLOMBIA', 'ECUADOR', 'PHILIPPINES', 'SPAIN', 'VENEZUELA']

In [706]:
a = ulist.groupby('City').Country.nunique() > 2     # True for cities listed with more than two distinct Country values (threshold is > 2, not > 1)
a[a].index.tolist()                    # e.g. schwarzach, burgdorf

# https://stackoverflow.com/questions/54518504/check-if-group-contains-same-value-in-pandas

['burgdorf',
 'deurne',
 'linz',
 'london',
 'nan',
 'neunkirchen',
 'puebla',
 'schwarzach']

In [707]:
print('Among 903 entries {} of their cities is actually a country from the dictionary'.format(len(ulist_cases[ulist_cases['City'].str.upper().isin(geo_countries)])))
print('Among 903 entries {} of their cities matches with a country from the dictionary'.format(len(ulist_cases[ulist_cases['City'].isin(geo_cities)])))

Among 903 entries 8 of their cities is actually a country from the dictionary
Among 903 entries 459 of their cities matches with a country from the dictionary


In [708]:
# Case 1
def city_finder(city, known_cities=None):
    """Pull the city name out of a City field that actually contains a whole address.

    A field with three or more white-space separated tokens is treated as an address.
    Every token that is a known city is kept and the kept tokens are concatenated
    without a separator (the City column is stripped of white space later anyway).

    Parameters
    ----------
    city : str
        Lower-cased, space-tokenizable City value. Non-string values (NaN, None)
        are returned unchanged instead of raising AttributeError.
    known_cities : container of str, optional
        Lookup of valid city names. Defaults to the notebook-level ``geo_cities``;
        pass a ``set`` for fast membership tests.

    Returns
    -------
    The concatenated matching token(s), or ``city`` unchanged when the field has
    fewer than three tokens or none of its tokens is a known city.
    """
    if not isinstance(city, str):
        return city
    tokens = city.split()
    if len(tokens) < 3:   # short fields are taken to be real city names
        return city
    if known_cities is None:
        known_cities = geo_cities
    matches = [token for token in tokens if token in known_cities]
    return ''.join(matches) if matches else city

ulist['City'] = ulist['City'].apply(city_finder)


# Case 2
# The space-separated tokens were only needed by city_finder; now drop every non-word
# character, spaces included. regex=True is explicit because pandas >= 2.0 would otherwise
# look for the literal two characters '\W' and leave the column untouched.
ulist["City"] = ulist["City"].str.replace(r'\W', '', regex=True)

# Rows with a missing country whose City value is itself a country name.
case2_index = ulist_cases[ulist_cases['City'].str.upper().isin(geo_countries)].index

def country_filler1(city, country, known_countries=None):
    """Return the City value as the country when it really is a country name.

    The previous version built a list of matching countries but never used it, so any
    non-empty city was returned upper-cased whether or not it named a country. The
    membership test is now applied.

    Parameters
    ----------
    city : str
        City value of the row; compared upper-cased.
    country : str
        Current Country value, returned unchanged when ``city`` is not a country.
    known_countries : container of str, optional
        Valid upper-case country names. Defaults to the keys of the notebook-level
        ``country_city_dict``.

    Returns
    -------
    ``city.upper()`` when it is a known country, otherwise ``country``.
    """
    if known_countries is None:
        known_countries = country_city_dict
    candidate = city.upper() if isinstance(city, str) else ''   # NaN / None never match
    if candidate and candidate in known_countries:
        return candidate
    return country
    
ulist.loc[case2_index, 'Country'] =  ulist.loc[case2_index].apply(lambda x: country_filler1(x.City, x.Country), axis=1)   # gonna mask these and continue with next cases


# Case 3
case3_index = ulist_cases[ulist_cases['City'].isin(geo_cities)].index

def country_filler2(city, country, city_index=None):
    """Look up the country of a city, but only when the city is unambiguous.

    The previous version concatenated all matching country names and checked whether the
    result happened to be a key. That rejects most ambiguous cities, but two names can
    concatenate into a real key (e.g. 'GUINEA' + 'BISSAU' -> 'GUINEABISSAU'). The match
    count is now checked directly.

    Parameters
    ----------
    city : str
        Normalised city name (lower case, no spaces or hyphens).
    country : str
        Current Country value, returned unchanged when no unique match exists.
    city_index : mapping of str to list of str, optional
        ``{COUNTRY: [city, ...]}``. Defaults to the notebook-level ``country_city_dict``.

    Returns
    -------
    The single country that lists ``city``; otherwise ``country`` (no match, or the city
    exists in several countries, e.g. 'valencia').
    """
    if city_index is None:
        city_index = country_city_dict
    matches = [name for name, cities in city_index.items() if city in cities]
    if len(matches) == 1:
        return matches[0]
    return country
    
ulist.loc[case3_index, 'Country'] = ulist.loc[case3_index].apply(lambda x: country_filler2(x.City, x.Country), axis=1)


# Case 4
# Turn the string sentinels back into real missing values.
# (np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.)
ulist['City'] = ulist['City'].replace('nan', np.nan)
ulist['Country'] = ulist['Country'].replace('NAN', np.nan)
case4_index = ulist[ulist['Country'].isna()].index   # rows whose country is still unknown

# For every city, pick the (City, Country) pair with the highest row count
# and turn the result into a {city: most common country} lookup.
s = ulist.groupby(['City','Country']).size()
df = s.loc[s.groupby(level=0).idxmax()].reset_index().drop(0,axis=1)   # column 0 holds the counts
city_country_filler = dict(df.values)

ulist.loc[case4_index, 'Country'] = ulist.loc[case4_index, 'City'].map(city_country_filler)
    
# https://stackoverflow.com/questions/66724197/get-key-from-a-value-where-value-is-in-a-list

In [709]:
print('Remanining lenght of missing countries when city is available=', len(ulist[(ulist['Country'].isna()) & (ulist['City'].notnull())]))   # from 903

Remanining lenght of missing countries when city is available= 260


In [710]:
ulist[ulist['Companyname_clean'].str.startswith('eqos')]   # case 1

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
14351,Eqos Energie,eqos energie,AUSTRIA,,steyr,,,850821
14364,Eqos,eqos,AUSTRIA,,steyr,,,850713


In [711]:
ulist.loc[case2_index].tail(5)    # case 2

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
13692,ams-OSRAM Asia Pacific Pte. Ltd.,ams-osram asia pacific pte. ltd.,SINGAPORE,569877,singapore,7000 Ang Mo Kio Ave 5,,290745
13791,"TTS Tooltechnic Systems North America, LP","tts tooltechnic systems north america, lp",UNITEDSTATES,IN 46052,lebanon,400 N. Enterprise Blvd.,,804190
13858,Boehringer Ingelheim Singapore Pte Ltd,boehringer ingelheim singapore pte ltd,SINGAPORE,199555,singapore,300 Beach Road,,802212
13989,Schaeffler (Singapore) Pte. Ltd.,schaeffler (singapore) pte. ltd.,SINGAPORE,556741,singapore,151 Lorong Chuan #06-01,,800092
14363,VIESGO INFRAESTRUCTURAS ENERGETICAS S.L,viesgo infraestructuras energeticas s.l,SPAIN,,spain,,,850166


In [712]:
ulist.loc[case3_index].head()    # case 3

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
41,SERAG-WIESSNER GmbH & Co. KG,serag-wiessner gmbh & co. kg,GERMANY,95119,naila,Zum Kugelfang 8-12,,
112,Amtsgericht Ulm,amtsgericht ulm,GERMANY,89073,ulm,Zeughausgasse 14,,
148,ALUTEC SYSTEM-TECHNIK GMBH,alutec system-technik gmbh,GERMANY,74385,pleidelsheim,Zeppelinstr. 11,,851837.0
230,Amtsgericht Bad Waldsee,amtsgericht bad waldsee,GERMANY,88339,waldsee,Wurzacher Str. 73,,
253,Aircraft Cabin Modification GmbH,aircraft cabin modification gmbh,GERMANY,87700,memmingen,Woringer Straße 11,,851303.0


In [713]:
ulist[ulist['City']=='moskau']    # case 4

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
1198,Aeroflot,aeroflot,RUSSIANFEDERATION,119002.0,moskau,ul. Arbat 10,,803708
4392,Hyperglobus Globus Hypermarkets,hyperglobus globus hypermarkets,RUSSIANFEDERATION,125493.0,moskau,Personalabteilung,7743543761.0,160024
7143,Elemash,elemash,RUSSIANFEDERATION,,moskau,Karl-Marx-Street,,851849
13946,"JSC ""Vnukova Airport""","jsc ""vnukova airport""",RUSSIANFEDERATION,119027.0,moskau,"1st, Reysovaya Str.",,800439
14359,DME - Flughafen Domodedevo,dme - flughafen domodedevo,RUSSIANFEDERATION,,moskau,,,803710


In [714]:
##Notes
# Can be more standardized
#list(set(ulist['Country'].unique()) -set(geo_countries))
#list(set(geo_countries)- set(ulist['Country'].unique()))

# Buggy
#ulist.loc[a[a['City'].str.upper().isin(geo_countries)].index]
#ulist[ulist['City']=='']


# Entries to take care of:
#ulist[ulist['Companyname_clean']=='brose north america']
#ulist[ulist['Companyname_clean'].str.startswith('fraport')]
#ulist[ulist['Companyname_clean'].str.startswith('schlage')]


# ulist[ulist['Company_name'].str.endswith(tuple(suffix_list))] some finishes with gmbh some -gmbh and some even continues

## 1.3 Dataset Standardization
**Prefix, Middle, Suffix removal. Column Standardization**

**Aim is to remove suffix and rest** ==> E.g: Sensient Imaging Technologies GmbH Chemiepark Bitterfeld-Wolfen

In [715]:
ulist = ulist.assign(Idtrack=lambda ulist: range(1, len(ulist)+1))
ulist_original = ulist.copy()

In [716]:
# Suffix list for company names   (ordering is important! Will be fixed)
# Legal-form suffixes, lower case. The order is significant: suffix_remover tries the
# entries from first to last and stops at the first one found in a name.
# Repeated entries are harmless and kept so the order stays as it was.
# 'veř. obch. spol.' replaces the mojibake u've\xc5\x99. obch. spol.' (UTF-8 bytes of "ř"
# written as two Latin-1 escapes).
suffix_list = ['gmbh', 'gmbh&co.', 'ggmbh', 'gmbh+co.', 'kg-gmbh', 'gmbh&co', 'ltd.', 'se',
               'gmbh&co.kg', 'gmbh&cokg', 'gmbh.', 'gmbh,', 'gmbh&c', '(gmbh', 'mbh',
               'company', 'incorporated', 'corporation', 'corp.', 'corp', 'inc',
               '& co.', '& co', 'inc.', 's.p.a.', 'n.v.', 'a.g.', 'ag', 'nuf', 's.a.', 's.f.',
               'oao', 'co.', 'co',
               'soc.col.', 'stg', 'd.n.o.', 'ltda.', 'v.o.s.', 'a spol.',
               'veř. obch. spol.', 'kgaa', 'o.e.', 's.f.', 's.n.c.', 's.a.p.a.', 'j.t.d.',
               'v.o.f.', 'sp.j.', 'og', 'sd', ' i/s', 'ay', 'snc', 'oe', 'bt.', 's.s.', 'mb',
               'ans', 'da', 'o.d.', 'hb', 'pt',
               'unltd', 'ultd', 'sal', 'unlimited', 'saog', 'saoc', 'aj',
               'yoaj', 'oaj', 'akc. spol.', 'a.s.',
               'esv', 'gie', 'kv.', 'qk',
               'pty. ltd.', 'pty ltd', 'ltd', 'l.t.d.', 'bvba', 'd.o.o.', 'ltda', 'gmbh',
               'g.m.b.h', 'kft.', 'kht.', 'zrt.', 'ehf.', 's.a.r.l.', 'd.o.o.e.l.', 's. de r.l.',
               'b.v.', 'tapui',
               'sp. z.o.o.', 'sp. z o.o.', 'spółka z o.o.',
               's.r.l.', 's.l.', 's.l.n.e.', 'ood', 'oy', 'rt.',
               'teo', 'uab', 'scs', 'sprl', 'limited', 'bhd.', 'sdn. bhd.', 'sdn bhd', 'as',
               'lda.', 'tov', 'pp',
               'pllc', 'llc', 'l.l.c.', 'plc.', 'plc', 'hf.', 'oyj',
               'a.e.', 'nyrt.', 'p.l.c.', 'sh.a.', 's.a.', 's.r.l.', 'srl.', 'srl', 'aat', '3at', 'd.d.',
               's.r.o.', 'spol. s r.o.', 's.m.b.a.', 'smba', 'sarl', 'nv', 'sa', 'aps',
               'a/s', 'p/s', 'sae', 'sasu', 'eurl', 'ae', 'cpt', 'as', 'ab', 'asa', 'ooo', 'dat',
               'vat', 'zat', 'mchj', 'a.d.',
               'lllp', 'l.l.l.p.',
               'llp', 'l.l.p.', 'sp.p.', 's.c.a.', 's.c.s.',
               'gmbh & co. kg', 'lp', 'l.p.', 's.c.s.',
               's.c.p.a', 'comm.v', 'k.d.', 'k.d.a.', 's. en c.', 'e.e.', 's.a.s.', 's. en c.',
               'c.v.', 's.k.a.', 'sp.k.', 's.cra.', 'ky', 'scs', 'kg', 'kd', 'k/s', 'ee', 'secs',
               'kda', 'ks', 'kb', 'kt',
               'sicav',
               'nl',
               'vzw', 'ses.', 'gte.',
               'private', 'pte', 'xk',
               'p.c.', 'vof', 'snc',
               'pllc', 'p.l.l.c.',
               'e.u.', 's.p.', 't:mi', 'tmi', 'e.v.', 'e.c.', 'et', 'obrt',
               'fie', 'ij', 'fop', 'xt']

# Source of the company suffixes: https://github.com/psolin/cleanco/blob/master/cleanco/termdata.py

In [717]:
def suffix_remover(companyname, suffixes=None):
    """Cut a legal-form suffix (and everything after it) from a lower-cased company name.

    The suffixes are tried in list order and the first one found decides the result:

    * it is the first token  -> treated as a prefix, the rest of the name is returned
      ('as watson' -> 'watson');
    * it is any other token  -> the tokens before it are returned
      ('volkswagen financial services ag' -> 'volkswagen financial services');
    * it is a hyphen-separated part -> the parts before it are returned ('foo-gmbh' -> 'foo').

    Only one suffix is removed per call; applying the function twice strips nested forms.

    Parameters
    ----------
    companyname : str
        Lower-cased company name. Non-string values and names without any token
        (empty or white space only) are returned unchanged; the previous version
        raised IndexError on ``''.split()[0]``.
    suffixes : iterable of str, optional
        Suffixes to try, in priority order. Defaults to the notebook-level ``suffix_list``.

    Returns
    -------
    str
        The shortened name, or ``companyname`` unchanged when no suffix matches.
    """
    if not isinstance(companyname, str):
        return companyname
    tokens = companyname.split()   # tokenize once instead of re-splitting per suffix
    if not tokens:
        return companyname
    if suffixes is None:
        suffixes = suffix_list
    hyphen_parts = companyname.split('-')
    for suffix in suffixes:
        if suffix == tokens[0]:
            return ' '.join(tokens[1:])
        if suffix in tokens:
            return ' '.join(tokens[:tokens.index(suffix)])
        if suffix in hyphen_parts:
            return ' '.join(hyphen_parts[:hyphen_parts.index(suffix)])
    return companyname

ulist['Companyname_clean'] = ulist['Companyname_clean'].apply(suffix_remover)
ulist['Companyname_clean'].head(10)

0                      bistum essen
1        kbs kleider bauer betriebs
2                  elektro ing-plan
3               netzdesign-vobornik
4    niles-simmons industrieanlagen
5                    gemac chemnitz
6                            emdion
7         fiege logistik stiftung &
8              william prym holding
9                      leoni kerpen
Name: Companyname_clean, dtype: object

In [718]:
# Demonstration of the substring bug that the token-based slicing fixes.
companyname='volkswagen financial services ag'
i='ag'
# Buggy variant: str.index searches characters, so it finds the 'ag' inside
# 'volkswagen' and the slice prints 'volks'.
if i in companyname.split():
    print(companyname[:companyname.index(i)-1])
    
# Fixed variant: look the suffix up in the token list and keep the tokens before it,
# which prints 'volkswagen financial services'.
if i in companyname.split():
    print(' '.join(companyname.split()[:companyname.split().index(i)]))

volks
volkswagen financial services


In [719]:
# Prefix updated
companyname='as watson'
pref=['as', 'gmbh', 'ds']
  
for i in pref:
    if i == companyname.split()[0]:
        print(' '.join(companyname.split()[companyname.split().index(i)+1:]))
    elif i in companyname.split():
        print(' '.join(companyname.split()[:companyname.split().index(i)]))

watson


In [720]:
ulist[ulist['Companyname_clean'] == '']

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID,Idtrack


In [721]:
ulist[ulist['Company_name'] == 'AS Watson']

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID,Idtrack
4195,AS Watson,watson,NETHERLANDS,6666 LT,heteren,Poort van Midden Gelderland Rood 24,,331187,4196
4929,AS Watson,watson,NETHERLANDS,3972,darenswoude,Nijborg 17,,803522,4930


## Remaining suffix problem

In [722]:
print('Remaining suffix as gmbh (most common):', len(ulist[ulist['Companyname_clean'].str.contains('gmbh')]))

Remaining suffix as gmbh (most common): 41


In [723]:
ulist.iloc[564]

Company_name         BOSCH SICHERHEITSSYSTEME GMB H
Companyname_clean    bosch sicherheitssysteme gmb h
Country                                     GERMANY
Zipcode                                       85630
City                                      grasbrunn
Address                  WERNER-VON-SIEMENS-RING 10
USt.-IdNr.                              DE813474672
ID                                           902003
Idtrack                                         565
Name: 564, dtype: object

In [724]:
# Remove every non-word character (white space, punctuation, '&', '-') from the cleaned name.
# Raw pattern plus explicit regex=True: pandas >= 2.0 would otherwise search for the literal
# text '\W' and change nothing.
ulist["Companyname_clean"] = ulist["Companyname_clean"].str.replace(r'\W', '', regex=True)

In [725]:
# Remove selected prefixes, infixes and suffixes from all company names (catches obvious typos such as "operational servicesGmbH & Co. KG")
remove_list = ['gmbh']   # stopwords approach, what about stiftung, holding etc.

def pre_middle_suffix_remove(companyname, stopwords=None):
    """Delete every stopword from a company name, wherever it occurs.

    Meant to run after special characters and white space have been removed, so that
    glued-on legal forms ('...servicesgmbhkunststoff') are caught as substrings.

    The previous version returned right after the first stopword that matched, so any
    further entries in the list were never applied. All entries are removed now; with
    the current single-entry list the result is unchanged.

    Parameters
    ----------
    companyname : str
        Cleaned company name. Non-string values are returned unchanged.
    stopwords : iterable of str, optional
        Substrings to delete. Defaults to the notebook-level ``remove_list``.

    Returns
    -------
    str
        ``companyname`` with all occurrences of all stopwords removed.
    """
    if not isinstance(companyname, str):
        return companyname
    if stopwords is None:
        stopwords = remove_list
    for word in stopwords:
        companyname = companyname.replace(word, '')
    return companyname

ulist['Companyname_clean'] = ulist['Companyname_clean'].apply(pre_middle_suffix_remove)

In [726]:
# List of gmbh contained entries: [x for x in ulist['Companyname_clean'] if any(tag in x for tag in remove_list)]
print('Original Companyname:', ulist.loc[554, 'Company_name'])   # An Example with GmbH
print('Cleaned Companyname:', ulist.loc[554, 'Companyname_clean'])
print('Number of gmbh left:', len(ulist[ulist['Companyname_clean'].str.contains('gmbh')]))

Original Companyname: Technimark-Eisbär GmbHKunststoff- und Metallverarb.
Cleaned Companyname: technimarkeisbärkunststoffundmetallverarb
Number of gmbh left: 0


In [727]:
# a = list(set(ulist[ulist.loc[:,['Companyname_clean', 'Country', 'City']].duplicated()].index) - set(ulist[ulist.loc[:,['Companyname_clean', 'Country', 'Zipcode', 'City']].duplicated()].index))
# a.sort()
# ulist.loc[a]

In [728]:
#ulist[ulist['Companyname_clean'].str.startswith('freieund')]
#ulist[ulist['Companyname_clean'].str.startswith('freieundhansestadthamburg')]

## 2. Duplicates and Grouping them together

In [729]:
# moved NA of 'USt.-IdNr.' and 'ID' columns to bottom of df so that rows with more NA's will be removed because they are at the bottom.
sorted_duplicates = ulist.sort_values(by=["USt.-IdNr.", "ID"], na_position='last')      # move NAs to the bottom of the df

In [730]:
ulist.groupby(['Companyname_clean', 'Country', 'City']).size().sort_values(ascending=False)[:516]  # 515 companies have multiple entries

Companyname_clean                              Country  City      
freieundhansestadthamburg                      GERMANY  hamburg       6
hiltideutschland                               GERMANY  kaufering     5
fraport                                        GERMANY  frankfurt     5
landeshauptstadtstuttgart                      GERMANY  stuttgart     5
interflexdatensysteme                          GERMANY  stuttgart     4
                                                                     ..
zeagimmobilien                                 GERMANY  heilbronn     2
gebäudewirtschafttrier                         GERMANY  trier         2
jenaernahverkehr                               GERMANY  jena          2
bombardiertransportationsharedservicesromania  ROMANIA  clujnapoca    2
admedes                                        GERMANY  pforzheim     2
Length: 516, dtype: int64

In [731]:
ulist[ulist['Companyname_clean'].str.startswith('aurubis')]

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID,Idtrack
717,AURUBIS BELGIUM,aurubisbelgium,BELGIUM,2250,olen,WATERTORENSTRAAT 35,,320482.0,718
8016,Aurubis AG,aurubis,GERMANY,20539,hamburg,Hovestrasse 50,,,8017
8017,Aurubis AG,aurubis,GERMANY,20539,hamburg,Hovestr. 50,DE118514155,803007.0,8018
8018,Aurubis AG,aurubis,GERMANY,20539,hamburg,Hovestr. 50,DE118514155,160616.0,8019


In [732]:
# Harmonise spelling variants of an address: sort by Address (missing values last), then give
# every row the first Address of its Companyname_clean group, so 'Hovestrasse 50' and
# 'Hovestr. 50' end up identical.
# NOTE(review): the grouping key is the cleaned name only, so rows that share a name but sit in
# different cities or countries receive the same address as well - confirm this is intended.
# NOTE(review): the Address column is overwritten in place; the raw values remain in ulist_original.
ulist['Address'] = ulist.sort_values(by=['Address'],na_position='last').groupby('Companyname_clean')['Address'].transform('first')

In [734]:
ulist[ulist['Companyname_clean'].str.startswith('aurubis')]

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID,Idtrack
717,AURUBIS BELGIUM,aurubisbelgium,BELGIUM,2250,olen,WATERTORENSTRAAT 35,,320482.0,718
8016,Aurubis AG,aurubis,GERMANY,20539,hamburg,Hovestr. 50,,,8017
8017,Aurubis AG,aurubis,GERMANY,20539,hamburg,Hovestr. 50,DE118514155,803007.0,8018
8018,Aurubis AG,aurubis,GERMANY,20539,hamburg,Hovestr. 50,DE118514155,160616.0,8019


In [49]:
# manytoone_dict=ulist.groupby(['Companyname_clean', 'Country', 'City'], dropna=False).apply(lambda x: x['Company_name'].tolist()).to_dict()
#dropna=False doesn't work in my pandas version. So, another approach==>

In [55]:
ulist.isna().sum()

Company_name            0
Companyname_clean       0
Country               430
Zipcode               312
City                  648
Address               439
USt.-IdNr.           9701
ID                   3677
Idtrack                 0
dtype: int64

In [127]:
# sort for address na last ==> try print address in many to one dictionary

In [128]:
ulist[ulist['Companyname_clean'].str.startswith('fraport')]

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID,Idtrack
110,"Fraport Slovenija, d.o.o",fraportslovenijadoo,,4210,brnikaerodrom,Zg. Brnik 130a,SI12574856,804393,111
250,Fraport Ground Services Austria GmbH,fraportgroundservicesaustria,AUSTRIA,1300,wien,World Trade Center,ATU47049206,160810,251
10045,FRAPORT AG,fraport,GERMANY,60547,frankfurt,Flughafen,DE114150623,160783,10046
10047,FRAPORT AG,fraport,GERMANY,60547,frankfurt,Flughafen,DE114150623,803054,10048
13513,Fraport AG ASM-LZ,fraport,GERMANY,60547,frankfurt,Airport Security Management,,160160,13514
14101,Fraport AG IFM-FG31,fraport,GERMANY,60547,frankfurt,,,161023,14102
14102,Fraport AG IFM-FI21,fraport,GERMANY,60547,frankfurt,,,160018,14103


In [758]:
cleanname_values = ulist.assign(City=lambda ulista: ulista.City.fillna("-")).assign(Country=lambda ulista: ulista.Country.fillna("-")).groupby(
    ['Companyname_clean', 'Country', 'City'])[['Companyname_clean',  'Country', 'City']].apply(lambda g: list(map(tuple, g.values.tolist()))).to_dict()

originalname_values = ulist.assign(City=lambda ulista: ulista.City.fillna("-")).assign(Country=lambda ulista: ulista.Country.fillna("-")).groupby(
    ['Companyname_clean', 'Country', 'City'])[['Company_name',  'Country', 'City']].apply(lambda g: list(map(tuple, g.values.tolist()))).to_dict()

#keys are simplified company names and values are originals+simplified

In [800]:
def _group_rows_as_tuples(frame, name_column):
    """Map each (Companyname_clean, Country, City, Address) group to its rows as tuples.

    Missing City / Country / Address values are replaced by "-" first, so that rows with
    gaps still form groups (groupby drops NaN keys by default).

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs the columns Companyname_clean, Country, City, Address and ``name_column``.
    name_column : str
        Column to report as the first tuple element ('Company_name' for the original
        spelling, 'Companyname_clean' for the simplified one).

    Returns
    -------
    dict
        ``{(clean name, country, city, address): [(name, country, city), ...]}``
    """
    filled = frame.assign(
        City=frame.City.fillna("-"),
        Country=frame.Country.fillna("-"),
        Address=frame.Address.fillna("-"),
    )
    grouped = filled.groupby(['Companyname_clean', 'Country', 'City', 'Address'])
    return grouped[[name_column, 'Country', 'City']].apply(lambda g: list(map(tuple, g.values.tolist()))).to_dict()


cleanname_values = _group_rows_as_tuples(ulist, 'Companyname_clean')
originalname_values = _group_rows_as_tuples(ulist, 'Company_name')

#keys are simplified company names and values are originals+simplified

In [801]:
# Peek at a single entry (position 4859 in insertion order) of the original-name dictionary.
dict(list(originalname_values.items())[4859:4860])

{('fraport',
  'GERMANY',
  'frankfurt',
  'Airport Security Management'): [('FRAPORT AG  ',
   'GERMANY',
   'frankfurt'), ('FRAPORT AG', 'GERMANY', 'frankfurt'), ('Fraport AG ASM-LZ',
   'GERMANY',
   'frankfurt'), ('Fraport AG IFM-FG31',
   'GERMANY',
   'frankfurt'), ('Fraport AG IFM-FI21', 'GERMANY', 'frankfurt')]}

In [802]:
# Merge, per group key, the original-name tuples with the cleaned-name tuples
# (originals first, cleaned names after them).
total = {}
for key, original_entries in originalname_values.items():
    # Guard the lookup that can actually fail: the key must also exist on the cleaned side.
    # (Checking membership in the dictionary being iterated would always be true.)
    if key in cleanname_values:
        total[key] = original_entries + cleanname_values[key]

In [803]:
# Number of group keys in the merged dictionary.
len(total)

13938

In [804]:
# Inspect the merged entries stored under one group key.
total[('fraport', 'GERMANY', 'frankfurt', 'Airport Security Management')]

[('FRAPORT AG  ', 'GERMANY', 'frankfurt'),
 ('FRAPORT AG', 'GERMANY', 'frankfurt'),
 ('Fraport AG ASM-LZ', 'GERMANY', 'frankfurt'),
 ('Fraport AG IFM-FG31', 'GERMANY', 'frankfurt'),
 ('Fraport AG IFM-FI21', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt')]

In [805]:
# Reverse lookup: every group key whose entries contain this exact (name, country, city) tuple.
[group_key for group_key, entries in total.items() if ('FRAPORT AG', 'GERMANY', 'frankfurt') in entries]

[('fraport', 'GERMANY', 'frankfurt', 'Airport Security Management')]

## Many-to-one company lookup function

In [806]:
# Lower-case the company name of every entry; country and city are kept exactly as stored.
for group_key, entries in total.items():
    if type(entries) is list:
        total[group_key] = [(name.lower(), country, city) for name, country, city in entries]

In [807]:
# Re-inspect the same key: the company names should now be lower-case.
(total[('fraport', 'GERMANY', 'frankfurt', 'Airport Security Management')])

[('fraport ag  ', 'GERMANY', 'frankfurt'),
 ('fraport ag', 'GERMANY', 'frankfurt'),
 ('fraport ag asm-lz', 'GERMANY', 'frankfurt'),
 ('fraport ag ifm-fg31', 'GERMANY', 'frankfurt'),
 ('fraport ag ifm-fi21', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt'),
 ('fraport', 'GERMANY', 'frankfurt')]

In [808]:
# Store each group's entries as a set so that repeated tuples collapse into one.
total = {
    group_key: set(entries)
    for group_key, entries in total.items()
}

In [809]:
# Re-inspect the same key: each distinct tuple should now appear only once.
(total[('fraport', 'GERMANY', 'frankfurt', 'Airport Security Management')])

{('fraport', 'GERMANY', 'frankfurt'),
 ('fraport ag', 'GERMANY', 'frankfurt'),
 ('fraport ag  ', 'GERMANY', 'frankfurt'),
 ('fraport ag asm-lz', 'GERMANY', 'frankfurt'),
 ('fraport ag ifm-fg31', 'GERMANY', 'frankfurt'),
 ('fraport ag ifm-fi21', 'GERMANY', 'frankfurt')}

In [810]:
companyname = 'volkswagen financial services aktiengesellschaft'

# Long legal-form spellings and the abbreviation each one should be replaced with.
suffix_specific = {'aktiengesellschaft': 'ag', 'gesellschaft mit beschränkter haftung': 'gmbh'}
# add https://en.wikipedia.org/wiki/List_of_legal_entity_types_by_country

# Company-specific aliases.
company_specific = {'volkswagen': 'vw'}
mapper = {**suffix_specific, **company_specific}


def replace_whole_phrase(name, phrase, replacement):
    """Replace `phrase` in `name` where it stands as a whole whitespace-delimited phrase.

    Unlike a `phrase in name.split()` check followed by `str.replace`, this also
    matches phrases made of several words and never rewrites a substring inside
    another word (e.g. 'volkswagen' inside 'volkswagenbank').

    Returns the rewritten name, or None when the phrase does not occur.
    """
    words = phrase.split()
    if not words:
        return None
    # The phrase must be bounded by whitespace or the string ends; words may be separated by any whitespace run.
    pattern = r'(?<!\S)' + r'\s+'.join(re.escape(word) for word in words) + r'(?!\S)'
    if re.search(pattern, name) is None:
        return None
    # A function replacement keeps backslashes in `replacement` literal.
    return re.sub(pattern, lambda _match: replacement, name)


# Show the result of applying each matching rule, one rule at a time, to the example name.
for i in mapper.keys():
    mapped = replace_whole_phrase(companyname, i, mapper[i])
    if mapped is not None:
        print(mapped)

volkswagen financial services ag
vw financial services aktiengesellschaft


In [811]:
# improving
def find_comp(x, y, z, lookup=None, rules=None):
    """Find the group key(s) of a company from its name, country and city.

    The name is lower-cased, the country upper-cased and the city lower-cased
    before searching. The strategies below are tried in order and the first
    one that finds something decides the result:

    1. exact (name, country, city) match;
    2. same name and country with a blank ("-") city;
    3. same name and city with a blank ("-") country;
    4. the name with all spaces removed;
    5. the name without its last word, which yields a suggestion string;
    6. for each rule whose long form occurs in the name as a whole
       whitespace-delimited phrase (one or several words), strategies 1-3 and 5
       are repeated with that long form replaced by its short form.

    Parameters
    ----------
    x : str
        Company name.
    y : str
        Country.
    z : str
        City; pass "-" when unknown.
    lookup : dict, optional
        Maps a group key to a collection of (name, country, city) tuples.
        Defaults to the notebook-level ``total`` dictionary.
    rules : dict, optional
        Maps a long form (e.g. 'aktiengesellschaft') to its short form
        (e.g. 'ag'). Defaults to the notebook-level ``mapper`` dictionary.

    Returns
    -------
    list or str
        The list of matching group keys, or the suggestion
        "Are you looking for the company named *...*?", or 'Not available'.
    """
    lookup = total if lookup is None else lookup
    rules = mapper if rules is None else rules

    name, country, city = x.lower(), y.upper(), z.lower()

    def keys_containing(candidate):
        # All group keys whose entries include the candidate tuple.
        return [key for key, entries in lookup.items() if candidate in entries]

    def search(candidate_name, squeeze_spaces):
        # Try the strategies in priority order, scanning the dictionary only as often as needed.
        candidates = [
            (candidate_name, country, city),   # exact match
            (candidate_name, country, '-'),    # in case city is blank
            (candidate_name, '-', city),       # in case country is blank
        ]
        if squeeze_spaces:
            # remove whitespace (captures most entries without their suffix)
            candidates.append((candidate_name.replace(' ', ''), country, city))
        for candidate in candidates:
            found = keys_containing(candidate)
            if found:
                return found
        # Recommendation: input "Fraport AG 2" ==> suggests "fraport ag"
        shorter = ' '.join(candidate_name.split()[:-1])
        if keys_containing((shorter, country, city)):
            return "Are you looking for the company named *{}*?".format(shorter)
        return None

    outcome = search(name, squeeze_spaces=True)
    if outcome is not None:
        return outcome

    # No result so far: apply the manual rules (e.g. Aktiengesellschaft = AG), one rule at a time.
    for long_form, short_form in rules.items():
        words = long_form.split()
        if not words:
            continue
        # Match the long form only as a whole phrase bounded by whitespace or the string ends, so
        # multi-word rules can match and substrings inside other words are left untouched.
        pattern = r'(?<!\S)' + r'\s+'.join(re.escape(word) for word in words) + r'(?!\S)'
        if re.search(pattern, name) is None:
            continue
        # A function replacement keeps backslashes in the short form literal.
        mapped_name = re.sub(pattern, lambda _match: short_form, name)
        outcome = search(mapped_name, squeeze_spaces=False)
        if outcome is not None:
            return outcome

    # Similarity measure later
    return 'Not available'

In [812]:
# This entry is stored with a blank ("-") city: the first call matches it directly,
# the second one reaches it through the blank-city fallback.
print(find_comp('1Mustermann AG1', 'Germany', '-'))
print(find_comp('1Mustermann AG1', 'Germany', 'Frankfurt'))   # Not in frankfurt but without a city entry

# Stored with city 'frankfurt': found with the city given, not found when '-' is passed as the city.
print(find_comp('FRAPORT AG', 'Germany', 'Frankfurt'))
print(find_comp('FRAPORT AG', 'Germany', '-'))

# ulist[(ulist['Country'].isna())&(ulist['City'].notna())]
# This entry is stored with a blank ("-") country, so it is found through the blank-country fallback.
find_comp('Daikin Europe n.v', 'Germany', 'oostende')

[('1mustermannag1', 'GERMANY', '-', 'Musterstr. 7')]
[('1mustermannag1', 'GERMANY', '-', 'Musterstr. 7')]
[('fraport', 'GERMANY', 'frankfurt', 'Airport Security Management')]
Not available


[('daikineuropenv', '-', 'oostende', 'Zandvoordestraat 300')]

In [813]:
# Inspect the name variants stored under the Volkswagen Financial Services group key.
total[('volkswagenfinancialservices', 'GERMANY', 'braunschweig', 'IH-IMC')]

{('volkswagen financial services ag', 'GERMANY', 'braunschweig'),
 ('volkswagenfinancialservices', 'GERMANY', 'braunschweig')}

In [814]:
# Long-form suffix: no direct hit, the Aktiengesellschaft -> AG rule produces the match.
print(find_comp('Volkswagen Financial Services aktiengesellschaft', 'Germany', 'braunschweig'))   # AG = Aktiengesellschaft
# Extra trailing word (with and without the long-form suffix): both calls end in a suggestion.
print(find_comp('Volkswagen Financial Services ag 2', 'Germany', 'braunschweig'))
print(find_comp('Volkswagen Financial Services aktiengesellschaft 2', 'Germany', 'braunschweig'))
print(find_comp('Volkswagen Financial Services', 'Germany', 'braunschweig'))   # 3 options: remove whitespace, add a suffix, or match as part of 'Volkswagen Financial Services ag'
# Already identical to the stored simplified name.
print(find_comp('VolkswagenFinancialServices', 'Germany', 'braunschweig'))
print(find_comp('VolkswagenFinancialService', 'Germany', 'braunschweig'))   # not found; needs a similarity measure (later)

[('volkswagenfinancialservices', 'GERMANY', 'braunschweig', 'IH-IMC')]
Are you looking for the company named *volkswagen financial services ag*?
Are you looking for the company named *volkswagen financial services ag*?
[('volkswagenfinancialservices', 'GERMANY', 'braunschweig', 'IH-IMC')]
[('volkswagenfinancialservices', 'GERMANY', 'braunschweig', 'IH-IMC')]
Not available


In [815]:
# Dropping the last word leaves 'fraport ag', which exists, so a suggestion is returned.
print(find_comp('Fraport ag example', 'Germany', 'frankfurt'))   # Fraport AG available

Are you looking for the company named *fraport ag*?


**Next steps**
- The dictionary will be tested further, especially for blank entries. In progress: testing on specific entries such as "Volkswagen Financial Services aktiengesellschaft".
- Some company names contain whitespace and special characters. A way to make such input easier to match, whatever its form, still has to be found. Status: better.
- The dictionary had duplicated values for some keys; they were removed for efficiency. Done.
- A well-written company name and a unique address could be returned as output.
- The function will be improved: printed output and more rules for entries that are not matched (Volkswagen AG = Volkswagen Aktiengesellschaft). It also needs an update for input that includes a suffix. In progress.

The idea is to group duplicated entries under a unique company name and then apply the same methods to companies from other data sets. First, suffixes are removed; then duplicated companies are searched for using the company name, country and city columns. The goal is to find duplicates so that they can be grouped under one unique company name. After that, the remaining unique company names are appended to a final dataframe or dictionary. Later, the duplicated entries can be grouped for each unique company to create a many-to-one mapping function.

In the end, entering a company name will return a unique company name (plus information such as country, city, revenue?) based on grouping the same companies with varying names and on a rule-based approach.

For example:
Input (Volkswagen) ==> Volkswagen AG   based on suffix (adding suffix and searching)

Input (Volkswagen Aktiengesellschaft) ==> Volkswagen AG    (based on grouping same companies)

Input (VW) ==> Volkswagen AG   based on a rule

In [27]:
ulist[ulist["Companyname_clean"].str.startswith('rweiss')]   # Both entries actually belong to R.WEISS Packaging GmbH & Co. KG

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
24,R. Weiss Maschinenbau GmbH,rweissmaschinenbau,DEUTSCHLAND,74564,crailsheim,ZUR FLÜGELAU 28-30,DE190976983,181817,26
25,R. Weiss Automation GmbH & Co. KG,rweissautomation,DEUTSCHLAND,74564,crailsheim,ZUR FLÜGELAU 28,,803402,28


In [28]:
# sintmaartenskliniek   # city different but everything else same

In [29]:
# ulist.iloc[7659]   # general example

In [30]:
ulist[ulist['Company_name'].str.contains('Volkswagen')]    # 'ag' check: the 'ag' inside volksw'ag'en stays, only the trailing suffix 'AG' is stripped from the clean name

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
212,Volkswagen Group United kingdom Limited,volkswagengroupunitedkingdom,GROSSBRITANNIEN,MK14 5AN,miltonkeynes,Yeomans Drive,,805690.0,221
741,Volkswagen Automobile Stuttgart GmbH,volkswagenautomobilestuttgart,DEUTSCHLAND,70188,stuttgart,Wangener Str. 66,,,768
969,Volkswagen Original Teile Logistik GmbH & Co. KG,volkswagenoriginalteilelogistik,DEUTSCHLAND,34225,baunatal,Vertriebszentrum West,DE230960046,803202.0,1004
1060,Volkswagen Original Teile LogiSüdwest / Franke...,volkswagenoriginalteilelogisüdwestfranken,DEUTSCHLAND,97084,,Unterer Kirchbergweg 65,,170970.0,1097
1087,Volkswagen Infotainment GmbH,volkswageninfotainment,DEUTSCHLAND,44799,bochum,Universitätsstraße 140,DE295504619,803983.0,1126
2476,Volkswagen Business Services GmbH I-SEC; Frau ...,volkswagenbusinessservices,DEUTSCHLAND,38112,braunschweig,Schmalbachstraße 1,DE171252317,160936.0,2567
2478,Volkswagen Financial Services Digital Solution...,volkswagenfinancialservicesdigitalsolutions,DEUTSCHLAND,38112,braunschweig,Schmalbachstraße 1,DE260043656,800056.0,2569
3736,Volkswagen Immobilien GmbH,volkswagenimmobilien,DEUTSCHLAND,38440,wolfsburg,Poststr. 28,,,3883
6833,Volkswagen Osnabrück GmbH,volkswagenosnabrück,DEUTSCHLAND,49084,osnabrück,KARMANNSTRAßE 1,,803445.0,7095
7652,Volkswagen Financial Services AG,volkswagenfinancialservices,DEUTSCHLAND,38112,braunschweig,IH-IMC,DE811115544,803041.0,7942


**Open question: should Aktiengesellschaft == AG be handled by a rule, or by grouping the same companies?**

In [31]:
# Preview the first rows of the company list.
ulist.head()

Unnamed: 0,Firmenname,Firmenname_clean,Land,Postleitzahl,Ort,Straße und Hausnr.,USt.-IdNr.,ID,Idtrack
0,Bistum Essen,bistumessen,DEUTSCHLAND,45127,essen,Zwölfling 16,,801695.0,1
1,KBS Kleider Bauer Betriebs-GmbH,kbskleiderbauerbetriebs,ÖSTERREICH,2380,perchtoldsdorf,Zwingenstr. 5,,850801.0,2
2,Elektro Ing-Plan GmbH Dresden,elektroingplan,DEUTSCHLAND,1187,dresden,Zwickauer Straße 88,,851014.0,3
3,Netzdesign-Vobornik,netzdesignvobornik,DEUTSCHLAND,71083,herrenberg,Zwickauer Strasse 41,,,4
4,NILES-SIMMONS Industrieanlagen GmbH,nilessimmonsindustrieanlagen,DEUTSCHLAND,9117,chemnitz,Zwickauer Straße 355,DE140853999,802541.0,5


In [32]:
# input 2-5
# output 1st column add

# Next Steps
1. Working on prefixes:
      Some names contain ' ' strings (if the suffix is at the first index of a string, take the rest of that string: [companyname.split().index(i):]). Done!
2. Converting the column names and the Land (Country) column to English. Done!
3. Filling missing city-country pairs so that the same companies can be grouped more effectively. Done (can be improved).
4. Creating a dictionary to map many to one (groupby or duplicates). Done, but might require some modifications.
5. Function takes input and returns a unique company name and its info. Working on it...
6. Other datasets