In [1]:
import pandas as pd
import numpy as np
import yaml
import json
import re
from difflib import SequenceMatcher    # https://docs.python.org/3/library/difflib.html#difflib.get_close_matches
# !pip install cleanco
# import cleanco   # can be useful

In [2]:
def columns_preprocess(df):
    """Return a cleaned copy of a company table.

    Expects the columns ``Company_name``, ``Country``, ``City``, ``Zipcode``
    and ``Address``. The caller's frame is left untouched; the returned frame
    additionally carries ``Companyname_clean`` (lower-cased company name) as
    its second column.

    * ``Country``: cast to str, upper-cased, stripped, stored as category.
    * ``City``: cast to str, lower-cased, stripped, every non-word character
      replaced by a single space.
    * ``Zipcode`` / ``Address``: empty or whitespace-only entries become NaN.

    Casting to ``str`` turns missing Country/City values into literal strings
    (the notebook later tests for ``'NAN'`` / ``'nan'``).
    """
    df = df.astype({"Country": str, "City": str})   # astype returns a new frame
    df["Company_name"] = df["Company_name"].str.strip()
    df["Country"] = df["Country"].str.upper().str.strip().astype("category")
    # The pattern is a regular expression: say so explicitly, because
    # pandas >= 2.0 treats str.replace patterns as literal text by default.
    df["City"] = df["City"].str.lower().str.strip().str.replace(r"\W", " ", regex=True)
    blank = r"^\s*$"
    df["Zipcode"] = df["Zipcode"].replace(blank, np.nan, regex=True)
    df["Address"] = df["Address"].replace(blank, np.nan, regex=True).str.strip()
    df.insert(loc=1, column="Companyname_clean", value=df["Company_name"].str.lower())
    return df

In [3]:
def city_modify(df):
    """Remove every non-word character (spaces, hyphens, apostrophes, ...) from ``df['City']``.

    Works in place and returns None. The pattern is passed as an explicit
    regular expression because pandas >= 2.0 would otherwise look for the
    literal two-character text ``\\W``.
    """
    df["City"] = df["City"].str.replace(r"\W", "", regex=True)

In [4]:
def companyname_clean_modify(df):
    """Remove every non-word character from ``df['Companyname_clean']``.

    Works in place and returns None. Used after suffix removal so that names
    differing only in spacing or punctuation collapse to the same key. The
    pattern is passed as an explicit regular expression because
    pandas >= 2.0 treats ``str.replace`` patterns as literal text by default.
    """
    df["Companyname_clean"] = df["Companyname_clean"].str.replace(r"\W", "", regex=True)

In [5]:
def dict_lowercase(dc):
    """Lower-case every value of ``dc`` and drop its spaces and hyphens.

    A value may be a single string or a list of strings. The dictionary is
    rewritten in place and also returned.
    """
    def _squash(text):
        return text.lower().replace(' ', '').replace('-', '')

    for key in dc.keys():
        entry = dc[key]
        if type(entry) is list:
            dc[key] = [_squash(item) for item in entry]
        else:
            dc[key] = _squash(entry)
    return dc

In [6]:
# Case 1
def city_finder(city):
    """Pull the city out of a ``City`` entry that actually holds a whole address.

    Entries of fewer than three whitespace tokens are returned unchanged.
    For longer entries, the tokens found in the module-level ``geo_cities``
    are concatenated (no separator) and returned; if none is found the
    original entry is returned.
    """
    tokens = city.split()
    if len(tokens) < 3:
        return city
    recognised = ''.join(token for token in tokens if token in geo_cities)
    if recognised == '':
        return city
    return recognised

# Case 2
def country_filler1(city, country):
    """Return the upper-cased ``city`` as the country name, or ``country`` when ``city`` is empty.

    Intended for rows whose City field actually contains a country name; the
    notebook selects such rows before calling this function.

    The previous version also scanned ``country_city_dict`` into a list that
    was never used; that dead lookup is removed and the return value is
    unchanged.
    """
    candidate = city.upper()
    if candidate != '':
        return candidate
    return country


# Case 3
def country_filler2(city, country):
    """Look up the country that lists ``city`` in the module-level ``country_city_dict``.

    The names of all countries whose city list contains ``city`` are
    concatenated; the concatenation is returned only if it is itself a key of
    ``country_city_dict`` (which is the case when exactly one country
    matched). Otherwise the supplied ``country`` is returned.
    """
    owners = []
    for name, cities in country_city_dict.items():
        if city in cities:
            owners.append(name)
    merged = ''.join(owners)
    if merged not in country_city_dict.keys():
        return country
    return merged

In [7]:
def country_mapper(df, df_diff, df_clean, threshold=0.75, reference=None):
    """Rewrite non-standard country names in ``df['Country']`` to their closest reference name.

    Parameters
    ----------
    df : DataFrame whose ``Country`` column is updated in place.
    df_diff : sequence of country names that are absent from the reference data.
    df_clean : the same names, position by position, with filler words removed;
        these are the strings actually compared.
    threshold : minimum ``SequenceMatcher`` ratio for a replacement (default 0.75).
    reference : iterable of ``(original_name, stripped_name)`` pairs. Defaults
        to the module-level ``original_clear_bdata``.

    For every name the reference entry with the highest ratio at or above the
    threshold wins (later entries win ties); names without such an entry are
    kept as they are. Returns None.
    """
    if reference is None:
        reference = original_clear_bdata

    replacements = {}
    for position, raw_name in enumerate(df_diff):
        cleaned_name = df_clean[position]
        best_score = threshold
        best_match = raw_name
        for entry in reference:
            score = SequenceMatcher(None, cleaned_name, entry[1]).ratio()
            if score >= best_score:
                best_score = score
                best_match = entry[0]
        # Filled directly; the dictionary used to be rebuilt from scratch on
        # every iteration of this loop.
        replacements[raw_name] = best_match

    df['Country'] = df['Country'].map(replacements).fillna(df['Country'])

In [8]:
def suffix_remover(companyname, suffixes=None):
    """Cut a legal-form token (``gmbh``, ``ag``, ...) out of a lower-cased company name.

    ``suffixes`` defaults to the module-level ``suffix_list`` and is scanned
    in order; the first entry that matches decides the result:

    * entry equals the first whitespace token -> everything after that token;
    * entry equals any other whitespace token -> everything before it;
    * entry equals a ``-``-separated part -> the parts before it, joined by spaces.

    Only one entry is removed per call, so the notebook applies the function
    repeatedly. Empty or whitespace-only names are returned unchanged (they
    used to raise IndexError, which a second pass could trigger after the
    first pass had reduced a name to ``''``).
    """
    if suffixes is None:
        suffixes = suffix_list

    tokens = companyname.split()
    if not tokens:
        return companyname
    hyphen_parts = companyname.split('-')

    for suffix in suffixes:
        if suffix == tokens[0]:
            return ' '.join(tokens[1:])
        if suffix in tokens:
            return ' '.join(tokens[:tokens.index(suffix)])
        if suffix in hyphen_parts:
            return ' '.join(hyphen_parts[:hyphen_parts.index(suffix)])
    return companyname

In [9]:
def pre_middle_suffix_remove(companyname, fragments=None):
    """Delete every occurrence of each fragment from a whitespace-free company name.

    ``fragments`` defaults to the module-level ``remove_list``. Meant to run
    after special characters and spaces have been removed, when a legal form
    may be glued into the middle of the name.

    All fragments are applied, in list order. The previous version returned
    as soon as the first matching fragment had been removed, so later list
    entries were skipped.
    """
    if fragments is None:
        fragments = remove_list
    for fragment in fragments:
        companyname = companyname.replace(fragment, '')
    return companyname

In [10]:
def mto_columns(df):
    """Add the two "many-to-one" representative columns to ``df`` in place.

    ``Address_mto``: for each (Companyname_clean, Zipcode) group, the first
    address after sorting by Address with missing values last.
    ``Companyname_mto``: for each Companyname_clean group, the first
    Company_name in row order.
    """
    address_sorted = df.sort_values(by=['Address'], na_position='last')
    address_by_site = address_sorted.groupby(['Companyname_clean', 'Zipcode'])['Address']
    df['Address_mto'] = address_by_site.transform('first')

    name_by_company = df.groupby('Companyname_clean')['Company_name']
    df['Companyname_mto'] = name_by_company.transform('first')

In [11]:
# Many to One company finding function
def find_comp(x, y, z, q, records=None, rules=None):
    """Find the many-to-one key(s) of a company from free-text input.

    Parameters
    ----------
    x, y, z, q : company name, country, city and zip code as typed by the
        user. They are stripped; name and city are lower-cased, the country
        is upper-cased.
    records : mapping ``key -> collection of (name, country, city, zip)``
        tuples. Defaults to the module-level ``total``.
    rules : mapping ``long form -> abbreviation`` (e.g. aktiengesellschaft ->
        ag). Defaults to the module-level ``mapper``.

    Returns
    -------
    list of matching keys from the first strategy that finds anything, or the
    string ``'Not available'``.

    Strategies, in order: exact tuple; zip stored as '-'; city stored as '-';
    country stored as '-'; name without spaces; name without its last token.
    If all fail, every rule whose long form occurs in the name as whole
    word(s) is applied and the lookups are repeated with the rewritten name
    (all of the above except the no-spaces variant).

    Lookups are evaluated lazily: later strategies are only scanned when the
    earlier ones found nothing. Multi-word rule keys are now recognised; a
    plain ``key in name.split()`` test could never match them.
    """
    if records is None:
        records = total
    if rules is None:
        rules = mapper

    x = x.strip().lower()   # company name
    y = y.strip().upper()   # country
    z = z.strip().lower()   # city
    q = q.strip()           # zip code

    def _first_hit(candidates):
        # Keys for the first candidate tuple present in any record set.
        for candidate in candidates:
            hits = [key for key, entries in records.items() if candidate in entries]
            if hits:
                return hits
        return []

    def _without_last_token(name):
        # 'fraport ag 2' -> 'fraport ag'
        return ' '.join(name.split()[:-1])

    hits = _first_hit([
        (x, y, z, q),                        # exact match
        (x, y, z, '-'),                      # zip is blank in the data
        (x, y, '-', q),                      # city is blank in the data
        (x, '-', z, q),                      # country is blank in the data
        (x.replace(' ', ''), y, z, q),       # whitespace-free (suffix-stripped) name
        (_without_last_token(x), y, z, q),   # user typed one token too many
    ])
    if hits:
        return hits

    # Manual rules, e.g. Aktiengesellschaft = AG. Padding with spaces turns
    # the substring test into a whole-word (or whole-phrase) test.
    padded_name = ' {} '.format(' '.join(x.split()))
    for long_form, short_form in rules.items():
        phrase = ' '.join(long_form.split())
        if not phrase or ' {} '.format(phrase) not in padded_name:
            continue
        mapped = x.replace(long_form, short_form)
        hits = _first_hit([
            (mapped, y, z, q),
            (mapped, y, z, '-'),
            (mapped, y, '-', q),
            (mapped, '-', z, q),
            (_without_last_token(mapped), y, z, q),
        ])
        if hits:
            return hits

    # Similarity measure later
    return 'Not available'

# 1. Read dataset

In [12]:
# Load the Interflex company list and rename its German column headers to the
# English column names used by the helper functions in this notebook.
ulist = pd.read_excel("Unternehmensliste_Interflex.xlsx")
ulist = ulist.rename(columns={'Firmenname': 'Company_name', 'Land': 'Country', 'Postleitzahl': 'Zipcode', 'Ort': 'City', 'Straße und Hausnr.': 'Address'})
ulist.head()

Unnamed: 0,Company_name,Country,Zipcode,City,Address,USt.-IdNr.,ID
0,Bistum Essen,Deutschland,45127,Essen,Zwölfling 16,,801695.0
1,KBS Kleider Bauer Betriebs-GmbH,Österreich,2380,Perchtoldsdorf,Zwingenstr. 5,,850801.0
2,Elektro Ing-Plan GmbH Dresden,Deutschland,1187,Dresden,Zwickauer Straße 88,,851014.0
3,Netzdesign-Vobornik,Deutschland,71083,Herrenberg,Zwickauer Strasse 41,,
4,NILES-SIMMONS Industrieanlagen GmbH,Deutschland,9117,Chemnitz,Zwickauer Straße 355,DE140853999,802541.0


In [14]:
# Load the Quandl extract, keeping only the listed columns in this order, and
# align the column names with the other datasets.
# NOTE(review): `path` is not defined in any cell shown here (there is no
# In [13]) - confirm it is set before this cell runs on a fresh kernel.
qdata_cols = ['Year', 'Name', 'Country', 'City', 'ZipCode', 'Address']
qdata = pd.read_excel(path + "Quandl_Q3.xlsx", usecols=qdata_cols)[qdata_cols]
qdata = qdata.rename(columns={'Name': 'Company_name', 'ZipCode': 'Zipcode'})
qdata.head()

Unnamed: 0,Year,Company_name,Country,City,Zipcode,Address
0,2021,"& Factory, Inc.",Japan,Meguro,153-0042,3-6-28 Aobadai. Meguro-Ku
1,2020,"& Factory, Inc.",Japan,Meguro,153-0042,3-6-28 Aobadai. Meguro-Ku
2,2019,"& Factory, Inc.",Japan,Meguro,153-0042,3-6-28 Aobadai. Meguro-Ku
3,2018,"& Factory, Inc.",Japan,Meguro,153-0042,3-6-28 Aobadai. Meguro-Ku
4,2021,"&Do Holdings Co., Ltd.",Japan,Kyoto,100-0005,670 Tearaimizu-Cho. Nakagyo-Ku


In [15]:
# Load the BoldData extract, keeping only the listed columns in this order,
# and rename them to the shared column names.
# NOTE(review): relies on the same `path` variable, which is not defined in
# the cells shown here.
bdata_cols = ['company_name', 'country', 'city', 'zip', 'street']
bdata = pd.read_csv(path + "20220831_BoldData_Q3.csv", usecols=bdata_cols)[bdata_cols]
bdata = bdata.rename(columns={'company_name': 'Company_name', 'country': 'Country', 'city': 'City', 'zip': 'Zipcode', 'street': 'Address'})
bdata.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Company_name,Country,City,Zipcode,Address
0,Chaussures Carrefour S.E.N.C,CANADA,GRANBY,J2J 0C2,226 rue Saint-Jude N
1,"Produits Marins F.G.R. Inc, Les",CANADA,L'tang-du-Nord,G4T 3V9,1540 ch Des Caps
2,Catherine Fournier Notaire Inc,CANADA,BEACONSFIELD,H9W 5S3,186 Sutton Pl bureau 1
3,Ferme Cripoval Inc.,CANADA,SAINT-JULES,G0N 1R0,270 Rang Saint-Bruno
4,3181243 Canada Inc.,CANADA,Paspbiac,G0C 2K0,180 Boul Grard-D.-Levesque O


In [16]:
# German -> English country-name dictionary.
# safe_load only builds plain Python objects; calling yaml.load without a
# Loader executes arbitrary tags on old PyYAML and is a TypeError on PyYAML >= 6.
with open('countries_by_namede_modified.yml.txt', 'rt', encoding='utf8') as file:
    yaml_dict = yaml.safe_load(file)

# The YAML maps English -> German; invert it, upper-case both sides and turn
# hyphens into spaces so the keys look like the cleaned Country column.
country_dict = yaml_dict.get('de').get('countries')
country_dict = {y: x for x, y in country_dict.items()}
country_dict = {k.upper():v.upper() for k,v in country_dict.items()}
country_dict = {k.replace('-',' '): v.replace('-',' ') for k, v in country_dict.items()}

# Hand-written additions for spellings that occur in the data but not in the
# YAML file ('\xa0' is a non-breaking space). 'CROATIA_HRVATSKA' previously
# mapped to the misspelling 'CROTIA'.
# NOTE(review): 'UA' is mapped to UNITED ARAB EMIRATES here - confirm this is
# what the data uses, since UA is also the ISO code of Ukraine.
country_dict.update({'MALTA\xa0(MALTA)':'MALTA', 'PUERTO RICO\xa0(PUERTO RICO)': 'PUERTO RICO', 'THAILAND\xa0(THAILAND)': 'THAILAND', 'ISRAEL\xa0(ISRAEL)': 'ISRAEL',
                     'JORDAN\xa0(JORDANIEN)': 'JORDAN', 'MALAYSIA\xa0(MALAYSIA)': 'MALAYSIA', 'GREECE\xa0(GRIECHENLAND)': 'GREECE', 'MACEDONIA\xa0(MAZEDONIEN)': 'MACEDONIA',
                     'KUWAIT\xa0(KUWAIT)': 'KUWAIT', 'SINGAPORE\xa0(SINGAPUR)': 'SINGAPORE', 'QATAR\xa0(KATAR)': 'QATAR', 'KAZAKHSTAN\xa0(KASACHSTAN)': 'KAZAKHSTAN',
                     'JAPAN\xa0(JAPAN)': 'JAPAN', 'SOUTH KOREA\xa0(KOREA, SÜD)': 'SOUTH KOREA', 'VIETNAM\xa0(VIETNAM)': 'VIETNAM', 'SERBIA\xa0(SERBIEN)': 'SERBIA',
                     'PHILIPPINES\xa0(PHILIPPINEN)': 'PHILIPPINES', 'NEW ZEALAND\xa0(NEUSEELAND)': 'NEW ZEALAND', 'RUSSLAND': 'RUSSIAN FEDERATION', 'ESTLAND': 'ESTONIA',
                     'TSCHECHIEN': 'CZECH REPUBLIC', 'CROATIA_HRVATSKA': 'CROATIA', 'VEREINIGTE EMIRATE': 'UNITED ARAB EMIRATES', 'KÖNIGREICH SAUDI-ARABIEN': 'SAUDI ARABIA',
                     'SAUDI ARABIEN': 'SAUDI ARABIA', 'ESTLAND)': 'ESTONIA',
                     'US':'UNITED STATES', 'USA': 'UNITED STATES', 'UK': 'UNITED KINGDOM', 'UA': 'UNITED ARAB EMIRATES'})

  


In [17]:
# World-cities table: one row per city with its country.
country_citydf = pd.read_csv('worldcities.csv', usecols=['country', 'city_ascii'])   #https://simplemaps.com/data/world-cities
country_citydf.head()

# COUNTRY (upper case) -> list of its city names, lower-cased and stripped of
# spaces and hyphens by dict_lowercase.
country_city_dict = dict_lowercase({
    country.upper(): group["city_ascii"].tolist()
    for country, group in country_citydf.groupby("country")
})

geo_countries = [*country_city_dict]                                              # list of all countries
geo_cities = [city for cities in country_city_dict.values() for city in cities]   # list of all cities

# 2. Dataset Specific Operations

## 2.1 Ulist

In [18]:
# Clean the shared columns, then translate German country names to English.
# Names without a dictionary entry keep their original value.
ulist = columns_preprocess(ulist)
ulist['Country'] = ulist['Country'].map(country_dict).fillna(ulist['Country'])   # De-Eng country name mapper
# Snapshot of the rows whose country is missing but whose city is present.
# Missing values are the strings 'NAN' / 'nan' here because columns_preprocess casts both columns to str.
ulist_cases = ulist[(ulist['Country']=='NAN') & (ulist['City']!='nan')]    # all the entries with missing country names

In [19]:
# Row counts for the three missing-value combinations of Country and City
# ('NAN' / 'nan' are the string forms of missing values after preprocessing).
print('Country missing but city is given=', len(ulist[(ulist['Country']=='NAN') & (ulist['City']!='nan')]))
print('City is missing but country is given=', len(ulist[(ulist['Country']!='NAN') & (ulist['City']=='nan')]))
print('Both Country and city missing=', len(ulist[(ulist['Country']=='NAN') & (ulist['City']=='nan')]))

Country missing but city is given= 903
City is missing but country is given= 478
Both Country and city missing= 170


In [20]:
# Cities whose rows carry more than two distinct Country values.
# NOTE(review): the original comment says "more than a country" but the test
# is `> 2`; the placeholder 'NAN' presumably counts as one value - confirm the
# intended threshold.
a = ulist.groupby('City').Country.nunique() > 2     # boolean Series indexed by city
a[a].index.tolist()                    # e.g. schwarzach, burgdorf

# https://stackoverflow.com/questions/54518504/check-if-group-contains-same-value-in-pandas

['burgdorf',
 'deurne',
 'linz',
 'london',
 'nan',
 'neunkirchen',
 'puebla',
 'schwarzach']

In [21]:
# Examples of the four repair cases handled in the next cell (display only).
# Case 1: the City field holds a whole address (three or more tokens).
print('Case 1. City name error:\n\n', ulist[ulist['Companyname_clean'].str.startswith('eqos')])
print('\n{} entries are longer than 2 tokens'.format(len([x for x in ulist['City'].str.split() if len(x)>=3])))

# Case 2: country missing and the City field is itself a country name.
print('\n\nCase 2. City name is actually a country name\n', ulist_cases[ulist_cases['City'].str.upper().isin(geo_countries)].tail(5))    # spain ==> SPAIN

# Case 3: country missing and the city is found in the world-cities dictionary.
print('\n\nCase 3. Find country from city name dictionary:\n', ulist_cases[ulist_cases['City'].isin(geo_cities)].head())   # naila ==> GERMANY

# Case 4: country taken from the most frequent country of rows with the same city.
print('\n\nCase 4. Map most common country grouped by city name\n', ulist[ulist['City']=='valencia']['Country'].value_counts().sort_values(ascending=False))
print('\nExample 2 Moskau:', ulist[ulist['City']=='moskau']['Country'].value_counts().sort_values(ascending=False))
      # other cities of this kind: adelberg, senningerberg, moskau, valencia, haag

Case 1. City name error:

        Company_name Companyname_clean Country Zipcode  \
14351  Eqos Energie      eqos energie     NAN     NaN   
14364          Eqos              eqos     NAN     NaN   

                                                City Address USt.-IdNr.  \
14351  hinterbergerstraße 13  4400 steyr  österreich     NaN        NaN   
14364                                          steyr     NaN        NaN   

           ID  
14351  850821  
14364  850713  

523 entries are longer than 2 tokens


Case 2. City name is actually a country name
                                      Company_name  \
13692            ams-OSRAM Asia Pacific Pte. Ltd.   
13791  TTS Tooltechnic Systems North  America, LP   
13858      Boehringer Ingelheim Singapore Pte Ltd   
13989            Schaeffler (Singapore) Pte. Ltd.   
14363     VIESGO INFRAESTRUCTURAS ENERGETICAS S.L   

                                Companyname_clean Country   Zipcode  \
13692            ams-osram asia pacific pte. ltd.  

In [22]:
# Case 1: City holds a full address -> keep only the recognised city token(s)
ulist['City'] = ulist['City'].apply(city_finder)

# Case 2: City is actually a country name -> copy it (upper-cased) into Country
city_modify(ulist)
# NOTE(review): ulist_cases was taken before city_finder / city_modify ran, so
# the masks below test the uncleaned City strings - confirm this is intended.
case2_index = ulist_cases[ulist_cases['City'].str.upper().isin(geo_countries)].index
ulist.loc[case2_index, 'Country'] =  ulist.loc[case2_index].apply(lambda x: country_filler1(x.City, x.Country), axis=1)   # gonna mask these and continue with next cases

# Case 3: look the city up in the world-cities dictionary
case3_index = ulist_cases[ulist_cases['City'].isin(geo_cities)].index
ulist.loc[case3_index, 'Country'] = ulist.loc[case3_index].apply(lambda x: country_filler2(x.City, x.Country), axis=1)

# Case 4: fill what is still missing with the most frequent country of the same city
ulist['City'] = ulist['City'].replace('nan', np.nan)
ulist['Country'] = ulist['Country'].replace('NAN', np.nan)   # np.NaN alias was removed in NumPy 2.0
case4_index = ulist[ulist['Country'].isna()].index

s = ulist.groupby(['City','Country']).size()                           # rows per (city, country)
df = s.loc[s.groupby(level=0).idxmax()].reset_index().drop(0,axis=1)   # top country per city
city_country_filler = dict(df.values)                                  # city -> country

ulist.loc[case4_index, 'Country'] = ulist.loc[case4_index, 'City'].map(city_country_filler)

In [23]:
# Case 1 check: the two 'eqos' rows shown before the repair.
ulist[ulist['Companyname_clean'].str.startswith('eqos')]   # case 1

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
14351,Eqos Energie,eqos energie,AUSTRIA,,steyr,,,850821
14364,Eqos,eqos,AUSTRIA,,steyr,,,850713


In [24]:
# Case 2 check: last five rows selected by case2_index.
ulist.loc[case2_index].tail(5)    # case 2

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
13692,ams-OSRAM Asia Pacific Pte. Ltd.,ams-osram asia pacific pte. ltd.,SINGAPORE,569877,singapore,7000 Ang Mo Kio Ave 5,,290745
13791,"TTS Tooltechnic Systems North America, LP","tts tooltechnic systems north america, lp",UNITED STATES,IN 46052,lebanon,400 N. Enterprise Blvd.,,804190
13858,Boehringer Ingelheim Singapore Pte Ltd,boehringer ingelheim singapore pte ltd,SINGAPORE,199555,singapore,300 Beach Road,,802212
13989,Schaeffler (Singapore) Pte. Ltd.,schaeffler (singapore) pte. ltd.,SINGAPORE,556741,singapore,151 Lorong Chuan #06-01,,800092
14363,VIESGO INFRAESTRUCTURAS ENERGETICAS S.L,viesgo infraestructuras energeticas s.l,SPAIN,,spain,,,850166


In [25]:
# Case 3 check: first rows selected by case3_index.
ulist.loc[case3_index].head()    # case 3

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
41,SERAG-WIESSNER GmbH & Co. KG,serag-wiessner gmbh & co. kg,GERMANY,95119,naila,Zum Kugelfang 8-12,,
112,Amtsgericht Ulm,amtsgericht ulm,GERMANY,89073,ulm,Zeughausgasse 14,,
148,ALUTEC SYSTEM-TECHNIK GMBH,alutec system-technik gmbh,GERMANY,74385,pleidelsheim,Zeppelinstr. 11,,851837.0
230,Amtsgericht Bad Waldsee,amtsgericht bad waldsee,GERMANY,88339,waldsee,Wurzacher Str. 73,,
253,Aircraft Cabin Modification GmbH,aircraft cabin modification gmbh,GERMANY,87700,memmingen,Woringer Straße 11,,851303.0


In [26]:
# Case 4 check: all rows whose city is 'moskau'.
ulist[ulist['City']=='moskau']    # case 4

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
1198,Aeroflot,aeroflot,RUSSIAN FEDERATION,119002.0,moskau,ul. Arbat 10,,803708
4392,Hyperglobus Globus Hypermarkets,hyperglobus globus hypermarkets,RUSSIAN FEDERATION,125493.0,moskau,Personalabteilung,7743543761.0,160024
7143,Elemash,elemash,RUSSIAN FEDERATION,,moskau,Karl-Marx-Street,,851849
13946,"JSC ""Vnukova Airport""","jsc ""vnukova airport""",RUSSIAN FEDERATION,119027.0,moskau,"1st, Reysovaya Str.",,800439
14359,DME - Flughafen Domodedevo,dme - flughafen domodedevo,RUSSIAN FEDERATION,,moskau,,,803710


## 2.2 Bdata

In [27]:
# Repair rows whose Country cell does not look like an upper-case country name.
# Rows whose 2nd or 3rd Country character is lower case: move City into
# Zipcode and Country into City. Their Country is overwritten further down.
country_err = bdata.loc[(bdata["Country"].str[1].str.islower()) | (bdata["Country"].str[2].fillna('').str.islower())].index

bdata.loc[country_err, "Zipcode"] = bdata.loc[country_err, "City"]
bdata.loc[country_err, "City"] = bdata.loc[country_err, "Country"]

# 1) Country cells that start with the two characters backslash + N
country_err2 = bdata.loc[bdata["Country"].str.startswith('\\N'), "Country"].index
# 2) Country cells that contain a digit. Raw string without a capture group:
#    same matches, but no invalid-escape or match-group warnings.
country_err3 = bdata.loc[bdata["Country"].str.contains(r'\d'), "Country"].index

# Country column rows to be updated
country_update_index = list(country_err) + list(country_err2) + list(country_err3)

# Each affected row takes the Country value of the row labelled index + 1.
# NOTE(review): this assumes a default integer index and that the next row
# exists and holds a valid country - confirm.
country_updater = [x+1 for x in country_update_index]
bdata.loc[country_update_index, "Country"] = bdata.loc[country_updater, "Country"].values

  return func(self, *args, **kwargs)


In [28]:
# Same column clean-up as for ulist, followed by the in-place city normalisation.
bdata = columns_preprocess(bdata)    # preprocess bdata columns
city_modify(bdata)                   # remove non-word characters (incl. whitespace) from city names

In [29]:
# Country cells starting with backslash + N are blanked and then take the value
# of the closest preceding row that has a country.
bdata.loc[bdata["Country"].str.startswith('\\N'), "Country"] = None    # Country name \\N are replaced with None
# .ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas.
bdata['Country'] = bdata['Country'].ffill()                            # Missing country names are filled with the previous notnull entry

## 2.3 Qdata

In [30]:
# Compact dtypes, then keep only the 2021 rows (one year of data) and drop
# the now constant Year column.
qdata = (
    qdata
    .assign(
        Year=qdata.Year.astype("int16"),
        Country=qdata.Country.astype("category"),
    )
    .loc[lambda frame: frame["Year"] == 2021]
    .drop(columns=["Year"])
    .reset_index(drop=True)
)

In [31]:
# Same column clean-up as for the other datasets, followed by the in-place city normalisation.
qdata = columns_preprocess(qdata)    # preprocess qdata columns
city_modify(qdata)                   # remove non-word characters (incl. whitespace) from city names

# 3. Dataset feature standardization

In [32]:
bdata_countries = list(bdata["Country"].unique())             # Ground truth country names from bdata

# Country names that occur in qdata / ulist but not in bdata.
qdatadiff_countries = list(set(qdata["Country"].unique()) - set(bdata["Country"].unique()))
# Drop the missing-value entry explicitly. The list comes from a set, so its
# order is arbitrary and `del ...[0]` could remove a real country instead.
ulistdiff_countries = [name for name in set(ulist["Country"].unique()) - set(bdata["Country"].unique()) if pd.notna(name)]
print('Number of different country names in Qdata data:', len(qdatadiff_countries))
print('Number of different country names in Ulist data:', len(ulistdiff_countries))

Number of different country names in Qdata data: 14
Number of different country names in Ulist data: 7


In [33]:
prefix_postfix = ['OF', 'AND', '&', 'THE', 'REP', 'IS', 'ST', 'SAINT', 'FED', 'N', 'NORTH', 'SOUTH', 'EAST', 'WEST', 'UK', 'FEDERATION', '(BURMA)']    # prefix to be removed


def _strip_affixes(names):
    """Return ``names`` with every whitespace token listed in ``prefix_postfix`` removed."""
    return [' '.join(token for token in name.split() if token not in prefix_postfix)
            for name in names]


# One helper instead of three copies of the same loop.
bdata_clear = _strip_affixes(bdata_countries)
original_clear_bdata = list(zip(bdata_countries, bdata_clear))   # (original, stripped) pairs for the similarity score

qdatadiff_clear = _strip_affixes(qdatadiff_countries)
ulistdiff_clear = _strip_affixes(ulistdiff_countries)
    
# https://stackoverflow.com/questions/25346058/removing-list-of-words-from-a-string

In [34]:
# Replace the non-matching country names of ulist and qdata, in place, with
# the most similar bdata name (bdata is treated as ground truth).
country_mapper(ulist, ulistdiff_countries, ulistdiff_clear)     # Dataset countries are standardized (Ground truth country names are from bdata)
country_mapper(qdata, qdatadiff_countries, qdatadiff_clear)

In [35]:
# Spot check: rows whose cleaned city starts with 'new' (astype(str) guards against NaN cities).
ulist[ulist.City.astype(str).str.startswith('new')]

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID
6950,C/o Boots,c/o boots,UNITED KINGDOM,11111,newcastle,Kingston Retail Park,,88002918
13785,Kaestle Boos Architects,kaestle boos architects,UNITED STATES,11111,newbritain,416 Slater Road,,8400524
13828,RGA Associates,rga associates,UNITED STATES,11111,newyork,350 West 39th St.,,84005233
13904,"Brose New Boston, Inc.","brose new boston, inc.",UNITED STATES,MI 48164,newboston,23400 Bell Road,,802300
13905,Brose North America,brose north america,UNITED STATES,11111,newboston,23400 Bell Rd,,84005296
13911,BMW of North America,bmw of north america,UNITED STATES,11111,newbury,2201 Corporate Center Dr.,,840052112
14180,Continental Automotive Systems,continental automotive systems,UNITED STATES,11111,newport,,,84005297


# 4. Suffix work (Currently only for Ulist data)

In [36]:
# Suffix list for company names (Will be improved).
# suffix_remover scans the list in order and stops at the first hit, so the
# order matters. Entries that contain a space can only match via its
# '-'-split branch, never as a whitespace token.
# The Czech entry 'veř. obch. spol.' used to be written as UTF-8 bytes inside
# a str literal (u've\xc5\x99...'), which is a different, garbled string in Python 3.
suffix_list = ['gmbh', 'gmbh&co.', 'ggmbh', 'gmbh+co.', 'kg-gmbh', 'gmbh&co', 'ltd.', 'se',
      'gmbh&co.kg', 'gmbh&cokg', 'gmbh.', 'gmbh,', 'gmbh&c', '(gmbh', 'mbh',
               'company', 'incorporated', 'corporation', 'corp.', 'corp', 'inc',
      '& co.', '& co',  'inc.', 's.p.a.', 'n.v.', 'a.g.', 'ag', 'nuf', 's.a.', 's.f.',
      'oao', 'co.', 'co',
              'soc.col.', 'stg', 'd.n.o.', 'ltda.', 'v.o.s.', 'a spol.',
      've\u0159. obch. spol.', 'kgaa', 'o.e.', 's.f.', 's.n.c.', 's.a.p.a.', 'j.t.d.',
      'v.o.f.', 'sp.j.', 'og', 'sd', ' i/s', 'ay', 'snc', 'oe', 'bt.', 's.s.', 'mb',
      'ans', 'da', 'o.d.', 'hb', 'pt',
              'unltd', 'ultd', 'sal', 'unlimited', 'saog', 'saoc', 'aj',
      'yoaj', 'oaj', 'akc. spol.', 'a.s.',
              'esv', 'gie', 'kv.', 'qk',
              'pty. ltd.', 'pty ltd', 'ltd', 'l.t.d.', 'bvba', 'd.o.o.', 'ltda', 'gmbh',
      'g.m.b.h', 'kft.', 'kht.', 'zrt.', 'ehf.', 's.a.r.l.', 'd.o.o.e.l.', 's. de r.l.',
      'b.v.', 'tapui',
      'sp. z.o.o.', 'sp. z o.o.', 'spółka z o.o.',
      's.r.l.', 's.l.', 's.l.n.e.', 'ood', 'oy', 'rt.',
      'teo', 'uab', 'scs', 'sprl', 'limited', 'bhd.', 'sdn. bhd.', 'sdn bhd', 'as',
      'lda.', 'tov', 'pp',
              'pllc', 'llc', 'l.l.c.', 'plc.', 'plc', 'hf.', 'oyj',
      'a.e.', 'nyrt.', 'p.l.c.', 'sh.a.', 's.a.', 's.r.l.', 'srl.', 'srl', 'aat', '3at', 'd.d.',
      's.r.o.', 'spol. s r.o.', 's.m.b.a.', 'smba', 'sarl', 'nv', 'sa', 'aps',
      'a/s', 'p/s', 'sae', 'sasu', 'eurl', 'ae', 'cpt', 'as', 'ab', 'asa', 'ooo', 'dat',
      'vat', 'zat', 'mchj', 'a.d.',
              'lllp', 'l.l.l.p.',
              'llp', 'l.l.p.', 'sp.p.', 's.c.a.', 's.c.s.',
              'gmbh & co. kg', 'lp', 'l.p.', 's.c.s.',
      's.c.p.a', 'comm.v', 'k.d.', 'k.d.a.', 's. en c.', 'e.e.', 's.a.s.', 's. en c.',
      'c.v.', 's.k.a.', 'sp.k.', 's.cra.', 'ky', 'scs', 'kg', 'kd', 'k/s', 'ee', 'secs',
      'kda', 'ks', 'kb','kt',
              'sicav',
              'nl',
              'vzw', 'ses.', 'gte.',
              'private', 'pte', 'xk',
              'p.c.', 'vof', 'snc',
              'pllc', 'p.l.l.c.',
              'e.u.', 's.p.', 't:mi', 'tmi', 'e.v.', 'e.c.', 'et', 'obrt',
      'fie', 'ij', 'fop', 'xt']

# Source of company suffixes: https://github.com/psolin/cleanco/blob/master/cleanco/termdata.py

### Find suffix in company name tokens

In [37]:
# Remove one legal-form token per name from the lower-cased company names.
# Re-running this cell removes a further token (see the comment on suffix_remover).
ulist['Companyname_clean'] = ulist['Companyname_clean'].apply(suffix_remover)
ulist['Companyname_clean'].head(10)

In [54]:
# Demonstration of a bug that has been fixed: locating the suffix by character
# position finds the 'ag' inside 'volkswagen', whereas locating it by token
# position cuts at the real suffix.
companyname='volkswagen financial services ag'
i='ag'
if i in companyname.split():
    print(companyname[:companyname.index(i)-1])    # character index -> 'volks'
    
if i in companyname.split():
    print(' '.join(companyname.split()[:companyname.split().index(i)]))    # token index -> full name without suffix

volks
volkswagen financial services


In [55]:
# Demonstration of the prefix rule: a legal form in first position is dropped
# and the rest of the name is kept; elsewhere the name is cut before it.
companyname='as watson'
pref=['as', 'gmbh', 'ds']
  
for i in pref:
    if i == companyname.split()[0]:
        print(' '.join(companyname.split()[companyname.split().index(i)+1:]))
    elif i in companyname.split():
        print(' '.join(companyname.split()[:companyname.split().index(i)]))

watson


### Find suffix and prefix in company name tokens

In [56]:
# Number of cleaned ulist names that still contain the substring 'gmbh'.
print('Remaining suffix as gmbh (most common):', len(ulist[ulist['Companyname_clean'].str.contains('gmbh')]))

Remaining suffix as gmbh (most common): 0


In [38]:
# Strip all non-word characters from the cleaned names of the three datasets (in place).
companyname_clean_modify(ulist)       # remove whitespace in suffix&prefix removed companynames
companyname_clean_modify(bdata)
companyname_clean_modify(qdata)

In [57]:
# Original vs. cleaned name of one row (label 554) whose name contains 'GmbH' in the middle.
print('Original Companyname:', ulist.loc[554, 'Company_name'])   # An Example with GmbH
print('Cleaned Companyname:', ulist.loc[554, 'Companyname_clean'])

Original Companyname: Technimark-Eisbär GmbHKunststoff- und Metallverarb.
Cleaned Companyname: technimarkeisbärkunststoffundmetallverarb


In [39]:
# Fragments to delete wherever they occur inside the whitespace-free names.
remove_list = ['gmbh']  # Remove from companynames (Will be extended)

ulist['Companyname_clean'] = ulist['Companyname_clean'].apply(pre_middle_suffix_remove)
# astype(str) because bdata names may contain non-string values (the check further down uses na=False for bdata)
bdata['Companyname_clean'] = bdata['Companyname_clean'].astype(str).apply(pre_middle_suffix_remove)
qdata['Companyname_clean'] = qdata['Companyname_clean'].apply(pre_middle_suffix_remove)

In [40]:
# Second, identical pass: removes any fragment that only appears once the
# first pass has joined the text on either side of a deleted fragment.
ulist['Companyname_clean'] = ulist['Companyname_clean'].apply(pre_middle_suffix_remove)
bdata['Companyname_clean'] = bdata['Companyname_clean'].astype(str).apply(pre_middle_suffix_remove)
qdata['Companyname_clean'] = qdata['Companyname_clean'].apply(pre_middle_suffix_remove)

In [43]:
# Number of cleaned names per dataset (ulist, bdata, qdata - in that order)
# that still contain 'gmbh'; na=False counts missing bdata names as "no".
print('Remaining suffix as gmbh (most common):', len(ulist[ulist['Companyname_clean'].str.contains('gmbh')]))
print('Remaining suffix as gmbh (most common):', len(bdata[bdata['Companyname_clean'].str.contains('gmbh', na=False)]))
print('Remaining suffix as gmbh (most common):', len(qdata[qdata['Companyname_clean'].str.contains('gmbh')]))

Remaining suffix as gmbh (most common): 0
Remaining suffix as gmbh (most common): 0
Remaining suffix as gmbh (most common): 0


# 5. Many-to-One Dictionary

In [44]:
# Add Address_mto and Companyname_mto to ulist (in place); the other two
# datasets are not processed yet.
mto_columns(ulist)
# mto_columns(bdata)
# mto_columns(qdata)

In [45]:
# Simple example: rows whose cleaned name starts with 'aurubis', with their representative address and name.
ulist[ulist['Companyname_clean'].str.startswith('aurubis')]   # simple example

Unnamed: 0,Company_name,Companyname_clean,Country,Zipcode,City,Address,USt.-IdNr.,ID,Address_mto,Companyname_mto
717,AURUBIS BELGIUM,aurubisbelgium,BELGIUM,2250,olen,WATERTORENSTRAAT 35,,320482.0,WATERTORENSTRAAT 35,AURUBIS BELGIUM
8016,Aurubis AG,aurubis,GERMANY,20539,hamburg,Hovestrasse 50,,,Hovestr. 50,Aurubis AG
8017,Aurubis AG,aurubis,GERMANY,20539,hamburg,Hovestr. 50,DE118514155,803007.0,Hovestr. 50,Aurubis AG
8018,Aurubis AG,aurubis,GERMANY,20539,hamburg,Hovestr. 50,DE118514155,160616.0,Hovestr. 50,Aurubis AG


In [46]:
# Rows per many-to-one key, largest groups first, limited to the first 516 groups.
ulist.groupby(['Companyname_clean', 'Country', 'City', 'Zipcode', 'Address_mto', 'Companyname_mto']).size().sort_values(ascending=False)[:516]  # 515 companies have multiple entries

Companyname_clean                  Country  City                 Zipcode  Address_mto                  Companyname_mto                                         
fraport                            GERMANY  frankfurt            60547    Airport Security Management  FRAPORT AG                                                  5
hiltideutschland                   GERMANY  kaufering            86916    Hiltistr. 2                  Hilti Deutschland AG                                        5
sappipapierholding                 AUSTRIA  gratkorn             8101     BRUCKER STRASSE 21           Sappi Papier Holding GmbH                                   4
interflexdatensysteme              GERMANY  stuttgart            70567    Epplestr. 225 Haus 3         Interflex Datensysteme GmbH                                 4
dhlhubleipzig                      GERMANY  schkeuditz           4435     Hermann-Koehl-Strasse 1      DHL Hub Leipzig GmbH c/o DHL Express Network                3
               

In [47]:
# Build, for every many-to-one key, the list of (name, country, city, zip)
# tuples of its rows - once with the cleaned name, once with the original.
_mto_key_columns = ['Companyname_clean', 'Country', 'City', 'Zipcode', 'Address_mto', 'Companyname_mto']

# Missing values become '-' so that the rows survive the groupby and can be
# found by find_comp's "blank field" lookups. Zipcode is filled BEFORE the
# cast to str: after astype(str) a missing value is already the text 'nan'
# and fillna has nothing left to fill.
_ulist_filled = ulist.assign(
    City=lambda ulista: ulista.City.fillna("-"),
    Country=lambda ulista: ulista.Country.fillna("-"),
    Zipcode=lambda ulista: ulista.Zipcode.fillna("-").astype(str),
    Address_mto=lambda ulista: ulista.Address_mto.fillna("-"),
)
_ulist_grouped = _ulist_filled.groupby(_mto_key_columns)


def _rows_as_tuples(g):
    """Return the rows of group ``g`` as a list of tuples."""
    return list(map(tuple, g.values.tolist()))


cleanname_values = _ulist_grouped[['Companyname_clean',  'Country', 'City', 'Zipcode']].apply(_rows_as_tuples).to_dict()

originalname_values = _ulist_grouped[['Company_name',  'Country', 'City', 'Zipcode']].apply(_rows_as_tuples).to_dict()

# Keys are the many-to-one tuples; values are the rows with the simplified
# name (cleanname_values) or the original name (originalname_values).

In [48]:
# Dictionary of input-output pairs used by find_comp:
#   many-to-one key -> set of (lower-cased name, country, city, zip) tuples,
# combining the original and the cleaned spelling of every row. Using a set
# removes duplicate tuples; lower-casing lets find_comp compare against
# lower-cased user input.
# The membership test is against cleanname_values (it used to test
# originalname_values against itself, which is always true).
total = {
    key: {
        (name.lower(), country, city, zipcode)
        for name, country, city, zipcode in originalname_values[key] + cleanname_values[key]
    }
    for key in originalname_values
    if key in cleanname_values
}

In [51]:
# Complex example: one cleaned name spread over several zip codes and addresses.
# The slice positions are hard-coded for the dataset used when this was written.
# ulist[ulist['Companyname_clean']=='freieundhansestadthamburg']    # complex example
{k: originalname_values[k] for k in list(originalname_values)[4923:4928]}
#[k for k, v in total.items() if ('freieundhansestadthamburg', 'GERMANY', 'hamburg', '20095') in v]   # include zipcode for input
#[k for k, v in total.items() if ('freieundhansestadthamburg', 'GERMANY', 'hamburg', '20097') in v]

{('freieundhansestadthamburg',
  'GERMANY',
  'hamburg',
  '20095',
  'Finanzbehörde Hamburg',
  'Freie und Hansestadt Hamburg'): [('Freie und Hansestadt Hamburg',
   'GERMANY',
   'hamburg',
   '20095'),
  ('Freie und Hansestadt Hamburg', 'GERMANY', 'hamburg', '20095')],
 ('freieundhansestadthamburg',
  'GERMANY',
  'hamburg',
  '20097',
  'Schulbau Hamburg',
  'Freie und Hansestadt Hamburg'): [('Freie und Hansestadt Hamburg',
   'GERMANY',
   'hamburg',
   '20097')],
 ('freieundhansestadthamburg',
  'GERMANY',
  'hamburg',
  '20354',
  'Kulturbehörde Hamburg',
  'Freie und Hansestadt Hamburg'): [('Freie und Hansestadt Hamburg',
   'GERMANY',
   'hamburg',
   '20354')],
 ('freieundhansestadthamburg',
  'GERMANY',
  'hamburg',
  '22222',
  'Behörde f. Stadtentw. u. Wohne',
  'Freie und Hansestadt Hamburg'): [('Freie und Hansestadt Hamburg',
   'GERMANY',
   'hamburg',
   '22222'),
  ('Freie und Hansestadt Hamburg', 'GERMANY', 'hamburg', '22222')],
 ('freieundhansestadthamburgbehfürarbe

In [52]:
# Manual rewrite rules used by find_comp: long form -> abbreviation (Will be improved)
suffix_specific = {'aktiengesellschaft': 'ag', 'gesellschaft mit beschränkter haftung': 'gmbh'}
# TODO: extend from https://en.wikipedia.org/wiki/List_of_legal_entity_types_by_country

company_specific = {'volkswagen': 'vw'}
# Merged rule set; on duplicate keys company_specific would win.
mapper = {**suffix_specific, **company_specific}

In [53]:
# Demonstration: each rule whose key is a whitespace token of the name is
# applied on its own, giving one rewritten variant per matching rule.
companyname='volkswagen financial services aktiengesellschaft'

for i in mapper.keys():
    if i in companyname.split():
        print(companyname.replace(i, mapper[i]))

volkswagen financial services ag
vw financial services aktiengesellschaft


In [43]:
# Some examples with different cases (arguments: company name, country, city, zip code)
print(find_comp('1Mustermann AG1', 'Germany', '-', '31135'))
print(find_comp('1Mustermann AG1 ', 'Germany', 'Frankfurt', '31135'))   # '1Mustermann' is not located in Frankfurt, but its entry has no city

print(find_comp('FRAPORT AG', 'Germany', 'Frankfurt', '60547'))
print(find_comp('FRAPORT AG', 'Germany', '-', '60547'))

print(find_comp('Volkswagen Financial Services aktiengesellschaft', 'Germany', 'braunschweig', '38112'))   # AG = Aktiengesellschaft
print(find_comp('Volkswagen Financial Services ag 2', 'Germany', 'braunschweig', '38112'))   # returns the original key
print(find_comp('Volkswagen Financial Services aktiengesellschaft 2', 'Germany', 'braunschweig', '38112'))
print(find_comp('Volkswagen Financial Services', 'Germany', 'braunschweig', '38112'))   # 3 options: remove whitespace, add the suffix, or containment in 'Volkswagen Financial Services ag'
print(find_comp('VolkswagenFinancialServices', 'Germany', 'braunschweig', '38112'))
print(find_comp('VolkswagenFinancialService', 'Germany', 'braunschweig', '38112'))   # needs a similarity measure (later)

[('1mustermannag1', 'GERMANY', '-', '31135', 'Musterstr. 7', '1Mustermann AG1')]
[('1mustermannag1', 'GERMANY', '-', '31135', 'Musterstr. 7', '1Mustermann AG1')]
[('fraport', 'GERMANY', 'frankfurt', '60547', 'Airport Security Management', 'FRAPORT AG')]
Not available
[('volkswagenfinancialservices', 'GERMANY', 'braunschweig', '38112', 'IH-IMC', 'Volkswagen Financial Services AG')]
[('volkswagenfinancialservices', 'GERMANY', 'braunschweig', '38112', 'IH-IMC', 'Volkswagen Financial Services AG')]
[('volkswagenfinancialservices', 'GERMANY', 'braunschweig', '38112', 'IH-IMC', 'Volkswagen Financial Services AG')]
[('volkswagenfinancialservices', 'GERMANY', 'braunschweig', '38112', 'IH-IMC', 'Volkswagen Financial Services AG')]
[('volkswagenfinancialservices', 'GERMANY', 'braunschweig', '38112', 'IH-IMC', 'Volkswagen Financial Services AG')]
Not available
