# 1. Preprocessing

In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
# Load the external-parties training set and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which usually means
# the CSV was written together with its index — confirm whether it should be
# read with index_col=0 or dropped.
df = pd.read_csv("data/external_parties_train.csv")
df.head()

Unnamed: 0,transaction_reference_id,party_role,party_info_unstructured,parsed_name,parsed_address_street_name,parsed_address_street_number,parsed_address_unit,parsed_address_postal_code,parsed_address_city,parsed_address_state,parsed_address_country,party_iban,party_phone,external_id
0,04ff0d1c680189e3a80c92d86407f0f5,BENE,mary mith 107 107 angela brooks n. thomasfurt ...,mary mith,angela brooks,107 107,,,n. thomasfurt,,bulgaria,GB49MYOB82127728573340,+1.815660-6791x8486,50039037
1,439ab0ad7380e6135ab2ff3fddd4a727,ORG,yesneia kim north michael 93971 koribati,yesneia kim,north michael,,,93971,koribati,,,,0 (269)620-8734x2349,60044692
2,00cac12d41191a84f9e31aa731a83512,ORG,w. roberson jr. 41010 rachel crossingapt. 923 ...,w. roberson jr.,rachel crossingapt.,41010 923,,p2235417,thompsonshire amyport,,,GB08OTHR53515837682953,,30008244
3,e4fba5f878dd3453e35973605a783a16,BENE,azquez-nelson co. suarez ports suite & 024 bri...,azquez-nelson co.,ports suite &,,,,brittanyberg,,bulgaria bulgaria,GB17VVGW66321494633280,,40017944
4,d03d7e4c31878b0255d39e8c3f0ab625,ORG,m.j. bytd iii 856 john lake s. glenn cocos (ke...,m.j. bytd iii,john lake s. glenn,856,,125838276,cocos (keeling),islands,,,(260)3371534,40012658


In [3]:
from normalize_text import *

# Clean names
# (clean_text_df comes from normalize_text; judging by the before/after
# previews it strips punctuation such as "." and "&" and turns "-" into a
# space. The exact effect of the `name` flag is not visible here — TODO confirm.)

df = clean_text_df(df, "parsed_name", name=True)
# Clean addresses (same helper, applied to the street name with name=False)

df = clean_text_df(df, "parsed_address_street_name", name=False)

# Words to delete from names: honorifics, generational suffixes, academic /
# professional titles, and the connector "and".
# NOTE(review): "v" is a single letter, so a lone initial "v" would presumably
# be removed as well — confirm this is intended.
remove = [
    "mrs",
    "sr",
    "jr",
    "dr",
    "mr",
    "iii",
    "ii",
    "and",
    "phd",
    "iv",
    "v",
    "md",
    "dds",
    "dvm",
]

# NOTE(review): the printed names further down still carry a trailing space
# after removal (e.g. "w roberson "), so remove_words does not appear to
# re-strip the result — confirm and strip if needed.
df = remove_words(df, "parsed_name", remove)

# 2. Feature Engineering

In [4]:
# Preview the frame after the name/address cleaning step.
df.head()

Unnamed: 0,transaction_reference_id,party_role,party_info_unstructured,parsed_name,parsed_address_street_name,parsed_address_street_number,parsed_address_unit,parsed_address_postal_code,parsed_address_city,parsed_address_state,parsed_address_country,party_iban,party_phone,external_id
0,04ff0d1c680189e3a80c92d86407f0f5,BENE,mary mith 107 107 angela brooks n. thomasfurt ...,mary mith,angela brooks,107 107,,,n. thomasfurt,,bulgaria,GB49MYOB82127728573340,+1.815660-6791x8486,50039037
1,439ab0ad7380e6135ab2ff3fddd4a727,ORG,yesneia kim north michael 93971 koribati,yesneia kim,north michael,,,93971,koribati,,,,0 (269)620-8734x2349,60044692
2,00cac12d41191a84f9e31aa731a83512,ORG,w. roberson jr. 41010 rachel crossingapt. 923 ...,w roberson,rachel crossingapt,41010 923,,p2235417,thompsonshire amyport,,,GB08OTHR53515837682953,,30008244
3,e4fba5f878dd3453e35973605a783a16,BENE,azquez-nelson co. suarez ports suite & 024 bri...,azquez nelson co,ports suite,,,,brittanyberg,,bulgaria bulgaria,GB17VVGW66321494633280,,40017944
4,d03d7e4c31878b0255d39e8c3f0ab625,ORG,m.j. bytd iii 856 john lake s. glenn cocos (ke...,mj bytd,john lake s glenn,856,,125838276,cocos (keeling),islands,,,(260)3371534,40012658


In [5]:
import jellyfish


def soundex(name):
    """Return the Soundex code of ``name`` as computed by ``jellyfish.soundex``.

    Missing values (anything ``pd.isnull`` reports as null) and the empty
    string yield ``""`` instead of being handed to jellyfish.
    """
    has_text = not pd.isnull(name) and name != ""
    return jellyfish.soundex(name) if has_text else ""


def metaphone(name):
    """Return the Metaphone code of ``name`` as computed by ``jellyfish.metaphone``.

    Missing values (anything ``pd.isnull`` reports as null) and the empty
    string yield ``""`` instead of being handed to jellyfish.
    """
    has_text = not pd.isnull(name) and name != ""
    return jellyfish.metaphone(name) if has_text else ""


def nysiis(name):
    """Return the NYSIIS code of ``name`` as computed by ``jellyfish.nysiis``.

    Missing values (anything ``pd.isnull`` reports as null) and the empty
    string yield ``""`` instead of being handed to jellyfish.
    """
    has_text = not pd.isnull(name) and name != ""
    return jellyfish.nysiis(name) if has_text else ""


# Phonetic encodings of the full cleaned name, one column per algorithm
df["name_soundex"] = df["parsed_name"].map(soundex)
df["name_metaphone"] = df["parsed_name"].map(metaphone)
df["name_nysiis"] = df["parsed_name"].map(nysiis)

# Show each name next to its three encodings
print(df.loc[:, ["parsed_name", "name_soundex", "name_metaphone", "name_nysiis"]])

            parsed_name name_soundex name_metaphone      name_nysiis
0             mary mith         M653          MR M0         MARY NAT
1           yesneia kim         Y252         YSN KM        YASNA CAN
2           w roberson          W616         RBRSN       W RABARSAN 
3      azquez nelson co         A225    ASKS NLSN K   ASGAS NALSAN C
4              mj bytd          M213        MJ BTT          MJ BYTD 
...                 ...          ...            ...              ...
11059   james alvarado          J524     JMS ALFRT   JANAS ALVARADA 
11060        marcnguyen         M625        MRKNKYN       MARCNGAYAN
11061      joseph davis         J213        JSF TFS        JASAF DAV
11062       gonzalezltd         G524       KNSLSLTT      GANSALASLTD
11063    simmons conway         S552       SMNS KNW     SANANS CANWY

[11064 rows x 4 columns]


In [6]:
import pandas as pd


# Legal-form abbreviations and generic business words whose presence, as a
# whole word, marks a name as a company. Kept as a set so duplicates cannot
# creep in.
_COMPANY_KEYWORDS = frozenset(
    {
        "ltd",
        "inc",
        "co",
        "corp",
        "llc",
        "plc",
        "limited",
        "incorporated",
        "company",
        "corporation",
        "gmbh",
        "kg",
        "llp",
        "pte",
        "pty",
        "sa",
        "sarl",
        "bv",
        "nv",
        "ag",
        "oy",
        "oyj",
        "ab",
        "spa",
        "srl",
        "sas",
        "kft",
        "ks",
        "sp",
        "group",
        "holdings",
        "partners",
        "associates",
        "international",
        "global",
    }
)

# One alternation compiled once, instead of building and running a separate
# pattern per keyword for every row. The \b anchors keep the match to whole
# words, so "gonzalezltd" is not flagged while "flores ltd" is.
_COMPANY_KEYWORD_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(word) for word in sorted(_COMPANY_KEYWORDS)) + r")\b"
)


def is_company(name):
    """Flag whether a party name looks like a company.

    Parameters
    ----------
    name : str or null-like
        The party name. Matching is case-sensitive against lowercase
        keywords, so the name is expected to be lowercase already.

    Returns
    -------
    int
        1 if any company keyword occurs in ``name`` as a whole word,
        otherwise 0. Missing values and the empty string return 0, i.e. the
        party is assumed to be an individual.
    """
    if pd.isnull(name) or name == "":
        return 0  # Assume individual if name is missing
    return 1 if _COMPANY_KEYWORD_RE.search(name) else 0


# Apply the function to create the 'is_company' feature
df["is_company"] = df["parsed_name"].map(is_company)

# Show each name next to its company flag
print(df.loc[:, ["parsed_name", "is_company"]])

            parsed_name  is_company
0             mary mith           0
1           yesneia kim           0
2           w roberson            0
3      azquez nelson co           1
4              mj bytd            0
...                 ...         ...
11059   james alvarado            0
11060        marcnguyen           0
11061      joseph davis           0
11062       gonzalezltd           0
11063    simmons conway           0

[11064 rows x 2 columns]


In [7]:
# Preview the first 15 rows, now including the phonetic codes and company flag.
df.head(15)

Unnamed: 0,transaction_reference_id,party_role,party_info_unstructured,parsed_name,parsed_address_street_name,parsed_address_street_number,parsed_address_unit,parsed_address_postal_code,parsed_address_city,parsed_address_state,parsed_address_country,party_iban,party_phone,external_id,name_soundex,name_metaphone,name_nysiis,is_company
0,04ff0d1c680189e3a80c92d86407f0f5,BENE,mary mith 107 107 angela brooks n. thomasfurt ...,mary mith,angela brooks,107 107,,,n. thomasfurt,,bulgaria,GB49MYOB82127728573340,+1.815660-6791x8486,50039037,M653,MR M0,MARY NAT,0
1,439ab0ad7380e6135ab2ff3fddd4a727,ORG,yesneia kim north michael 93971 koribati,yesneia kim,north michael,,,93971,koribati,,,,0 (269)620-8734x2349,60044692,Y252,YSN KM,YASNA CAN,0
2,00cac12d41191a84f9e31aa731a83512,ORG,w. roberson jr. 41010 rachel crossingapt. 923 ...,w roberson,rachel crossingapt,41010 923,,p2235417,thompsonshire amyport,,,GB08OTHR53515837682953,,30008244,W616,RBRSN,W RABARSAN,0
3,e4fba5f878dd3453e35973605a783a16,BENE,azquez-nelson co. suarez ports suite & 024 bri...,azquez nelson co,ports suite,,,,brittanyberg,,bulgaria bulgaria,GB17VVGW66321494633280,,40017944,A225,ASKS NLSN K,ASGAS NALSAN C,1
4,d03d7e4c31878b0255d39e8c3f0ab625,ORG,m.j. bytd iii 856 john lake s. glenn cocos (ke...,mj bytd,john lake s glenn,856,,125838276,cocos (keeling),islands,,,(260)3371534,40012658,M213,MJ BTT,MJ BYTD,0
5,948ae2b2fa0a7cafb33978779f443429,BENE,flores ltd 0169 dustin mount apt. 843 port dou...,flores ltd,dustin mount apt,843,,,port douglas new adam,,libyan arab jamahiriya,GB98QFKA10998873694870,(+880) +1320410-4875,10420140,F462,FLRS LTT,FLARAS LTD,1
6,9ac9861a0d0b133ac3cb122311fe3e28,BENE,christopher guutierrez 171 glenn plaza thomasf...,christopher guutierrez,glenn plaza,171,,d453602,thomasfort,,korea,,,40006918,C623,XRSTFR KTRS,CRASTAFAR GATAR,0
7,0d3400eb3113eb800604c1a7a1f2a6b3,BENE,kyle clark p.o. box 650 1876 illiajs plain sui...,kyle clark,illiajs plain suite,1876 880,,24073,jamesmouth blackland,,france,GB32CEZH43283581987924,"0 522271,.0649x.8382",25021900,K424,KL KLRK,CYLA CLARC,0
8,811c8949aa11f7bcd19f139e9f7d58ae,BENE,phillips incorporated 201 donna union suite 252,phillips incorporated,donna union suite,201 252,,,,,,,240-6293662,25021951,P412,FLPS INKRPRTT,FALAPS ANCARPARATAD,1
9,4181e1fcc4d4b029134f7b38ace018c2,BENE,"guerrero, fisher and durham 8206 jill ridge",guerrero fisher durham,,,,8206,jill ridge,,,,0505-337-4623,10273918,G661,KRR FXR TRHM,GARARA FASAR DARAN,0


In [8]:
# Split parsed_name into given name(s) and surname
def split_name(name):
    """Split a full name into given name(s) and surname.

    The last whitespace-separated token is taken as the surname; all tokens
    before it, joined by single spaces, form the given name. A missing,
    empty, or whitespace-only name gives two empty strings, and a
    single-token name gives an empty given name.

    Returns a ``pd.Series`` with the keys ``"given_name"`` and ``"surname"``.
    """
    tokens = [] if pd.isnull(name) else name.split()
    if not tokens:
        return pd.Series({"given_name": "", "surname": ""})
    *leading, last = tokens
    return pd.Series({"given_name": " ".join(leading), "surname": last})


# Apply the function to create 'given_name' and 'surname' columns
df[["given_name", "surname"]] = df["parsed_name"].apply(split_name)

# Phonetic encodings of the surname alone
df["surname_soundex"] = df["surname"].map(soundex)
df["surname_metaphone"] = df["surname"].map(metaphone)
df["surname_nysiis"] = df["surname"].map(nysiis)

# Character count of the surname
df["surname_length"] = df["surname"].map(len)

# Show the source name next to every surname-derived feature
print(
    df.loc[
        :,
        [
            "parsed_name",
            "given_name",
            "surname",
            "surname_soundex",
            "surname_metaphone",
            "surname_nysiis",
            "surname_length",
        ],
    ]
)

            parsed_name     given_name      surname surname_soundex  \
0             mary mith           mary         mith            M300   
1           yesneia kim        yesneia          kim            K500   
2           w roberson               w     roberson            R162   
3      azquez nelson co  azquez nelson           co            C000   
4              mj bytd              mj         bytd            B300   
...                 ...            ...          ...             ...   
11059   james alvarado           james     alvarado            A416   
11060        marcnguyen                  marcnguyen            M625   
11061      joseph davis         joseph        davis            D120   
11062       gonzalezltd                 gonzalezltd            G524   
11063    simmons conway        simmons       conway            C500   

      surname_metaphone surname_nysiis  surname_length  
0                    M0            MAT               4  
1                    KM          

In [9]:
# Write the first 15 rows (without the index) to a small sample file.
df.head(15).to_csv("example.csv", index=False)