In [1]:
import random

import numpy as np
import pandas as pd
import torch

import pytorch_warmup as warmup
from pandarallel import pandarallel

# Start pandarallel so DataFrame.parallel_apply is available, with a progress bar per worker.
pandarallel.initialize(progress_bar=True)

# Show up to 500 characters per cell when DataFrames are rendered.
pd.set_option("display.max_colwidth", 500)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from unidecode import unidecode
import re

def set_seed(seed=None):
    """Seed every random number generator in use to make results reproducible.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices). ``0`` is a valid seed and is applied
            like any other. When ``None`` (the default), nothing is seeded
            and cuDNN deterministic mode is switched off instead.
    """
    if seed is None:
        # torch.manual_seed() rejects None, so "no seed" only turns
        # deterministic mode off and leaves every generator untouched.
        torch.backends.cudnn.deterministic = False
        return

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Force deterministic cuDNN kernels and disable the autotuner, which may
    # otherwise select different algorithms from one run to the next.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def clean_unit(text):
    """Normalise a text value to lowercase ASCII words separated by single spaces.

    The value is transliterated with ``unidecode``, lowercased, stripped of
    apostrophes/backticks, and every other run of non-alphanumeric characters
    becomes one space. Digit runs are separated from neighbouring letters.

    Returns:
        The cleaned string (possibly empty).
    """
    normalized = str(unidecode(text)).lower()

    # Apostrophes and backticks are deleted rather than turned into spaces.
    for quote_char in ("'", "`"):
        normalized = normalized.replace(quote_char, "")

    # Any remaining run of non-alphanumeric characters becomes a single space.
    normalized = re.sub(r"[^a-zA-Z0-9]+", " ", normalized)

    # Surround each number with spaces. The optional decimal part of the
    # pattern cannot match here because "." was already replaced above.
    normalized = re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1 ", normalized)

    # split/join collapses repeated whitespace and trims both ends.
    return " ".join(normalized.split())


def text_preprocess(text):
    """Clean a single text value, falling back to ``"unknown"`` on failure.

    Args:
        text: Value to pass to ``clean_unit``.

    Returns:
        The cleaned string, or the literal ``"unknown"`` when ``clean_unit``
        raises an ordinary exception for this value.
    """
    try:
        return clean_unit(text)
    except Exception:
        # Best-effort cleaning: a value that cannot be processed gets a
        # placeholder. Only Exception is caught, so KeyboardInterrupt and
        # SystemExit still stop a long-running apply.
        return "unknown"

def preprocessing_data(df):
    """Normalise the block, gender, age and name columns of a pairs frame.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing the ``*_pes`` and ``*_sp`` columns
            ``blok``, ``jk``, ``umur``, ``nama`` and ``nama_krt``.

    Returns:
        The same DataFrame with block and age columns as integer strings,
        gender codes mapped to ``'pria'`` / ``'wanita'`` (anything else
        becomes ``''``) and name columns passed through ``text_preprocess``.
    """
    gender_labels = {'1': 'pria', '2': 'wanita'}

    df['blok_pes'] = df['blok_pes'].astype(int).astype(str)
    # Read from blok_sp itself; this used to copy blok_pes over blok_sp.
    df['blok_sp'] = df['blok_sp'].astype(int).astype(str)

    # NOTE(review): jk_pes is not cast to int first, unlike jk_sp, so a float
    # code such as 1.0 would map to ''. Left as is; confirm the column dtype.
    df['jk_pes'] = df['jk_pes'].astype(str).apply(lambda code: gender_labels.get(code, ''))
    df['jk_sp'] = df['jk_sp'].astype(int).astype(str).apply(lambda code: gender_labels.get(code, ''))

    df['umur_pes'] = df['umur_pes'].astype(int).astype(str)
    df['umur_sp'] = df['umur_sp'].astype(int).astype(str)

    text_cols = ['nama_pes','nama_krt_pes','nama_sp','nama_krt_sp']
    for col in text_cols:
        # Row-wise parallel apply over a one-column frame (pandarallel).
        df[col] = df[[col]].parallel_apply(lambda x: text_preprocess(x[col]),axis=1)
    
    # df['p1_ent'] = "[COL] nama [VAL] "+df['nama_pes']+" [COL] nama kepala keluarga [VAL] "+df['nama_krt_pes']+" [COL] umur [VAL] "+df['umur_pes']+" [COL] jenis kelamin [VAL] "+df['jk_pes']
    # df['p2_ent'] = "[COL] nama [VAL] "+df['nama_sp']+" [COL] nama kepala keluarga [VAL] "+df['nama_krt_sp']+" [COL] umur [VAL] "+df['umur_sp']+" [COL] jenis kelamin [VAL] "+df['jk_sp']
    
    return df

In [3]:
# Load the labelled pairs, keep only rows flagged is_match == 1, drop the
# 'weight' column, then shuffle reproducibly before preprocessing.
df = (
    pd.read_csv("/home/amanda-putra/20221220_REGSOSEK/df v0.1 - Pairs.csv")
    .loc[lambda pairs: pairs['is_match'].isin([1])]
    .drop(columns='weight')
    .sample(frac=1, random_state=2022)
    .reset_index(drop=True)
)
df = preprocessing_data(df)
df.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4364), Label(value='0 / 4364'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4364), Label(value='0 / 4364'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4364), Label(value='0 / 4364'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4364), Label(value='0 / 4364'))), …

Unnamed: 0,blok_pes,id_ruta_pes,id_art_pes,no_kk_pes,nik_pes,nama_pes,nama_krt_pes,jk_pes,umur_pes,blok_sp,id_ruta_sp,id_art_sp,no_kk_sp,nik_sp,nama_sp,nama_krt_sp,jk_sp,umur_sp,is_match
0,20,C8E4803A-5A5D-4409-8884-D9CBB4BAA26E,2,9971042000000000.0,9971042000000000.0,zio putra fadilah,ubaidilah,pria,7,20,3045031.0,3.0,9971042000000000.0,9971042000000000.0,zio putra fadilah,ubaidiilah,pria,7,1
1,30,E8111282-FD1E-4307-A5BE-CEF376FF478A,1,9901156000000000.0,9901156000000000.0,fatma wailussy,taib pelu,wanita,52,30,2094128.0,2.0,9901156000000000.0,9901156000000000.0,fatma wailussy,taib pelu,wanita,53,1
2,29,EECAB60C-2FF5-4AA6-B185-0D0DC551B3D0,0,9901145000000000.0,9901145000000000.0,wa nasia,wa nasia,wanita,69,29,15031031.0,1.0,9999.0,9901145000000000.0,wanasia,wanasia,wanita,69,1
3,31,11B5AE1D-29FE-4363-B5A9-E9207FC7DF11,0,9901151000000000.0,9901151000000000.0,ibrahim latuapo,ibrahim latuapo,pria,51,31,2288288.0,1.0,9901151000000000.0,9901151000000000.0,ibrahim latuapo,ibrahim latuapo,pria,51,1
4,21,3E6579E8-2BDE-4A88-B5B1-A8D603C42DE7,7,9908030000000000.0,9908030000000000.0,syahzan maulana,abd samad,pria,2,21,2024026.0,8.0,9908030000000000.0,9908030000000000.0,syahzan maulana,abd samad,pria,3,1


In [4]:
# Distinct records from the *_pes side of the pairs.
df_pes = df.loc[:, [
    'blok_pes', 'id_ruta_pes', 'id_art_pes', 'no_kk_pes', 'nik_pes',
    'nama_pes', 'nama_krt_pes', 'jk_pes', 'umur_pes',
]].drop_duplicates()
df_pes

Unnamed: 0,blok_pes,id_ruta_pes,id_art_pes,no_kk_pes,nik_pes,nama_pes,nama_krt_pes,jk_pes,umur_pes
0,20,C8E4803A-5A5D-4409-8884-D9CBB4BAA26E,2,9.971042e+15,9.971042e+15,zio putra fadilah,ubaidilah,pria,7
1,30,E8111282-FD1E-4307-A5BE-CEF376FF478A,1,9.901156e+15,9.901156e+15,fatma wailussy,taib pelu,wanita,52
2,29,EECAB60C-2FF5-4AA6-B185-0D0DC551B3D0,0,9.901145e+15,9.901145e+15,wa nasia,wa nasia,wanita,69
3,31,11B5AE1D-29FE-4363-B5A9-E9207FC7DF11,0,9.901151e+15,9.901151e+15,ibrahim latuapo,ibrahim latuapo,pria,51
4,21,3E6579E8-2BDE-4A88-B5B1-A8D603C42DE7,7,9.908030e+15,9.908030e+15,syahzan maulana,abd samad,pria,2
...,...,...,...,...,...,...,...,...,...
8723,10,FBD95E3A-29F8-4F9C-960E-349C932F2617,0,9.903201e+15,9.903201e+15,sahban,sahban,pria,52
8724,8,D7F9A282-AB1A-4FE7-AC12-0BB61207BE05,2,9.971186e+15,9.971186e+15,theresia,agus salim,wanita,29
8725,13,1D6634DE-E1FD-4591-8525-4A620E293E0A,3,9.971042e+15,9.971042e+15,muhammad suherman,suratman,pria,1
8726,5,FE7EBA52-8953-4004-950F-AD0A1A4F9C9C,0,9.907093e+15,9.907093e+15,siswono,siswono,pria,39


In [5]:
# Distinct records from the *_sp side of the pairs.
df_sp = df.loc[:, [
    'blok_sp', 'id_ruta_sp', 'id_art_sp', 'no_kk_sp', 'nik_sp',
    'nama_sp', 'nama_krt_sp', 'jk_sp', 'umur_sp',
]].drop_duplicates()
df_sp

Unnamed: 0,blok_sp,id_ruta_sp,id_art_sp,no_kk_sp,nik_sp,nama_sp,nama_krt_sp,jk_sp,umur_sp
0,20,3045031.0,3.0,9.971042e+15,9.971042e+15,zio putra fadilah,ubaidiilah,pria,7
1,30,2094128.0,2.0,9.901156e+15,9.901156e+15,fatma wailussy,taib pelu,wanita,53
2,29,15031031.0,1.0,9.999000e+03,9.901145e+15,wanasia,wanasia,wanita,69
3,31,2288288.0,1.0,9.901151e+15,9.901151e+15,ibrahim latuapo,ibrahim latuapo,pria,51
4,21,2024026.0,8.0,9.908030e+15,9.908030e+15,syahzan maulana,abd samad,pria,3
...,...,...,...,...,...,...,...,...,...
8723,10,1022022.0,1.0,9.903201e+15,9.903201e+15,sahban,sahban,pria,52
8724,8,1058078.0,3.0,9.971186e+15,9.971186e+15,theresia,agus salim,wanita,30
8725,13,12004005.0,4.0,9.971042e+15,9.971042e+15,muhammad suherman,suratman,pria,2
8726,5,1068069.0,1.0,9.907093e+15,9.907093e+15,siswono,siswono,pria,39


In [7]:
import random

def generate_negative_sampling(df,n=100,seed=2022):
    """Build non-matching pairs by randomly pairing *_pes and *_sp records.

    Distinct records are taken from each side of ``df``, ``n`` of them are
    sampled independently per side, and the two samples are placed side by
    side. Any resulting pair that already exists in ``df`` is removed, since
    it cannot serve as a negative example.

    Args:
        df: Pairs frame holding the nine ``*_pes`` and nine ``*_sp`` columns.
        n: Number of records sampled from each side. Must not exceed the
            number of distinct records on either side.
        seed: Integer seed for reproducible sampling, or ``None`` for
            non-deterministic sampling.

    Returns:
        DataFrame with the ``*_pes`` columns, the ``*_sp`` columns and
        ``is_match`` set to 2. It has at most ``n`` rows; fewer when a random
        pairing coincided with a pair present in ``df``.
    """
    pes_cols = ['blok_pes','id_ruta_pes','id_art_pes','no_kk_pes','nik_pes','nama_pes','nama_krt_pes','jk_pes','umur_pes']
    sp_cols = ['blok_sp','id_ruta_sp','id_art_sp','no_kk_sp','nik_sp','nama_sp','nama_krt_sp','jk_sp','umur_sp']
    pair_cols = pes_cols + sp_cols

    df_pes = df[pes_cols].drop_duplicates()
    df_sp = df[sp_cols].drop_duplicates()

    if seed is None:
        seed_pes = seed_sp = None
    else:
        seed_pes = seed
        # seed * 2 equals seed only for 0, which would shuffle both sides
        # identically and hand back the original pairs; use 1 in that case.
        seed_sp = seed * 2 if seed != 0 else 1

    df_pes_p = df_pes.sample(n=n,random_state=seed_pes).reset_index(drop=True)
    df_sp_p = df_sp.sample(n=n,random_state=seed_sp).reset_index(drop=True)
    df_comb = pd.concat([df_pes_p,df_sp_p],axis=1)

    # Flag sampled pairs that are identical to a pair already present in df.
    # known_pairs is de-duplicated, so the left merge keeps one row per
    # sampled pair, in the same order as df_comb.
    known_pairs = df[pair_cols].drop_duplicates()
    flagged = df_comb.merge(known_pairs, how='left', on=pair_cols, indicator=True)
    is_new_pair = (flagged['_merge'] == 'left_only').to_numpy()
    df_comb = df_comb[is_new_pair].reset_index(drop=True)

    df_comb['is_match'] = 2
    return df_comb


In [8]:
# Request one negative pair per positive pair. len(df) replaces the
# hard-coded 8728 so the cell keeps working when the input file changes.
df_neg = generate_negative_sampling(df, n=len(df))
df_neg

Unnamed: 0,blok_pes,id_ruta_pes,id_art_pes,no_kk_pes,nik_pes,nama_pes,nama_krt_pes,jk_pes,umur_pes,blok_sp,id_ruta_sp,id_art_sp,no_kk_sp,nik_sp,nama_sp,nama_krt_sp,jk_sp,umur_sp,is_match
0,2,87D12B1A-0B5D-4D20-87B6-BB79228114CD,5,9.907046e+15,9.907046e+15,resti enika br t,bengkel tarigan,wanita,12,6,1046056.0,3.0,9.907277e+15,9.907277e+15,dwi santika,supriadi,wanita,13,2
1,8,D7F9A282-AB1A-4FE7-AC12-0BB61207BE05,1,9.971185e+15,9.971185e+15,sujarli,agus salim,wanita,50,21,2004004.0,3.0,9.908031e+15,9.908031e+15,muh fauzan raihandi,masrijal,pria,17,2
2,8,F8F429E9-061B-42FD-844D-5DEA3E724691,5,9.971182e+15,9.971182e+15,alvaro marzuki,marzuki,pria,2,28,4013016.0,4.0,1.000000e+16,1.000000e+16,muhammad nur qalbi,zainuddin,pria,1,2
3,20,7AD26CB2-697D-4E2D-8B6D-C752EDB8B588,5,9.971046e+15,9.971046e+15,safira maulidia,samsul arifin,wanita,0,29,15058058.0,2.0,9.904105e+15,9.904105e+15,selvia ardila h,faidin kolowa,wanita,27,2
4,2,20DCEDCB-A3D4-4286-A379-76B56580B289,2,9.907046e+15,9.907046e+15,hannia br gurusinga,dani sah putra g,wanita,2,30,2046065.0,1.0,9.901150e+15,9.901150e+15,ridwan pelu,ridwan pelu,pria,45,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8723,30,C45E192E-5AF5-4012-95FA-98C5AA5867A1,0,9.901156e+15,9.901156e+15,asia tomu,asia tomu,wanita,63,14,4031038.0,1.0,9.971050e+15,9.971050e+15,m nur sodikin,m nur sodikin,pria,43,2
8724,20,F18CFE28-7B17-44F2-95C3-DBF56DCC8D09,3,9.902085e+15,9.902085e+15,jumi,mirah,wanita,17,5,1009009.0,4.0,9.907083e+15,9.907083e+15,sartono,musiran,pria,22,2
8725,13,C22DEC3C-7790-414D-9D06-85A433AFED89,4,9.971041e+15,9.971041e+15,fikri ardiansyah,m suwardi,pria,20,1,1052053.0,1.0,9.907035e+15,9.907035e+15,wagini,wagini,wanita,71,2
8726,24,20777D1D-B002-4154-9B6C-EB95265B8469,3,9.908190e+15,9.908190e+15,m akbar,amirullah,pria,17,24,3052060.0,1.0,9.908191e+15,9.908191e+15,amiruddin,amiruddin,pria,44,2


In [9]:
# Stack the original pairs on top of the generated ones with a fresh 0..N-1 index.
df_all = pd.concat([df, df_neg], ignore_index=True)
df_all

Unnamed: 0,blok_pes,id_ruta_pes,id_art_pes,no_kk_pes,nik_pes,nama_pes,nama_krt_pes,jk_pes,umur_pes,blok_sp,id_ruta_sp,id_art_sp,no_kk_sp,nik_sp,nama_sp,nama_krt_sp,jk_sp,umur_sp,is_match
0,20,C8E4803A-5A5D-4409-8884-D9CBB4BAA26E,2,9.971042e+15,9.971042e+15,zio putra fadilah,ubaidilah,pria,7,20,3045031.0,3.0,9.971042e+15,9.971042e+15,zio putra fadilah,ubaidiilah,pria,7,1
1,30,E8111282-FD1E-4307-A5BE-CEF376FF478A,1,9.901156e+15,9.901156e+15,fatma wailussy,taib pelu,wanita,52,30,2094128.0,2.0,9.901156e+15,9.901156e+15,fatma wailussy,taib pelu,wanita,53,1
2,29,EECAB60C-2FF5-4AA6-B185-0D0DC551B3D0,0,9.901145e+15,9.901145e+15,wa nasia,wa nasia,wanita,69,29,15031031.0,1.0,9.999000e+03,9.901145e+15,wanasia,wanasia,wanita,69,1
3,31,11B5AE1D-29FE-4363-B5A9-E9207FC7DF11,0,9.901151e+15,9.901151e+15,ibrahim latuapo,ibrahim latuapo,pria,51,31,2288288.0,1.0,9.901151e+15,9.901151e+15,ibrahim latuapo,ibrahim latuapo,pria,51,1
4,21,3E6579E8-2BDE-4A88-B5B1-A8D603C42DE7,7,9.908030e+15,9.908030e+15,syahzan maulana,abd samad,pria,2,21,2024026.0,8.0,9.908030e+15,9.908030e+15,syahzan maulana,abd samad,pria,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17451,30,C45E192E-5AF5-4012-95FA-98C5AA5867A1,0,9.901156e+15,9.901156e+15,asia tomu,asia tomu,wanita,63,14,4031038.0,1.0,9.971050e+15,9.971050e+15,m nur sodikin,m nur sodikin,pria,43,2
17452,20,F18CFE28-7B17-44F2-95C3-DBF56DCC8D09,3,9.902085e+15,9.902085e+15,jumi,mirah,wanita,17,5,1009009.0,4.0,9.907083e+15,9.907083e+15,sartono,musiran,pria,22,2
17453,13,C22DEC3C-7790-414D-9D06-85A433AFED89,4,9.971041e+15,9.971041e+15,fikri ardiansyah,m suwardi,pria,20,1,1052053.0,1.0,9.907035e+15,9.907035e+15,wagini,wagini,wanita,71,2
17454,24,20777D1D-B002-4154-9B6C-EB95265B8469,3,9.908190e+15,9.908190e+15,m akbar,amirullah,pria,17,24,3052060.0,1.0,9.908191e+15,9.908191e+15,amiruddin,amiruddin,pria,44,2


In [10]:
# Write df_all to a Feather file; the relative path resolves against the
# notebook's current working directory.
df_all.to_feather('df v0.1 - Pairs_aug.feather')