# Sim Network

In [13]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import combinations

from translate import Translation

In [176]:
class cfg:
    TRAINING_DATA_PATH = './data/train.csv'
    TRAINING_PAIRS_PATH = './data/train_pairs.csv'
    TESTING_DATA_PATH = './data/test.csv'
    PAIRS_LABEL_PATH = './data/pairs_label.csv'
    MODEL_PATH = './checkpoint/checkpoint_10.pt'
    CHOICE_FEATURE = ['name', 'latitude', 'longitude', 'city', 'state', 'zip', 'country', 'categories']

    RAND_SEED = 3
    TEST_SIZE = 0.2
    CHUNK_SIZE = 10000
    LOG_STEPS = 10

    BATCH_SIZE = 64
    LR = 0.0002

In [15]:
pair1_cols = ['name1', 'latitude1', 'longitude1', 'city1', 'state1', 'zip1', 'country1', 'categories1']
pair2_cols = ['name2', 'latitude2', 'longitude2', 'city2', 'state2', 'zip2', 'country2', 'categories2']

## Prepare data

In [5]:
df = pd.read_csv(cfg.TRAINING_DATA_PATH)
df = df[['id'] + cfg.CHOICE_FEATURE + ['point_of_interest']]
# df = df[10000:20000].reset_index()
df.head()

FileNotFoundError: [Errno 2] No such file or directory: './data/train.csv'

In [4]:
df = df.fillna('NaN')

In [11]:
def translate(data_list):
    trans = Translation()
    chunk_len = len(data_list) // 100
    result = []
    print(f'Chunk length: {chunk_len}')
    for i in tqdm(range(100)):
        start = i * chunk_len
        stop = len(data_list) if i == 99 else (i + 1) * chunk_len
        chunk = data_list[start:stop]
        result.extend(trans.translate(chunk))
    return result

In [7]:
print('Translating......')
df['name'] = translate(df['name'].tolist())
print('name done.')
df['city'] = translate(df['city'].tolist())
print('city done.')
df['state'] = translate(df['state'].tolist())
print('state done.')
df['zip'] = translate(df['zip'].tolist())
print('zip done.')
df['country'] = translate(df['country'].tolist())
print('country done.')
df['categories'] = translate(df['categories'].tolist())
print('categories done.')
df

Translating......
Chunk length: 11388


100%|██████████| 100/100 [04:47<00:00,  2.88s/it]


name done.
Chunk length: 11388


100%|██████████| 100/100 [04:25<00:00,  2.65s/it]


city done.
Chunk length: 11388


100%|██████████| 100/100 [04:14<00:00,  2.55s/it]


state done.
Chunk length: 11388


100%|██████████| 100/100 [04:14<00:00,  2.55s/it]


zip done.
Chunk length: 11388


100%|██████████| 100/100 [03:46<00:00,  2.26s/it]


country done.
Chunk length: 11388


100%|██████████| 100/100 [04:19<00:00,  2.59s/it]

categories done.





Unnamed: 0,id,name,latitude,longitude,city,state,zip,country,categories,point_of_interest
0,E_000001272c6c5d,Cafe City Oudenaarde,50.859975,3.634196,Nederename,East Flanders,9700,BE,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,BR,Brazilian Restaurants,P_d82910d8382a83
2,E_000007f24ebc95,karaage barber shop,13.780813,100.484900,,,,TH,Salons / Barbershops,P_b1066599e78477
3,E_000008a8ba4f48,Turkcell,37.844510,27.844202,,,,TR,Mobile Phone Shops,P_b2ed86905a4cd3
4,E_00001d92066153,Cofino House Restaurant,43.338196,-4.326821,cavides,cantabria,,ES,Spanish Restaurants,P_809a884d4407fb
...,...,...,...,...,...,...,...,...,...,...
1138807,E_ffffb80854f713,Blue frog,35.659020,139.700780,Shibuya Ward,Tokyo,150-0043,JP,,P_7ccbeab96cd82e
1138808,E_ffffbf9a83e0ba,Deshon Place,40.872116,-79.945343,Butler,PA,16001,US,Housing Developments,P_db0abc418e7365
1138809,E_ffffc572b4d35b,Izmir Adnan Menderes Airport,38.423733,27.142826,İzmir,,,TR,Airport Services,P_ae96252a6a9380
1138810,E_ffffca745329ed,Yakiniku Japanese house,35.710712,139.775000,Taitō,Tokyo,110-0005,JP,BBQ Joints,P_146662f246d418


In [8]:
pair_df = pd.DataFrame(columns=pair1_cols + pair2_cols)
new_pair = pair_df.copy(deep=True)
pair_df

Unnamed: 0,name1,latitude1,longitude1,city1,state1,zip1,country1,categories1,name2,latitude2,longitude2,city2,state2,zip2,country2,categories2


In [9]:
counter = 0
for poi, poi_df in tqdm(df.groupby('point_of_interest')):
    if len(poi_df) > 2:
        poi_df = poi_df.drop(columns=['point_of_interest'])
        new_pair = new_pair[0:0]
        match_combs = np.array(list(combinations(poi_df['id'], 2)))
        pair1 = pd.concat([poi_df[poi_df['id'] == comb][cfg.CHOICE_FEATURE] for comb in match_combs[:, 0]], ignore_index=True)
        pair2 = pd.concat([poi_df[poi_df['id'] == comb][cfg.CHOICE_FEATURE] for comb in match_combs[:, 1]], ignore_index=True)
        new_pair[pair1_cols] = pair1
        new_pair[pair2_cols] = pair2
        pair_df = pd.concat([pair_df, new_pair], ignore_index=True)
        counter += len(match_combs)
print(f'Total combinations: {counter}')
print(f'Pair dataframe:\n {pair_df}')

100%|██████████| 739972/739972 [1:00:40<00:00, 203.28it/s]


Total combinations: 672781
Pair dataframe:
                                                     name1  latitude1  \
0       Taichung International Airport (RMQ) (Taichung...  24.254227   
1       Taichung International Airport (RMQ) (Taichung...  24.254227   
2       Taichung International Airport (RMQ) (Taichung...  24.254227   
3                         Taichung Airport Int'l Terminal  24.254209   
4                         Taichung Airport Int'l Terminal  24.254209   
...                                                   ...        ...   
672776                    Forest Near Lebedyanskiye Ponds  55.775839   
672777                                   Lebedyansky Pond  55.771589   
672778                              Southern Highway, 30a  56.227364   
672779                              Southern Highway, 30a  56.227364   
672780                                        Yuzhka, 30a  56.227045   

        longitude1            city1    state1   zip1 country1  \
0       120.599732        

## Build Training Data
1. How to build training data with positive / negative data?
    - Total Data points: 1138812
    - Total Data pairs: 672781
    - Unique Data points: 739972
    - For each data point, random sample 1 data point.

In [11]:
print(len(df['point_of_interest'].unique()))

739972


In [13]:
shuffle_df = df.sample(n=len(df), random_state=3, ignore_index=True)
shuffle_pairs = pd.concat([df[cfg.CHOICE_FEATURE], shuffle_df[cfg.CHOICE_FEATURE]], axis=1)
shuffle_pairs.columns = pair1_cols + pair2_cols
print(f'Number of same POI in df & shuffle_df: '
      f'{np.where(shuffle_df.point_of_interest == df.point_of_interest, True, False).sum()}')
print(shuffle_pairs)

Number of same POI in df & shuffle_df: 3
                                name1  latitude1  longitude1         city1  \
0                Cafe City Oudenaarde  50.859975    3.634196    Nederename   
1                      Carioca Manero -22.907225  -43.178244           NaN   
2                 karaage barber shop  13.780813  100.484900           NaN   
3                            Turkcell  37.844510   27.844202           NaN   
4             Cofino House Restaurant  43.338196   -4.326821       cavides   
...                               ...        ...         ...           ...   
1138807                     Blue frog  35.659020  139.700780  Shibuya Ward   
1138808                  Deshon Place  40.872116  -79.945343        Butler   
1138809  Izmir Adnan Menderes Airport  38.423733   27.142826         İzmir   
1138810       Yakiniku Japanese house  35.710712  139.775000         Taitō   
1138811                   Waihi Beach -37.417042  175.950466   Waihi Beach   

                state1

In [14]:
train_df = pd.concat([pair_df, shuffle_pairs], ignore_index=True)
labels = pd.DataFrame(np.concatenate((np.ones(len(pair_df)), -(np.ones(len(df))))), columns=['label'])
train_df.to_csv(cfg.TRAINING_PAIRS_PATH, index=False)
labels.to_csv(cfg.PAIRS_LABEL_PATH, index=False)

In [87]:
labels = pd.DataFrame(np.concatenate((np.ones(672781), (np.zeros(1138812)))), columns=['label'])
labels.to_csv(cfg.PAIRS_LABEL_PATH, index=False)

In [15]:
pair1_df = train_df[pair1_cols]
pair2_df = train_df[pair2_cols]
pair1_df.columns = cfg.CHOICE_FEATURE
pair2_df.columns = cfg.CHOICE_FEATURE
print(f'Pair 1: {pair1_df.head()}')
print(f'Pair 2: {pair2_df.head()}')
print(f'Labels: {labels}\nNumber of 1: {labels[labels == 1].sum()}\nNumber of -1: {-(labels[labels == -1].sum())}')

Pair 1:                                                 name   latitude   longitude  \
0  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
1  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
2  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
3                    Taichung Airport Int'l Terminal  24.254209  120.599844   
4                    Taichung Airport Int'l Terminal  24.254209  120.599844   

       city     state    zip country                categories  
0  Taichung  Taichung  43346      TW  Airports, Military Bases  
1  Taichung  Taichung  43346      TW  Airports, Military Bases  
2  Taichung  Taichung  43346      TW  Airports, Military Bases  
3       NaN       NaN    NaN      TW         Airport Terminals  
4       NaN       NaN    NaN      TW         Airport Terminals  
Pair 2:                               name   latitude   longitude    city state  zip  \
0  Taichung Airport Int'l Terminal  24.

## Processing Embeddings

### Loading raw datas

In [205]:
def loading_raw_data(slice=None):
    train_df = pd.read_csv(cfg.TRAINING_PAIRS_PATH)
    train_df = train_df.fillna('Null')
    labels = pd.read_csv(cfg.PAIRS_LABEL_PATH)

    train_df = train_df[:len(train_df) if slice is None else slice]
    labels = labels[:len(train_df) if slice is None else slice]

    pair1_df = train_df[pair1_cols]
    pair2_df = train_df[pair2_cols]
    pair1_df.columns = cfg.CHOICE_FEATURE
    pair2_df.columns = cfg.CHOICE_FEATURE
    print(f'Pair 1: {pair1_df.head()}')
    print(f'Pair 2: {pair2_df.head()}')
    print(f'Total pair number: {len(train_df)}')
    print(f'Labels: {labels}\nNumber of 1: {labels[labels == 1].sum()}\nNumber of -1: {-(labels[labels == -1].sum())}')
    return pair1_df, pair2_df, labels

pair1_df, pair2_df, labels = loading_raw_data()

  pair1_df, pair2_df, labels = loading_raw_data()


Pair 1:                                                 name   latitude   longitude  \
0  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
1  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
2  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
3                    Taichung Airport Int'l Terminal  24.254209  120.599844   
4                    Taichung Airport Int'l Terminal  24.254209  120.599844   

       city     state    zip country                categories  
0  Taichung  Taichung  43346      TW  Airports, Military Bases  
1  Taichung  Taichung  43346      TW  Airports, Military Bases  
2  Taichung  Taichung  43346      TW  Airports, Military Bases  
3      Null      Null   Null      TW         Airport Terminals  
4      Null      Null   Null      TW         Airport Terminals  
Pair 2:                               name   latitude   longitude    city state   zip  \
0  Taichung Airport Int'l Terminal  24

### Divide embedding into several chunks

In [31]:
def get_device():
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    device = torch.device(device)
    return device

get_device()

device(type='cuda')

In [33]:
from sentence_transformers import SentenceTransformer

sen_model = SentenceTransformer('xlm-roberta-base', device=get_device())

Some weights of the model checkpoint at C:\Users\Gougon/.cache\torch\sentence_transformers\xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [203]:
def encode_features(model, chunk, pair1, pair2):
    pair1['text'] = pair1['name'] + '[SEP]' + pair1['city'] + '[SEP]' + pair1['state'] + '[SEP]' + \
                    pair1['country'] + '[SEP]' + pair1['categories']
    pair2['text'] = pair2['name'] + '[SEP]' + pair2['city'] + '[SEP]' + pair2['state'] + '[SEP]' + \
                    pair2['country'] + '[SEP]' + pair2['categories']
    pair1_sentence = pair1['text'].tolist()
    pair2_sentence = pair2['text'].tolist()
    print(pair1_sentence)
    embedding1 = model.encode(pair1_sentence)
    embedding2 = model.encode(pair2_sentence)
    if chunk != -1:
        pd.DataFrame(embedding1).to_csv(f'./data/embedding/sentence1_{str(chunk)}.csv', index=False)
        pd.DataFrame(embedding2).to_csv(f'./data/embedding/sentence2_{str(chunk)}.csv', index=False)
    return embedding1, embedding2

In [206]:
chunk_len = len(pair1_df) / 10000
for i in tqdm(range(10000)):
    start = int(i * chunk_len)
    stop = len(pair1_df) if i == 9999 else int((i + 1) * chunk_len)
    _, _ = encode_features(sen_model, i, pair1_df[start:stop], pair2_df[start:stop])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pair1['text'] = pair1['name'] + '[SEP]' + pair1['city'] + '[SEP]' + pair1['state'] + '[SEP]' + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pair2['text'] = pair2['name'] + '[SEP]' + pair2['city'] + '[SEP]' + pair2['state'] + '[SEP]' + \


['Taichung International Airport (RMQ) (Taichung International Airport)[SEP]Taichung[SEP]Taichung[SEP]TW[SEP]Airports, Military Bases', 'Taichung International Airport (RMQ) (Taichung International Airport)[SEP]Taichung[SEP]Taichung[SEP]TW[SEP]Airports, Military Bases', 'Taichung International Airport (RMQ) (Taichung International Airport)[SEP]Taichung[SEP]Taichung[SEP]TW[SEP]Airports, Military Bases', "Taichung Airport Int'l Terminal[SEP]Null[SEP]Null[SEP]TW[SEP]Airport Terminals", "Taichung Airport Int'l Terminal[SEP]Null[SEP]Null[SEP]TW[SEP]Airport Terminals", 'Taichung International Airport[SEP]Taipei[SEP]Null[SEP]TW[SEP]Airports', 'history office[SEP]Null[SEP]Null[SEP]RU[SEP]Schools', 'history office[SEP]Null[SEP]Null[SEP]RU[SEP]Schools', 'history office[SEP]Null[SEP]Null[SEP]RU[SEP]Schools', 'history office[SEP]Null[SEP]Null[SEP]RU[SEP]Schools', '👽👽👽[SEP]Null[SEP]Null[SEP]RU[SEP]Schools', '👽👽👽[SEP]Null[SEP]Null[SEP]RU[SEP]Schools', '👽👽👽[SEP]Null[SEP]Null[SEP]RU[SEP]Schools', 'phy

  0%|          | 1/10000 [00:00<2:10:51,  1.27it/s]

['21 Cinema Makassar Town Square[SEP]Null[SEP]Null[SEP]ID[SEP]Null', 'Cinema 21[SEP]Null[SEP]Null[SEP]ID[SEP]Multiplexes', 'Cinema 21[SEP]Null[SEP]Null[SEP]ID[SEP]Multiplexes', 'Cinema 21[SEP]Null[SEP]Null[SEP]ID[SEP]Multiplexes', 'Cinema 21[SEP]Null[SEP]Null[SEP]ID[SEP]Multiplexes', 'Cinema 21[SEP]Null[SEP]Null[SEP]ID[SEP]Multiplexes', 'Cinema 21[SEP]Null[SEP]Null[SEP]ID[SEP]Multiplexes', '21 Makassar Town Square[SEP]Null[SEP]Null[SEP]ID[SEP]Null', '21 Makassar Town Square[SEP]Null[SEP]Null[SEP]ID[SEP]Null', '21 Makassar Town Square[SEP]Null[SEP]Null[SEP]ID[SEP]Null', '21 Makassar Town Square[SEP]Null[SEP]Null[SEP]ID[SEP]Null', '21 Makassar Town Square[SEP]Null[SEP]Null[SEP]ID[SEP]Null', 'Cinema 21 M-Tos[SEP]Macassar[SEP]sulsel[SEP]ID[SEP]Multiplexes', 'Cinema 21 M-Tos[SEP]Macassar[SEP]sulsel[SEP]ID[SEP]Multiplexes', 'Cinema 21 M-Tos[SEP]Macassar[SEP]sulsel[SEP]ID[SEP]Multiplexes', 'Cinema 21 M-Tos[SEP]Macassar[SEP]sulsel[SEP]ID[SEP]Multiplexes', 'M&#39;Tos XXI[SEP]Makassar[SEP]South 

  0%|          | 2/10000 [00:01<2:07:22,  1.31it/s]

['İkizdere state hospital EMERGENCY[SEP]Null[SEP]Null[SEP]TR[SEP]Government Buildings', 'right emine sour district state hospital[SEP]Null[SEP]Null[SEP]TR[SEP]Emergency Rooms', 'Town Hall Zwevegem[SEP]Null[SEP]Null[SEP]BE[SEP]City Halls', 'Town Hall Zwevegem[SEP]Null[SEP]Null[SEP]BE[SEP]City Halls', 'Municipal point[SEP]Zwevegem[SEP]Null[SEP]BE[SEP]City Halls', 'Baggage claim[SEP]Null[SEP]Null[SEP]PE[SEP]Airports, Airport Lounges', 'Baggage claim[SEP]Null[SEP]Null[SEP]PE[SEP]Airports, Airport Lounges', 'baggage claim[SEP]Null[SEP]Null[SEP]PE[SEP]Airport Lounges', 'GVA Baggage Claim Belt 6[SEP]Grand Saconnex[SEP]GE[SEP]CH[SEP]Airports', 'GVA Baggage Claim Belt 6[SEP]Grand Saconnex[SEP]GE[SEP]CH[SEP]Airports', 'GVA Baggage Claim Belt 6[SEP]Grand Saconnex[SEP]GE[SEP]CH[SEP]Airports', 'GVA Baggage Claim Belt 9[SEP]Grand Saconnex[SEP]GE[SEP]CH[SEP]Airport Terminals', 'GVA Baggage Claim Belt 9[SEP]Grand Saconnex[SEP]GE[SEP]CH[SEP]Airport Terminals', 'GVA Baggage Claim[SEP]Grand Saconnex[SEP]

  0%|          | 3/10000 [00:02<2:04:43,  1.34it/s]

['Rumpun&#39;s Pancake[SEP]Null[SEP]Null[SEP]ID[SEP]Null', 'Butia Farm[SEP]Butia[SEP]Rio Grande do Sul[SEP]BR[SEP]Farms', 'Butia Farm[SEP]Butia[SEP]Rio Grande do Sul[SEP]BR[SEP]Farms', 'Butia Farm[SEP]Butia[SEP]Null[SEP]BR[SEP]Farms', 'Trip[SEP]Null[SEP]Null[SEP]TR[SEP]Bus Stops', 'Trip[SEP]Null[SEP]Null[SEP]TR[SEP]Bus Stops', 'parish[SEP]Null[SEP]Antalya[SEP]TR[SEP]Travel Agencies', 'Ice Palace &quot;Park of Legends&quot;[SEP]Moscow[SEP]Moscow[SEP]RU[SEP]Hockey Arenas', 'Ice Palace &quot;Park of Legends&quot;[SEP]Moscow[SEP]Moscow[SEP]RU[SEP]Hockey Arenas', 'Ice Palace &quot;Park of Legends&quot;[SEP]Moscow[SEP]Moscow[SEP]RU[SEP]Hockey Arenas', 'box 431[SEP]Null[SEP]Null[SEP]RU[SEP]Null', 'box 431[SEP]Null[SEP]Null[SEP]RU[SEP]Null', 'CSKA ARENA[SEP]Moscow[SEP]Moscow[SEP]RU[SEP]Hockey Rinks', 'Jcpenney Optical[SEP]Tampa[SEP]FL[SEP]US[SEP]Department Stores', 'Jcpenney Optical[SEP]Tampa[SEP]FL[SEP]US[SEP]Department Stores', 'JCPenney[SEP]Tampa[SEP]FL[SEP]US[SEP]Department Stores, Clothin

  0%|          | 3/10000 [00:03<2:59:23,  1.08s/it]


KeyboardInterrupt: 

### Save embedding chunks into single dataframe

In [33]:
merge_emb1 = pd.DataFrame(columns=[str(i) for i in range(768)])
merge_emb1.to_csv('./data/emb1.csv', index=False)
merge_emb2 = pd.DataFrame(columns=[str(i) for i in range(768)])
merge_emb2.to_csv('./data/emb2.csv', index=False)

for i in tqdm(range(10000)):
    emb1 = pd.read_csv(f'./data/embedding/sentence1_{i}.csv')
    emb2 = pd.read_csv(f'./data/embedding/sentence2_{i}.csv')
    emb1.to_csv('./data/emb1.csv', mode='a', index=False, header=False)
    emb2.to_csv('./data/emb2.csv', mode='a', index=False, header=False)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [49:38<00:00,  3.36it/s]


In [166]:
def save_batch_embedding(emb_df, pair_id):
    for i, chunk in tqdm(enumerate(emb_df)):
        chunk.to_csv(f'./data/batch_embedding/emb{str(pair_id)}_{str(i)}.csv', index=False)

emb1 = pd.read_csv('./data/emb1.csv', chunksize=cfg.BATCH_SIZE)
save_batch_embedding(emb1, 1)
emb2 = pd.read_csv('./data/emb2.csv', chunksize=cfg.BATCH_SIZE)
save_batch_embedding(emb2, 2)

28307it [32:23, 14.57it/s]
28307it [32:10, 14.66it/s]


## Training

### Import Deep learning packages

In [38]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torch.utils.tensorboard import SummaryWriter
from sklearn.preprocessing import OneHotEncoder

### Initialize utils

In [6]:
def random_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

random_seed(cfg.RAND_SEED)

In [202]:
def weights_init(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

### Data utils

In [8]:
from pynvml import *

torch.cuda.empty_cache()
nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(h)
print(f'total    : {info.total}')
print(f'free     : {info.free}')
print(f'used     : {info.used}')

total    : 8589934592
free     : 8091746304
used     : 498188288


In [9]:
def make_train_test_indices(dataset):
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(cfg.TEST_SIZE * dataset_size))
    np.random.shuffle(indices)
    return indices[split:], indices[:split]

In [10]:
def make_dataloader(dataset, indices, batch_size):
    sampler = SubsetRandomSampler(indices)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler, num_workers=8, drop_last=True)

### Define Matching dataset

In [167]:
class MatchingDataset(Dataset):
    def __init__(self, pair1, pair2, labels, chunksize=64):
        self.chunksize = chunksize
        self.colnames = [str(i) for i in range(768)]
        self.latitude1 = pair1['latitude'].to_numpy()
        self.longitude1 = pair1['longitude'].to_numpy()
        self.latitude2 = pair2['latitude'].to_numpy()
        self.longitude2 = pair2['longitude'].to_numpy()
        onehot_encoder = OneHotEncoder()
        onehot_encoder.fit([[0], [1]])
        labels = np.expand_dims(np.array(labels['label'].tolist()), axis=1)
        self.labels = onehot_encoder.transform(labels).toarray()

    def __getitem__(self, idx):
        start = idx * self.chunksize
        stop = (idx + 1) * self.chunksize

        emb1 = pd.read_csv(f'./data/batch_embedding/emb1_{str(idx)}.csv')
        emb2 = pd.read_csv(f'./data/batch_embedding/emb2_{str(idx)}.csv')

        return {
            'pair1': {
                'lat': torch.FloatTensor(np.expand_dims(self.latitude1[start:stop], axis=1)),
                'lng': torch.FloatTensor(np.expand_dims(self.longitude1[start:stop], axis=1)),
                'emb': torch.from_numpy(emb1.to_numpy()).to(torch.float32)
            },
            'pair2': {
                'lat': torch.FloatTensor(np.expand_dims(self.latitude2[start:stop], axis=1)),
                'lng': torch.FloatTensor(np.expand_dims(self.longitude2[start:stop], axis=1)),
                'emb': torch.from_numpy(emb2.to_numpy()).to(torch.float32)

            },
            'label': torch.from_numpy(self.labels[start:stop])
        }

    def __len__(self):
        return len(self.labels) // self.chunksize

### Build Network

In [85]:
class SimNetwork(nn.Module):
    def __init__(self):
        super(SimNetwork, self).__init__()

        self.s_encoder = nn.Sequential(
            nn.Linear(768, 512),
            nn.BatchNorm1d(512),
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU()
        )
        self.c_encoder = nn.Sequential(
            nn.Linear(2, 128),
            nn.BatchNorm1d(128),
            nn.Linear(128, 512),
            nn.BatchNorm1d(512),
            nn.ReLU()
        )
        self.sim_dis = nn.Sequential(
            nn.Linear(2048, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128, 2),
        )

    def forward(self, lat1, lng1, emb1, lat2, lng2, emb2):
        s1 = self.s_encoder(emb1)
        c1 = self.c_encoder(torch.cat((lat1, lng1), 1))
        s2 = self.s_encoder(emb2)
        c2 = self.c_encoder(torch.cat((lat2, lng2), 1))
        feat1 = torch.cat((s1, c1), 1)
        feat2 = torch.cat((s2, c2), 1)
        sim = self.sim_dis(torch.cat((feat1, feat2), 1))

        return sim

### Loading raw data

In [53]:
pair1_df, pair2_df, labels = loading_raw_data()

  pair1_df, pair2_df, labels = loading_raw_data()


Pair 1:                                                 name   latitude   longitude  \
0  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
1  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
2  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
3                    Taichung Airport Int'l Terminal  24.254209  120.599844   
4                    Taichung Airport Int'l Terminal  24.254209  120.599844   

       city     state    zip country                categories  
0  Taichung  Taichung  43346      TW  Airports, Military Bases  
1  Taichung  Taichung  43346      TW  Airports, Military Bases  
2  Taichung  Taichung  43346      TW  Airports, Military Bases  
3      Null      Null   Null      TW         Airport Terminals  
4      Null      Null   Null      TW         Airport Terminals  
Pair 2:                               name   latitude   longitude    city state   zip  \
0  Taichung Airport Int'l Terminal  24

### Start build train / test dataloader

In [168]:
matching_dataset = MatchingDataset(pair1_df, pair2_df, labels, chunksize=cfg.BATCH_SIZE)
len(matching_dataset)

28306

In [196]:
for i in matching_dataset:
    print(i['pair1']['lat'].shape)
    print(i['pair1']['lng'].shape)
    print(i['pair1']['emb'].shape)
    print(i['label'].shape)
    break

torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 768])
torch.Size([64, 2])


In [197]:
def train(model, indices, dataset, n_iter, optimizer, criterion, writer):
    model.train()
    for idx in tqdm(indices):
        batch = dataset[idx]
        lat1 = batch['pair1']['lat'].to(get_device())
        lng1 = batch['pair1']['lng'].to(get_device())
        emb1 = batch['pair1']['emb'].to(get_device())
        lat2 = batch['pair2']['lat'].to(get_device())
        lng2 = batch['pair2']['lng'].to(get_device())
        emb2 = batch['pair2']['emb'].to(get_device())
        label = batch['label'].to(get_device())

        optimizer.zero_grad()

        outputs = model(lat1, lng1, emb1, lat2, lng2, emb2)
        loss = criterion(outputs, label)
        loss.backward()

        optimizer.step()

        if n_iter % cfg.LOG_STEPS == 0:
            writer.add_scalar('train loss', loss, global_step=n_iter)

        n_iter += 1
    return n_iter

In [198]:
def test(model, indices, dataset, n_iter, criterion, writer):
    model.eval()
    total, corr_counter = 0, 0
    for idx in tqdm(indices):
        batch = dataset[idx]
        lat1 = batch['pair1']['lat'].to(get_device())
        lng1 = batch['pair1']['lng'].to(get_device())
        emb1 = batch['pair1']['emb'].to(get_device())
        lat2 = batch['pair2']['lat'].to(get_device())
        lng2 = batch['pair2']['lng'].to(get_device())
        emb2 = batch['pair2']['emb'].to(get_device())
        label = batch['label'].to(get_device())

        outputs = model(lat1, lng1, emb1, lat2, lng2, emb2)
        loss = criterion(outputs, label)

        total += label.size(0)
        pred = torch.sigmoid(outputs).data > 0.5
        corr_counter += (pred == label).sum().item()

        if n_iter % cfg.LOG_STEPS == 0:
            writer.add_scalar('test loss', loss, global_step=n_iter)

        n_iter += 1
    writer.add_scalar('accuracy', corr_counter / total, global_step=epoch_counter)
    return n_iter

In [80]:
model = SimNetwork().to(get_device())
model.apply(weights_init)
model

NameError: name 'weights_init' is not defined

In [200]:
optimizer = optim.Adam(model.parameters(), lr=cfg.LR, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.BCEWithLogitsLoss()

In [201]:
writer = SummaryWriter()
train_iter, test_iter = 0, 0
epoch = 10

train_indices, test_indices = make_train_test_indices(matching_dataset)
for epoch_counter in range(epoch):
    print(f'Epoch {epoch_counter}')
    train_iter = train(model, train_indices, matching_dataset, train_iter, optimizer, criterion, writer)
    test_iter = test(model, test_indices, matching_dataset, test_iter, criterion, writer)

    torch.save({
        'epoch': epoch_counter,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }, f'./checkpoint/checkpoint_{epoch_counter}.pt')

Epoch 0


100%|██████████| 22645/22645 [28:01<00:00, 13.46it/s]
100%|██████████| 5661/5661 [06:33<00:00, 14.38it/s]


Epoch 1


100%|██████████| 22645/22645 [28:21<00:00, 13.31it/s]
100%|██████████| 5661/5661 [06:27<00:00, 14.61it/s]


Epoch 2


100%|██████████| 22645/22645 [28:02<00:00, 13.46it/s]
100%|██████████| 5661/5661 [06:21<00:00, 14.83it/s]


Epoch 3


100%|██████████| 22645/22645 [27:59<00:00, 13.49it/s]
100%|██████████| 5661/5661 [06:24<00:00, 14.73it/s]


Epoch 4


100%|██████████| 22645/22645 [27:54<00:00, 13.52it/s]
100%|██████████| 5661/5661 [06:24<00:00, 14.72it/s]


Epoch 5


100%|██████████| 22645/22645 [27:48<00:00, 13.57it/s]
100%|██████████| 5661/5661 [06:21<00:00, 14.85it/s]


Epoch 6


100%|██████████| 22645/22645 [27:43<00:00, 13.61it/s]
100%|██████████| 5661/5661 [06:23<00:00, 14.78it/s]


Epoch 7


100%|██████████| 22645/22645 [27:45<00:00, 13.60it/s]
100%|██████████| 5661/5661 [06:23<00:00, 14.75it/s]


Epoch 8


100%|██████████| 22645/22645 [27:40<00:00, 13.64it/s]
100%|██████████| 5661/5661 [06:20<00:00, 14.89it/s]


Epoch 9


100%|██████████| 22645/22645 [27:43<00:00, 13.61it/s]
100%|██████████| 5661/5661 [06:22<00:00, 14.80it/s]


Epoch 10


100%|██████████| 22645/22645 [27:41<00:00, 13.63it/s]
100%|██████████| 5661/5661 [06:21<00:00, 14.83it/s]


Epoch 11


 17%|█▋        | 3794/22645 [04:45<23:36, 13.31it/s]


KeyboardInterrupt: 

### Import packages

In [187]:
import pandas as pd
import numpy as np
import torch

### Prepare testing data

In [188]:
test_df = pd.read_csv(cfg.TESTING_DATA_PATH)
test_df = test_df[['id'] + cfg.CHOICE_FEATURE]
test_df.head()

Unnamed: 0,id,name,latitude,longitude,city,state,zip,country,categories
0,E_00001118ad0191,Jamu Petani Bagan Serai,5.012169,100.535805,,,,MY,Cafés
1,E_000020eb6fed40,Johnny's Bar,40.434209,-80.56416,Weirton,WV,26062.0,US,Bars
2,E_00002f98667edf,QIWI,47.215134,39.686088,Ростов-на-Дону,,,RU,ATMs
3,E_001b6bad66eb98,"Gelora Sriwijaya, Jaka Baring Sport City",-3.014675,104.794374,,,,ID,Stadiums
4,E_0283d9f61e569d,Stadion Gelora Sriwijaya,-3.021727,104.788628,Palembang,South Sumatra,11480.0,ID,Soccer Stadiums


In [189]:
test_df = test_df.fillna('NaN')

In [190]:
# print('Translating......')
# test_df['name'] = translate(test_df['name'].tolist())
# print('name done.')
# test_df['city'] = translate(test_df['city'].tolist())
# print('city done.')
# test_df['state'] = translate(test_df['state'].tolist())
# print('state done.')
# test_df['country'] = translate(test_df['country'].tolist())
# print('country done.')
# test_df['categories'] = translate(test_df['categories'].tolist())
# print('categories done.')
# test_df

In [191]:
pair_df = pd.DataFrame(columns=pair1_cols + pair2_cols)
pair_df

Unnamed: 0,name1,latitude1,longitude1,city1,state1,zip1,country1,categories1,name2,latitude2,longitude2,city2,state2,zip2,country2,categories2


In [192]:
match_combs = np.array(list(combinations(test_df['id'], 2)))
pair1_df = pd.concat([test_df[test_df['id'] == comb][cfg.CHOICE_FEATURE] for comb in match_combs[:, 0]], ignore_index=True)
pair2_df = pd.concat([test_df[test_df['id'] == comb][cfg.CHOICE_FEATURE] for comb in match_combs[:, 1]], ignore_index=True)
print(f'Total combinations: {len(match_combs)}')
print(f'Pair1 dataframe:\n {pair1_df}')
print(f'Pair2 dataframe:\n {pair2_df}')

Total combinations: 10
Pair1 dataframe:
                                        name   latitude   longitude  \
0                   Jamu Petani Bagan Serai   5.012169  100.535805   
1                   Jamu Petani Bagan Serai   5.012169  100.535805   
2                   Jamu Petani Bagan Serai   5.012169  100.535805   
3                   Jamu Petani Bagan Serai   5.012169  100.535805   
4                              Johnny's Bar  40.434209  -80.564160   
5                              Johnny's Bar  40.434209  -80.564160   
6                              Johnny's Bar  40.434209  -80.564160   
7                                      QIWI  47.215134   39.686088   
8                                      QIWI  47.215134   39.686088   
9  Gelora Sriwijaya, Jaka Baring Sport City  -3.014675  104.794374   

             city state      zip country categories  
0             NaN   NaN      NaN      MY      Cafés  
1             NaN   NaN      NaN      MY      Cafés  
2             NaN   NaN   

In [193]:
emb1, emb2 = encode_features(sen_model, -1, pair1_df, pair2_df)
print(f'Embedding shape: {emb1.shape}')
print(f'Embedding 1:\n {emb1}')
print(f'Embedding 2:\n {emb2}')

Embedding shape: (10, 768)
Embedding 1:
 [[ 0.02706869  0.01938814  0.03482418 ...  0.01315501  0.02070352
   0.07002646]
 [ 0.02706869  0.01938814  0.03482418 ...  0.01315501  0.02070352
   0.07002646]
 [ 0.02706869  0.01938814  0.03482418 ...  0.01315501  0.02070352
   0.07002646]
 ...
 [ 0.04918833  0.02532699  0.01854805 ...  0.05378939  0.01470093
   0.16870056]
 [ 0.04918833  0.02532699  0.01854805 ...  0.05378939  0.01470093
   0.16870056]
 [ 0.01579969  0.01645643  0.01929748 ...  0.06166978 -0.0021403
   0.07021173]]
Embedding 2:
 [[ 0.02971117  0.02457738  0.02151311 ...  0.02526181 -0.01584446
   0.157363  ]
 [ 0.04918833  0.02532699  0.01854805 ...  0.05378939  0.01470093
   0.16870056]
 [ 0.01579969  0.01645643  0.01929748 ...  0.06166978 -0.0021403
   0.07021173]
 ...
 [ 0.01579969  0.01645643  0.01929748 ...  0.06166978 -0.0021403
   0.07021173]
 [ 0.03879216  0.01431718  0.02293392 ...  0.03597043 -0.03500376
   0.13208784]
 [ 0.03879216  0.01431718  0.02293392 ...  0.0

In [194]:
class TestingDataset(Dataset):
    def __init__(self, pair1, pair2, emb1, emb2):
        colnames = [str(i) for i in range(768)]
        self.latitude1 = pair1['latitude'].to_numpy()
        self.longitude1 = pair1['longitude'].to_numpy()
        self.latitude2 = pair2['latitude'].to_numpy()
        self.longitude2 = pair2['longitude'].to_numpy()
        self.emb1 = pd.DataFrame(emb1, columns=colnames)
        self.emb2 = pd.DataFrame(emb2, columns=colnames)

    def __getitem__(self, idx):
        return {
            'pair1': {
                'lat': torch.FloatTensor([[self.latitude1[idx]]]),
                'lng': torch.FloatTensor([[self.longitude1[idx]]]),
                'emb': torch.from_numpy(np.expand_dims(self.emb1.iloc[idx].to_numpy(), axis=0)).to(torch.float32)
            },
            'pair2': {
                'lat': torch.FloatTensor([[self.latitude2[idx]]]),
                'lng': torch.FloatTensor([[self.longitude2[idx]]]),
                'emb': torch.from_numpy(np.expand_dims(self.emb2.iloc[idx].to_numpy(), axis=0)).to(torch.float32)

            },
        }

    def __len__(self):
        return len(self.latitude1)

In [195]:
test_dataset = TestingDataset(pair1_df, pair2_df, emb1, emb2)
test_dataset[0]

{'pair1': {'lat': tensor([[5.0122]]),
  'lng': tensor([[100.5358]]),
  'emb': tensor([[ 2.7069e-02,  1.9388e-02,  3.4824e-02,  2.7274e-02,  3.6263e-02,
           -8.4528e-02, -3.5059e-03,  8.8995e-04, -3.6141e-02, -5.9687e-02,
            6.4635e-02,  6.0512e-02, -1.4402e-02,  2.2286e-02, -2.3299e-03,
           -7.9442e-02, -1.4890e-02,  2.4046e-04,  9.5566e-03,  2.9911e-02,
            3.8031e-02,  6.8702e-02,  2.2757e-01,  5.5782e-02, -5.1405e-02,
            3.1674e-02,  1.3058e-02,  3.2036e-02,  1.5951e-01,  8.6588e-02,
            3.4602e-02, -3.1425e-02,  1.0073e-01,  1.1952e-01,  4.8170e-02,
           -4.6384e-02, -2.0265e-02, -4.5911e-02,  1.3636e-02,  8.5152e-02,
            7.0258e-03,  2.2398e-02, -2.5222e-02,  1.5801e-02, -3.2597e-02,
           -7.5306e-03,  8.0205e-02,  3.7801e-02, -3.3195e-02,  1.0834e-02,
            6.8156e-02, -4.9644e-02,  5.1548e-02,  7.3441e-02, -4.2672e-02,
           -2.3501e-02,  1.2718e-02, -3.9010e-02, -3.4746e-02,  7.3993e-02,
            

### Start testing

In [196]:
state_dict = torch.load(cfg.MODEL_PATH, map_location='cpu')['state_dict']
print(state_dict)
model = SimNetwork()
model.load_state_dict(state_dict)
model.to(get_device())
model.eval()
model

OrderedDict([('s_encoder.0.weight', tensor([[ 0.0328, -0.0099, -0.1720,  ...,  0.0651, -0.0378,  0.0563],
        [ 0.2917, -0.0743,  0.2639,  ..., -0.0197,  0.1313, -0.0630],
        [ 0.0244, -0.1564,  0.1240,  ..., -0.1987, -0.0591, -0.0115],
        ...,
        [ 0.0884, -0.1067,  0.1797,  ...,  0.0154, -0.1908, -0.0361],
        [ 0.1634, -0.0113, -0.0691,  ..., -0.0389, -0.1535,  0.0757],
        [-0.0892,  0.0563, -0.1600,  ...,  0.0568, -0.0087,  0.0587]])), ('s_encoder.0.bias', tensor([-6.5633e-02, -6.5481e-02,  1.4343e-02, -4.4179e-02,  7.6229e-02,
         6.5635e-02,  1.8427e-02,  8.8780e-02, -5.6348e-02, -6.6311e-04,
        -5.6731e-02,  1.0968e-01,  2.0355e-02, -2.6985e-02, -2.4708e-02,
         1.9946e-02,  6.2098e-02,  4.5157e-02,  1.8844e-02,  2.9704e-02,
        -9.1812e-02, -1.0716e-01, -5.5875e-02,  1.2198e-01, -1.0501e-03,
        -7.8093e-02, -4.7570e-02, -5.5400e-02, -3.0016e-02,  1.9381e-02,
         7.9490e-02, -9.6159e-02, -1.2234e-02,  1.1252e-01,  4.9121e-

SimNetwork(
  (s_encoder): Sequential(
    (0): Linear(in_features=768, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): ReLU()
  )
  (c_encoder): Sequential(
    (0): Linear(in_features=2, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=128, out_features=512, bias=True)
    (3): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): ReLU()
  )
  (sim_dis): Sequential(
    (0): Linear(in_features=2048, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [197]:
match_df = pd.DataFrame(columns=['id', 'matches'])
match_df['id'] = test_df['id']
match_df['matches'] = test_df['id']
for i, sample in enumerate(test_dataset):
    lat1 = sample['pair1']['lat'].to(get_device())
    lng1 = sample['pair1']['lng'].to(get_device())
    emb1 = sample['pair1']['emb'].to(get_device())
    lat2 = sample['pair2']['lat'].to(get_device())
    lng2 = sample['pair2']['lng'].to(get_device())
    emb2 = sample['pair2']['emb'].to(get_device())

    outputs = model(lat1, lng1, emb1, lat2, lng2, emb2)
    pred = torch.sigmoid(outputs).data > 0.5
    if pred[0][1].item():
        pair1_id = match_combs[i][0]
        pair2_id = match_combs[i][1]
        match_df.loc[match_df['id'] == pair1_id, 'matches'] = \
            match_df.loc[match_df['id'] == pair1_id, 'matches'].astype(str) + ' ' + pair2_id
        match_df.loc[match_df['id'] == pair2_id, 'matches'] = \
            match_df.loc[match_df['id'] == pair2_id, 'matches'].astype(str) + ' ' + pair1_id
match_df.to_csv('submission.csv', index=False)