# Sim Network

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import combinations

from translate import Translation

In [3]:
class cfg:
    TRAINING_DATA_PATH = './data/train.csv'
    TRAINING_PAIRS_PATH = './data/train_pairs.csv'
    PAIRS_LABEL_PATH = './data/pairs_label.csv'
    CHOICE_FEATURE = ['name', 'latitude', 'longitude', 'city', 'state', 'zip', 'country', 'categories']

    RAND_SEED = 3
    TEST_SIZE = 0.2

    BATCH_SIZE = 64

## Prepare data

In [3]:
df = pd.read_csv(cfg.TRAINING_DATA_PATH)
df = df[['id'] + cfg.CHOICE_FEATURE + ['point_of_interest']]
# df = df[10000:20000].reset_index()
df.head()

Unnamed: 0,id,name,latitude,longitude,city,state,zip,country,categories,point_of_interest
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Nederename,Oost-Vlaanderen,9700.0,BE,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,BR,Brazilian Restaurants,P_d82910d8382a83
2,E_000007f24ebc95,ร้านตัดผมการาเกด,13.780813,100.4849,,,,TH,Salons / Barbershops,P_b1066599e78477
3,E_000008a8ba4f48,Turkcell,37.84451,27.844202,,,,TR,Mobile Phone Shops,P_b2ed86905a4cd3
4,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,Caviedes,Cantabria,,ES,Spanish Restaurants,P_809a884d4407fb


In [4]:
df = df.fillna('NaN')

In [6]:
def translate(data_list):
    trans = Translation()
    chunk_len = len(data_list) // 100
    result = []
    print(f'Chunk length: {chunk_len}')
    for i in tqdm(range(100)):
        start = i * chunk_len
        stop = len(data_list) if i == 99 else (i + 1) * chunk_len
        chunk = data_list[start:stop]
        result.extend(trans.translate(chunk))
    return result

In [7]:
print('Translating......')
df['name'] = translate(df['name'].tolist())
print('name done.')
df['city'] = translate(df['city'].tolist())
print('city done.')
df['state'] = translate(df['state'].tolist())
print('state done.')
df['zip'] = translate(df['zip'].tolist())
print('zip done.')
df['country'] = translate(df['country'].tolist())
print('country done.')
df['categories'] = translate(df['categories'].tolist())
print('categories done.')
df

Translating......
Chunk length: 11388


100%|██████████| 100/100 [04:47<00:00,  2.88s/it]


name done.
Chunk length: 11388


100%|██████████| 100/100 [04:25<00:00,  2.65s/it]


city done.
Chunk length: 11388


100%|██████████| 100/100 [04:14<00:00,  2.55s/it]


state done.
Chunk length: 11388


100%|██████████| 100/100 [04:14<00:00,  2.55s/it]


zip done.
Chunk length: 11388


100%|██████████| 100/100 [03:46<00:00,  2.26s/it]


country done.
Chunk length: 11388


100%|██████████| 100/100 [04:19<00:00,  2.59s/it]

categories done.





Unnamed: 0,id,name,latitude,longitude,city,state,zip,country,categories,point_of_interest
0,E_000001272c6c5d,Cafe City Oudenaarde,50.859975,3.634196,Nederename,East Flanders,9700,BE,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,BR,Brazilian Restaurants,P_d82910d8382a83
2,E_000007f24ebc95,karaage barber shop,13.780813,100.484900,,,,TH,Salons / Barbershops,P_b1066599e78477
3,E_000008a8ba4f48,Turkcell,37.844510,27.844202,,,,TR,Mobile Phone Shops,P_b2ed86905a4cd3
4,E_00001d92066153,Cofino House Restaurant,43.338196,-4.326821,cavides,cantabria,,ES,Spanish Restaurants,P_809a884d4407fb
...,...,...,...,...,...,...,...,...,...,...
1138807,E_ffffb80854f713,Blue frog,35.659020,139.700780,Shibuya Ward,Tokyo,150-0043,JP,,P_7ccbeab96cd82e
1138808,E_ffffbf9a83e0ba,Deshon Place,40.872116,-79.945343,Butler,PA,16001,US,Housing Developments,P_db0abc418e7365
1138809,E_ffffc572b4d35b,Izmir Adnan Menderes Airport,38.423733,27.142826,İzmir,,,TR,Airport Services,P_ae96252a6a9380
1138810,E_ffffca745329ed,Yakiniku Japanese house,35.710712,139.775000,Taitō,Tokyo,110-0005,JP,BBQ Joints,P_146662f246d418


In [8]:
pair1_cols = [feature + '1' for feature in cfg.CHOICE_FEATURE]
pair2_cols = [feature + '2' for feature in cfg.CHOICE_FEATURE]
pair_df = pd.DataFrame(columns=pair1_cols + pair2_cols)
new_pair = pair_df.copy(deep=True)
pair_df

Unnamed: 0,name1,latitude1,longitude1,city1,state1,zip1,country1,categories1,name2,latitude2,longitude2,city2,state2,zip2,country2,categories2


In [9]:
counter = 0
for poi, poi_df in tqdm(df.groupby('point_of_interest')):
    if len(poi_df) > 2:
        poi_df = poi_df.drop(columns=['point_of_interest'])
        new_pair = new_pair[0:0]
        match_combs = np.array(list(combinations(poi_df['id'], 2)))
        pair1 = pd.concat([poi_df[poi_df['id'] == comb][cfg.CHOICE_FEATURE] for comb in match_combs[:, 0]], ignore_index=True)
        pair2 = pd.concat([poi_df[poi_df['id'] == comb][cfg.CHOICE_FEATURE] for comb in match_combs[:, 1]], ignore_index=True)
        new_pair[pair1_cols] = pair1
        new_pair[pair2_cols] = pair2
        pair_df = pd.concat([pair_df, new_pair], ignore_index=True)
        counter += len(match_combs)
print(f'Total combinations: {counter}')
print(f'Pair dataframe:\n {pair_df}')

100%|██████████| 739972/739972 [1:00:40<00:00, 203.28it/s]


Total combinations: 672781
Pair dataframe:
                                                     name1  latitude1  \
0       Taichung International Airport (RMQ) (Taichung...  24.254227   
1       Taichung International Airport (RMQ) (Taichung...  24.254227   
2       Taichung International Airport (RMQ) (Taichung...  24.254227   
3                         Taichung Airport Int'l Terminal  24.254209   
4                         Taichung Airport Int'l Terminal  24.254209   
...                                                   ...        ...   
672776                    Forest Near Lebedyanskiye Ponds  55.775839   
672777                                   Lebedyansky Pond  55.771589   
672778                              Southern Highway, 30a  56.227364   
672779                              Southern Highway, 30a  56.227364   
672780                                        Yuzhka, 30a  56.227045   

        longitude1            city1    state1   zip1 country1  \
0       120.599732        

## Build Training Data
1. How to build training data with positive / negative data?
    - Total Data points: 1138812
    - Total Data pairs: 672781
    - Unique Data points: 739972
    - For each data point, random sample 1 data point.

In [11]:
print(len(df['point_of_interest'].unique()))

739972


In [4]:
pair1_cols = ['name1', 'latitude1', 'longitude1', 'city1', 'state1', 'zip1', 'country1', 'categories1']
pair2_cols = ['name2', 'latitude2', 'longitude2', 'city2', 'state2', 'zip2', 'country2', 'categories2']

In [13]:
shuffle_df = df.sample(n=len(df), random_state=3, ignore_index=True)
shuffle_pairs = pd.concat([df[cfg.CHOICE_FEATURE], shuffle_df[cfg.CHOICE_FEATURE]], axis=1)
shuffle_pairs.columns = pair1_cols + pair2_cols
print(f'Number of same POI in df & shuffle_df: '
      f'{np.where(shuffle_df.point_of_interest == df.point_of_interest, True, False).sum()}')
print(shuffle_pairs)

Number of same POI in df & shuffle_df: 3
                                name1  latitude1  longitude1         city1  \
0                Cafe City Oudenaarde  50.859975    3.634196    Nederename   
1                      Carioca Manero -22.907225  -43.178244           NaN   
2                 karaage barber shop  13.780813  100.484900           NaN   
3                            Turkcell  37.844510   27.844202           NaN   
4             Cofino House Restaurant  43.338196   -4.326821       cavides   
...                               ...        ...         ...           ...   
1138807                     Blue frog  35.659020  139.700780  Shibuya Ward   
1138808                  Deshon Place  40.872116  -79.945343        Butler   
1138809  Izmir Adnan Menderes Airport  38.423733   27.142826         İzmir   
1138810       Yakiniku Japanese house  35.710712  139.775000         Taitō   
1138811                   Waihi Beach -37.417042  175.950466   Waihi Beach   

                state1

In [14]:
train_df = pd.concat([pair_df, shuffle_pairs], ignore_index=True)
labels = pd.DataFrame(np.concatenate((np.ones(len(pair_df)), -(np.ones(len(df))))), columns=['label'])
train_df.to_csv(cfg.TRAINING_PAIRS_PATH, index=False)
labels.to_csv(cfg.PAIRS_LABEL_PATH, index=False)

In [15]:
pair1_df = train_df[pair1_cols]
pair2_df = train_df[pair2_cols]
pair1_df.columns = cfg.CHOICE_FEATURE
pair2_df.columns = cfg.CHOICE_FEATURE
print(f'Pair 1: {pair1_df.head()}')
print(f'Pair 2: {pair2_df.head()}')
print(f'Labels: {labels}\nNumber of 1: {labels[labels == 1].sum()}\nNumber of -1: {-(labels[labels == -1].sum())}')

Pair 1:                                                 name   latitude   longitude  \
0  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
1  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
2  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
3                    Taichung Airport Int'l Terminal  24.254209  120.599844   
4                    Taichung Airport Int'l Terminal  24.254209  120.599844   

       city     state    zip country                categories  
0  Taichung  Taichung  43346      TW  Airports, Military Bases  
1  Taichung  Taichung  43346      TW  Airports, Military Bases  
2  Taichung  Taichung  43346      TW  Airports, Military Bases  
3       NaN       NaN    NaN      TW         Airport Terminals  
4       NaN       NaN    NaN      TW         Airport Terminals  
Pair 2:                               name   latitude   longitude    city state  zip  \
0  Taichung Airport Int'l Terminal  24.

## Build DataLoader

### Import Deep learning packages

In [5]:
import os
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel

### Initialize utils

In [6]:
def random_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

random_seed(cfg.RAND_SEED)

### Data utils

In [7]:
def make_train_test_indices(dataset):
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(cfg.TEST_SIZE * dataset_size))
    np.random.shuffle(indices)
    return indices[split:], indices[:split]

In [8]:
def make_dataloader(dataset, indices, batch_size):
    sampler = SubsetRandomSampler(indices)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler, num_workers=8, drop_last=True)

### Define Matching dataset

In [56]:
class MatchingDataset(Dataset):
    def __init__(self, pair1, pair2, labels):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.model = BertModel.from_pretrained('bert-base-cased')
        self.pair1 = pair1
        self.pair2 = pair2
        self.labels = labels
        self.features = pair1.columns

    def _encode_features(self, features):
        pair1_input = [self.pair1[feat].tolist() for feat in features]
        pair1_encode = [self._encode(feats) for feats in pair1_input]
        pair2_input = [self.pair2[feat].tolist() for feat in features]
        pair2_encode = [self._encode(feats) for feats in pair2_input]
        self.pair1[features] = pd.DataFrame(pair1_encode)
        self.pair2[features] = pd.DataFrame(pair2_encode)

    def _get_features(self, idx, features):
        return self.pair1.iloc[idx][features], self.pair2.iloc[idx][features]

    def _encode(self, texts):
        print(texts[0])
        sent_token = [torch.IntTensor(self.tokenizer.encode(text)) for text in texts]
        sent_token_padding = pad_sequence(sent_token, batch_first=True)
        masks = torch.Tensor([[float(value > 0) for value in values] for values in sent_token_padding])
        output = self.model(sent_token_padding, attention_mask=masks)
        embedded = output[0][:, 0, :].squeeze()
        return embedded

    def __getitem__(self, idx):
        pair1_feat, pair2_feat = self._get_features(idx, ['name', 'city', 'state', 'zip', 'country', 'categories'])
        features = self._encode(pair1_feat.tolist() + pair2_feat.tolist())
        pair1_feat = features[:6]
        pair2_feat = features[6:]
        return {
            'pair1': {
                'lat': self.pair1.iloc[idx]['latitude'],
                'long': self.pair1.iloc[idx]['longitude'],
                'name': pair1_feat[0],
                'city': pair1_feat[1],
                'state': pair1_feat[2],
                'zip': pair1_feat[3],
                'country': pair1_feat[4],
                'categories': pair1_feat[5],
            },
            'pair2': {
                'lat': self.pair2.iloc[idx]['latitude'],
                'long': self.pair2.iloc[idx]['longitude'],
                'name': pair2_feat[0],
                'city': pair2_feat[1],
                'state': pair2_feat[2],
                'zip': pair2_feat[3],
                'country': pair2_feat[4],
                'categories': pair2_feat[5],
            },
            'label': int(self.labels.iloc[idx])
        }

    def __len__(self):
        return len(self.labels)

In [57]:
test_dataset = MatchingDataset(pair1_df, pair2_df, labels)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [58]:
item = test_dataset._encode_features(['name', 'city', 'state', 'zip', 'country', 'categories'])

Taichung International Airport (RMQ) (Taichung International Airport)


KeyboardInterrupt: 

### Loading raw data

In [10]:
train_df = pd.read_csv(cfg.TRAINING_PAIRS_PATH)
train_df = train_df.fillna('Null')
labels = pd.read_csv(cfg.PAIRS_LABEL_PATH)
pair1_df = train_df[pair1_cols]
pair2_df = train_df[pair2_cols]
pair1_df.columns = cfg.CHOICE_FEATURE
pair2_df.columns = cfg.CHOICE_FEATURE
print(f'Pair 1: {pair1_df.head()}')
print(f'Pair 2: {pair2_df.head()}')
print(f'Total pair number: {len(train_df)}')
print(f'Labels: {labels}\nNumber of 1: {labels[labels == 1].sum()}\nNumber of -1: {-(labels[labels == -1].sum())}')

  train_df = pd.read_csv(cfg.TRAINING_PAIRS_PATH)


Pair 1:                                                 name   latitude   longitude  \
0  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
1  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
2  Taichung International Airport (RMQ) (Taichung...  24.254227  120.599732   
3                    Taichung Airport Int'l Terminal  24.254209  120.599844   
4                    Taichung Airport Int'l Terminal  24.254209  120.599844   

       city     state    zip country                categories  
0  Taichung  Taichung  43346      TW  Airports, Military Bases  
1  Taichung  Taichung  43346      TW  Airports, Military Bases  
2  Taichung  Taichung  43346      TW  Airports, Military Bases  
3      Null      Null   Null      TW         Airport Terminals  
4      Null      Null   Null      TW         Airport Terminals  
Pair 2:                               name   latitude   longitude    city state   zip  \
0  Taichung Airport Int'l Terminal  24

### Start build train / test dataloader

In [None]:
matching_dataset = MatchingDataset(pair1_df, pair2_df, labels)
train_indices, test_indices = make_train_test_indices(matching_dataset)
train_loader = make_dataloader(matching_dataset, train_indices, cfg.BATCH_SIZE)
test_loader = make_dataloader(matching_dataset, test_indices, cfg.BATCH_SIZE)