In [3]:
import yt.wrapper as yt
from tqdm import tqdm

import os
import sys


sys.path.append('../')
sys.path.append('./')
sys.path.append('../proto/')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from embedder_api.config import S3_ENDPOINT, S3_BUCKET

from models.dssm import DSSM
from models.dataset_yt import DSSMDataset, collate_batch
from models.model_helper import train, get_default_device, get_optimizer

NUM_WORKERS = 16
torch.set_num_threads(NUM_WORKERS)


print(os.getenv('INPUT_PATH'))

os.environ['LOGS_PATH'] = "./logs"


# Параметры обучения
config = {
    'epochs': 3,
    'learning_rate': 0.01,
    'weight_decay': 0.01,
    'lr_step_size': 10,
    'lr_gamma': 0.9,
    'batch_size': 64,
    'offer_num_factors': ['CTR_HISTORY', 'EXPOSITION_TIME', 'FLOOR', 'FLOOR_RATIO', 'PREDICTED_PRICE', 'PREDICTED_PRICE_RATIO', 'PREVIOUS_PRICE', 'PREVIOUS_PRICE_RATIO', 'PRICE', 'PRICE_SQM', 'ROOMS_OFFERED', 'ROOMS_TOTAL', 'TOTAL_AREA', 'TOTAL_IMAGES', 'YEAR', 'BUILDING_FLOORS', 'COMISSION_PERCENTAGE', 'COMISSION_VALUE', 'DESCRIPTION_CAPS_WORDS_RATIO', 'DESCRIPTION_LENGTH', 'DESCRIPTION_POSSIBLE_LENGTH_RATIO', 'DESCRIPTION_PUNKTUATION_COUNT', 'DESCRIPTION_UNIQUE_CHARS', 'DESCRIPTION_UNIQUE_TOKENS', 'PREPAYMENT_PERCENTAGE', 'PREPAYMENT_VALUE', 'TIME_TO_CITY_CENTRE', 'TIME_TO_METRO_CAR', 'TIME_TO_METRO_FOOT', 'IMAGE_META_2_OF_6_0_MAIN', 'IMAGE_META_500PX_0_MAIN', 'IMAGE_META_AADB_AESTHETIC_0_MAIN', 'IMAGE_META_BAD_QUALITY_V2_MAIN', 'IMAGE_META_BAD_QUALITY_V3_MAIN', 'IMAGE_META_DOCS_WITH_PLANS_MAIN', 'IMAGE_META_DOCS_WO_PLANS_MAIN', 'IMAGE_META_ENTRANCE_STAIRS_MAIN', 'IMAGE_META_GOOD_QUALITY_V2_MAIN', 'IMAGE_META_GOOD_QUALITY_V3_MAIN', 'IMAGE_META_INTERIOR_MAIN', 'IMAGE_META_KITCHEN_MAIN', 'IMAGE_META_MAPS_MAIN', 'IMAGE_META_OK_QUALITY_V2_MAIN', 'IMAGE_META_OK_QUALITY_V3_MAIN', 'IMAGE_META_OTHER_MAIN', 'IMAGE_META_OUTSIDE_MAIN', 'IMAGE_META_PAIRWISE_ATTR_0_MAIN', 'IMAGE_META_QUALITY_IMAGE_COSMETIC_MAIN', 'IMAGE_META_QUALITY_IMAGE_EURO_MAIN', 'IMAGE_META_QUALITY_IMAGE_NEED_REPAIR_MAIN', 'IMAGE_META_QUALITY_IMAGE_OTHER_MAIN', 'IMAGE_META_SIMILAR_AWFUL_MAIN', 'IMAGE_META_SIMILAR_EURO_MAIN', 'IMAGE_META_SIMILAR_FRESH_MAIN', 'IMAGE_META_SIMILAR_NEED_REPAIR_MAIN', 'IMAGE_META_SPAM_MAIN', 'IMAGE_META_WC_MAIN', 'IMAGE_SIZE_X', 'IMAGE_SIZE_Y'],
    'query_num_factors': ['USER_CTR', 'USER_CTR_SELL', 'USER_CTR_RENT', 'SELL_RATIO', 'SELL_CLICKED_RATIO', 'RENT_RATIO', 'RENT_CLICKED_RATIO', 'VIEWED_SECONDARY_RATIO', 'VIEWED_NEW_RATIO', 'VIEWED_NEW_SECONDARY_RATIO', 'VIEWED_ROOM_RATIO_1', 'VIEWED_ROOM_RATIO_2', 'VIEWED_ROOM_RATIO_3', 'VIEWED_ROOM_4_RATIO_MORE', 'CLICKED_SECONDARY_RATIO', 'CLICKED_NEW_RATIO', 'CLICKED_NEW_SECONDARY_RATIO', 'CLICKED_ROOM_RATIO_1', 'CLICKED_ROOM_RATIO_2', 'CLICKED_ROOM_RATIO_3', 'CLICKED_ROOM_4_RATIO_MORE', 'CLICKED_RENT_RATIO', 'PHONE_SHOW_ROOM_RATIO_1', 'PHONE_SHOW_ROOM_RATIO_2', 'PHONE_SHOW_ROOM_RATIO_3', 'PHONE_SHOW_ROOM_4_RATIO_MORE', 'PHONE_SHOW_RATIO_SELL', 'PHONE_SHOW_CARD_RATIO_SELL', 'PHONE_SHOW_LISTING_RATIO_SELL', 'PHONE_SHOW_RATIO_RENT', 'PHONE_SHOW_CARD_RATIO_RENT', 'PHONE_SHOW_LISTING_RATIO_RENT', 'AVG_BUILDING_FLOORS_CLICKED', 'MAX_BUILDING_FLOORS_CLICKED', 'MIN_BUILDING_FLOORS_CLICKED', 'AVG_BUILDING_FLOORS_VIEWED', 'MAX_BUILDING_FLOORS_VIEWED', 'MIN_BUILDING_FLOORS_VIEWED', 'AVG_CTR_HISTORY_SELL', 'AVG_CTR_HISTORY_RENT', 'AVG_FLOOR_CLICKED', 'MAX_FLOOR_CLICKED', 'MIN_FLOOR_CLICKED', 'AVG_FLOOR_VIEWED', 'MAX_FLOOR_VIEWED', 'MIN_FLOOR_VIEWED', 'MIN_PRICE_SELL_VIEWED', 'MAX_PRICE_SELL_VIEWED', 'AVG_PRICE_SELL_VIEWED', 'MAX_PRICE_SELL_CLICKED', 'AVG_PRICE_SELL_CLICKED', 'MIN_PRICE_RENT_VIEWED', 'MAX_PRICE_RENT_VIEWED', 'AVG_PRICE_RENT_VIEWED', 'MAX_PRICE_RENT_CLICKED', 'AVG_PRICE_RENT_CLICKED', 'MIN_PRICE_SQM_SELL_VIEWED', 'MAX_PRICE_SQM_SELL_VIEWED', 'AVG_PRICE_SQM_SELL_VIEWED', 'AVG_PRICE_SQM_SELL_CLICKED', 'MIN_PRICE_SQM_RENT_VIEWED', 'MAX_PRICE_SQM_RENT_VIEWED', 'AVG_PRICE_SQM_RENT_VIEWED', 'AVG_PRICE_SQM_RENT_CLICKED', 'AVG_TOTAL_AREA_SELL_VIEWED', 'MAX_TOTAL_AREA_SELL_VIEWED', 'MIN_TOTAL_AREA_SELL_VIEWED', 'AVG_TOTAL_AREA_SELL_CLICKED', 'MAX_TOTAL_AREA_SELL_CLICKED', 'MIN_TOTAL_AREA_SELL_CLICKED', 'AVG_TOTAL_AREA_RENT_VIEWED', 'MAX_TOTAL_AREA_RENT_VIEWED', 'MIN_TOTAL_AREA_RENT_VIEWED', 'AVG_TOTAL_AREA_RENT_CLICKED', 'MAX_TOTAL_AREA_RENT_CLICKED', 'MIN_TOTAL_AREA_RENT_CLICKED', 'AVG_YEAR_SELL_VIEWED', 'MAX_YEAR_SELL_VIEWED', 'MIN_YEAR_SELL_VIEWED', 'AVG_YEAR_SELL_CLICKED', 'MAX_YEAR_SELL_CLICKED', 'MIN_YEAR_SELL_CLICKED', 'AVG_YEAR_RENT_VIEWED', 'MAX_YEAR_RENT_VIEWED', 'MIN_YEAR_RENT_VIEWED', 'AVG_YEAR_RENT_CLICKED', 'MAX_YEAR_RENT_CLICKED', 'MIN_YEAR_RENT_CLICKED'],
    "hidden_dim": 96,
    "embedding_dim": 32,
    "dropout_first": 0.2,
    "dropout_second": 0.1,
    'device': get_default_device(),
    'logging_folder': os.getenv('LOGS_PATH')
}


# YT
POOL_PATH_TRAIN = '//home/verticals/realty/ranking/dev_dssm_1000'
POOL_PATH_TEST = '//home/verticals/realty/ranking/dev_dssm_1000'



train_dataset = DSSMDataset(POOL_PATH_TRAIN)
test_dataset = DSSMDataset(POOL_PATH_TEST)

train_loader = DataLoader(train_dataset,
                          batch_size=config['batch_size'],
                          shuffle=False,
                          collate_fn=collate_batch)
test_loader = DataLoader(test_dataset,
                         batch_size=config['batch_size'],
                         shuffle=False,
                         collate_fn=collate_batch)

# print(1)
# model = DSSM(config).to(config['device'])
# opt = get_optimizer(model, lr=config['learning_rate'], wd=config['weight_decay'])
# scheduler = optim.lr_scheduler.StepLR(opt, step_size=config['lr_step_size'], gamma=config['lr_gamma'])

# criterion = nn.BCEWithLogitsLoss()

# train(model, opt, scheduler, criterion, train_loader, test_loader, config)


None


In [None]:
row = next(train_dataset.iterator)

In [1]:
import yt.wrapper as yt
from tqdm import tqdm

import os
import sys


sys.path.append('../')
sys.path.append('./')
sys.path.append('../proto/')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from embedder_api.config import S3_ENDPOINT, S3_BUCKET

from models.dssm import DSSM
from models.dataset_yt import DSSMDataset, collate_batch, collate_batch_query
from models.model_helper import train, get_default_device, get_optimizer, inference_query

NUM_WORKERS = 16
torch.set_num_threads(NUM_WORKERS)


print(os.getenv('INPUT_PATH'))

os.environ['LOGS_PATH'] = "./logs"


# Параметры обучения
config = {
    'epochs': 3,
    'learning_rate': 0.01,
    'weight_decay': 0.01,
    'lr_step_size': 10,
    'lr_gamma': 0.9,
    'batch_size': 64,
    'offer_num_factors': ['CTR_HISTORY', 'EXPOSITION_TIME', 'FLOOR', 'FLOOR_RATIO', 'PREDICTED_PRICE', 'PREDICTED_PRICE_RATIO', 'PREVIOUS_PRICE', 'PREVIOUS_PRICE_RATIO', 'PRICE', 'PRICE_SQM', 'ROOMS_OFFERED', 'ROOMS_TOTAL', 'TOTAL_AREA', 'TOTAL_IMAGES', 'YEAR', 'BUILDING_FLOORS', 'COMISSION_PERCENTAGE', 'COMISSION_VALUE', 'DESCRIPTION_CAPS_WORDS_RATIO', 'DESCRIPTION_LENGTH', 'DESCRIPTION_POSSIBLE_LENGTH_RATIO', 'DESCRIPTION_PUNKTUATION_COUNT', 'DESCRIPTION_UNIQUE_CHARS', 'DESCRIPTION_UNIQUE_TOKENS', 'PREPAYMENT_PERCENTAGE', 'PREPAYMENT_VALUE', 'TIME_TO_CITY_CENTRE', 'TIME_TO_METRO_CAR', 'TIME_TO_METRO_FOOT', 'IMAGE_META_2_OF_6_0_MAIN', 'IMAGE_META_500PX_0_MAIN', 'IMAGE_META_AADB_AESTHETIC_0_MAIN', 'IMAGE_META_BAD_QUALITY_V2_MAIN', 'IMAGE_META_BAD_QUALITY_V3_MAIN', 'IMAGE_META_DOCS_WITH_PLANS_MAIN', 'IMAGE_META_DOCS_WO_PLANS_MAIN', 'IMAGE_META_ENTRANCE_STAIRS_MAIN', 'IMAGE_META_GOOD_QUALITY_V2_MAIN', 'IMAGE_META_GOOD_QUALITY_V3_MAIN', 'IMAGE_META_INTERIOR_MAIN', 'IMAGE_META_KITCHEN_MAIN', 'IMAGE_META_MAPS_MAIN', 'IMAGE_META_OK_QUALITY_V2_MAIN', 'IMAGE_META_OK_QUALITY_V3_MAIN', 'IMAGE_META_OTHER_MAIN', 'IMAGE_META_OUTSIDE_MAIN', 'IMAGE_META_PAIRWISE_ATTR_0_MAIN', 'IMAGE_META_QUALITY_IMAGE_COSMETIC_MAIN', 'IMAGE_META_QUALITY_IMAGE_EURO_MAIN', 'IMAGE_META_QUALITY_IMAGE_NEED_REPAIR_MAIN', 'IMAGE_META_QUALITY_IMAGE_OTHER_MAIN', 'IMAGE_META_SIMILAR_AWFUL_MAIN', 'IMAGE_META_SIMILAR_EURO_MAIN', 'IMAGE_META_SIMILAR_FRESH_MAIN', 'IMAGE_META_SIMILAR_NEED_REPAIR_MAIN', 'IMAGE_META_SPAM_MAIN', 'IMAGE_META_WC_MAIN', 'IMAGE_SIZE_X', 'IMAGE_SIZE_Y'],
    'query_num_factors': ['USER_CTR', 'USER_CTR_SELL', 'USER_CTR_RENT', 'SELL_RATIO', 'SELL_CLICKED_RATIO', 'RENT_RATIO', 'RENT_CLICKED_RATIO', 'VIEWED_SECONDARY_RATIO', 'VIEWED_NEW_RATIO', 'VIEWED_NEW_SECONDARY_RATIO', 'VIEWED_ROOM_RATIO_1', 'VIEWED_ROOM_RATIO_2', 'VIEWED_ROOM_RATIO_3', 'VIEWED_ROOM_4_RATIO_MORE', 'CLICKED_SECONDARY_RATIO', 'CLICKED_NEW_RATIO', 'CLICKED_NEW_SECONDARY_RATIO', 'CLICKED_ROOM_RATIO_1', 'CLICKED_ROOM_RATIO_2', 'CLICKED_ROOM_RATIO_3', 'CLICKED_ROOM_4_RATIO_MORE', 'CLICKED_RENT_RATIO', 'PHONE_SHOW_ROOM_RATIO_1', 'PHONE_SHOW_ROOM_RATIO_2', 'PHONE_SHOW_ROOM_RATIO_3', 'PHONE_SHOW_ROOM_4_RATIO_MORE', 'PHONE_SHOW_RATIO_SELL', 'PHONE_SHOW_CARD_RATIO_SELL', 'PHONE_SHOW_LISTING_RATIO_SELL', 'PHONE_SHOW_RATIO_RENT', 'PHONE_SHOW_CARD_RATIO_RENT', 'PHONE_SHOW_LISTING_RATIO_RENT', 'AVG_BUILDING_FLOORS_CLICKED', 'MAX_BUILDING_FLOORS_CLICKED', 'MIN_BUILDING_FLOORS_CLICKED', 'AVG_BUILDING_FLOORS_VIEWED', 'MAX_BUILDING_FLOORS_VIEWED', 'MIN_BUILDING_FLOORS_VIEWED', 'AVG_CTR_HISTORY_SELL', 'AVG_CTR_HISTORY_RENT', 'AVG_FLOOR_CLICKED', 'MAX_FLOOR_CLICKED', 'MIN_FLOOR_CLICKED', 'AVG_FLOOR_VIEWED', 'MAX_FLOOR_VIEWED', 'MIN_FLOOR_VIEWED', 'MIN_PRICE_SELL_VIEWED', 'MAX_PRICE_SELL_VIEWED', 'AVG_PRICE_SELL_VIEWED', 'MAX_PRICE_SELL_CLICKED', 'AVG_PRICE_SELL_CLICKED', 'MIN_PRICE_RENT_VIEWED', 'MAX_PRICE_RENT_VIEWED', 'AVG_PRICE_RENT_VIEWED', 'MAX_PRICE_RENT_CLICKED', 'AVG_PRICE_RENT_CLICKED', 'MIN_PRICE_SQM_SELL_VIEWED', 'MAX_PRICE_SQM_SELL_VIEWED', 'AVG_PRICE_SQM_SELL_VIEWED', 'AVG_PRICE_SQM_SELL_CLICKED', 'MIN_PRICE_SQM_RENT_VIEWED', 'MAX_PRICE_SQM_RENT_VIEWED', 'AVG_PRICE_SQM_RENT_VIEWED', 'AVG_PRICE_SQM_RENT_CLICKED', 'AVG_TOTAL_AREA_SELL_VIEWED', 'MAX_TOTAL_AREA_SELL_VIEWED', 'MIN_TOTAL_AREA_SELL_VIEWED', 'AVG_TOTAL_AREA_SELL_CLICKED', 'MAX_TOTAL_AREA_SELL_CLICKED', 'MIN_TOTAL_AREA_SELL_CLICKED', 'AVG_TOTAL_AREA_RENT_VIEWED', 'MAX_TOTAL_AREA_RENT_VIEWED', 'MIN_TOTAL_AREA_RENT_VIEWED', 'AVG_TOTAL_AREA_RENT_CLICKED', 'MAX_TOTAL_AREA_RENT_CLICKED', 'MIN_TOTAL_AREA_RENT_CLICKED', 'AVG_YEAR_SELL_VIEWED', 'MAX_YEAR_SELL_VIEWED', 'MIN_YEAR_SELL_VIEWED', 'AVG_YEAR_SELL_CLICKED', 'MAX_YEAR_SELL_CLICKED', 'MIN_YEAR_SELL_CLICKED', 'AVG_YEAR_RENT_VIEWED', 'MAX_YEAR_RENT_VIEWED', 'MIN_YEAR_RENT_VIEWED', 'AVG_YEAR_RENT_CLICKED', 'MAX_YEAR_RENT_CLICKED', 'MIN_YEAR_RENT_CLICKED'],
    "hidden_dim": 96,
    "embedding_dim": 32,
    "dropout_first": 0.2,
    "dropout_second": 0.1,
    'device': get_default_device(),
    'logging_folder': os.getenv('LOGS_PATH')
}


# YT
POOL_PATH = '//home/verticals/realty/ranking/tmp/output1__iNan-UQPQCOkWibfLWiszA'



val_dataset = DSSMDataset(POOL_PATH)

val_loader = DataLoader(val_dataset,
                          batch_size=config['batch_size'],
                          shuffle=False,
                          collate_fn=collate_batch_query)
# print(1)
model = DSSM(config).to(config['device'])
# opt = get_optimizer(model, lr=config['learning_rate'], wd=config['weight_decay'])
# scheduler = optim.lr_scheduler.StepLR(opt, step_size=config['lr_step_size'], gamma=config['lr_gamma'])

# criterion = nn.BCEWithLogitsLoss()

# train(model, opt, scheduler, criterion, train_loader, test_loader, config)



None


In [2]:
res = inference_query(model, val_loader, config['device'])

 inference : 1452it [00:04, 303.82it/s]


In [5]:
len(res)

92924

In [19]:
res = {}

for c_id, vec in zip(crypta_ids, query_vec):
    res[c_id] = vec.tolist()

In [2]:
for row in val_loader:
    break

In [2]:
import sys
sys.path.append('../proto/')
from ads.bsyeti.libs.log_protos.universal_update_pb2 import TProfileUpdate
from yabs.proto.user_profile_pb2 import Profile
# from yabs.proto.keywords_pb2 import KI_REALTY_SITEID_VIEWED
KI_REALTY_VECTOR = 995

import datetime;
ts = 
print(ts)
# 1601158602.5595

1601275873.87575


In [13]:
ts = datetime.datetime.now().replace(microsecond=0).timestamp()
ts

1601276129.0

In [11]:
ts = int(datetime.datetime.now().timestamp())

item = Profile.ProfileItem()
item.keyword_id = KI_REALTY_VECTOR
item.string_value = b"lolkekcheburek"
item.update_time = ts

profile = Profile()
profile.items.append(item)

command = TProfileUpdate.TCommand()
command.add.CopyFrom(profile)

msg = TProfileUpdate()
msg.crypta_id = 123
msg.commands.append(command)
msg.timestamp = ts

In [12]:
msg

commands {
  add {
    items {
      keyword_id: 995
      update_time: 1601276063
      string_value: "lolkekcheburek"
    }
  }
}
timestamp: 1601276063
crypta_id: 123

In [22]:
message = msg.SerializeToString()

In [19]:
value = bytes(";".join([str(x) for x in [1.00001,-2.13412]]), 'utf8')

In [44]:
msg.DESCRIPTOR.SerializeToString()

AttributeError: 'google.protobuf.pyext._message.MessageDescriptor' object has no attribute 'SerializeToString'

In [45]:
from google.protobuf.pyext._message import MessageDe

In [61]:
from ads.bsyeti.libs.log_protos.universal_update_pb2 import DESCRIPTOR as universal_update_pb2_descr


from ads.bsyeti.libs.log_protos.universal_update_pb2 import yabs_dot_proto_dot_user__profile__pb2

In [56]:
universal_update_pb2_descr.serialized_pb

b'\n1ads/bsyeti/libs/log_protos/universal_update.proto\x12\rNBSYeti.NLogs\x1a\x1dyabs/proto/user_profile.proto"\xfb\x02\n\x0eTProfileUpdate\x128\n\x08commands\x18\x01 \x03(\x0b2&.NBSYeti.NLogs.TProfileUpdate.TCommand\x12\x11\n\ttimestamp\x18\x02 \x01(\x04\x12\x14\n\nyandex_uid\x18\x03 \x01(\x04H\x00\x12\x16\n\x0cpassport_uid\x18\x04 \x01(\x04H\x00\x12\x0e\n\x04idfa\x18\x05 \x01(\tH\x00\x12\x0e\n\x04gaid\x18\x06 \x01(\tH\x00\x12\x0e\n\x04duid\x18\x07 \x01(\x04H\x00\x12\x13\n\tcrypta_id\x18\x08 \x01(\x04H\x00\x12\x0e\n\x04oaid\x18\t \x01(\tH\x00\x1a\x8d\x01\n\x08TCommand\x12"\n\x03add\x18\x01 \x01(\x0b2\x13.yabs.proto.ProfileH\x00\x12%\n\x06remove\x18\x02 \x01(\x0b2\x13.yabs.proto.ProfileH\x00\x12,\n\radd_secondary\x18\x03 \x01(\x0b2\x13.yabs.proto.ProfileH\x00B\x08\n\x06actionB\t\n\x07user_id'

In [75]:
import gzip

f = gzip.open('./file.txt.gz', 'wb')
f.write(content)
f.close()

'ads/bsyeti/libs/log_protos/universal_update.proto'

In [78]:
content = universal_update_pb2_descr.serialized_pb

In [9]:
import torch
import torch.optim as optim
from torch.utils.data import IterableDataset, DataLoader
# from torch.utils.data import DataLoader

import datetime
import json
import os
from tqdm import tqdm
import sys
import math

import yt.wrapper as yt

yt.config["proxy"]["url"] = "hahn"
yt.config["read_parallel"]["enable"] = True
yt.config["remote_temp_tables_directory"] = "//home/verticals/realty/ranking/tmp"

In [58]:
class DSSMDataset(IterableDataset):
    def __init__(self, mr_table_json):
        super(DSSMDataset).__init__()

        self.mr_json = json.loads(mr_table_json)
        self.mr_table = self.mr_json['table']

        self.start = 0
        self.end = 1000

    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        print(worker_info)
        if worker_info is None:  # single-process data loading, return the full iterator
            iter_start = self.start
            iter_end = self.end
        else:  # in a worker process
            # split workload
            per_worker = int(math.ceil((self.end - self.start) / float(worker_info.num_workers)))
            worker_id = worker_info.id
            iter_start = self.start + worker_id * per_worker
            iter_end = min(iter_start + per_worker, self.end)

        return iter(range(iter_start, iter_end)) #yt.read_table(yt.TablePath(self.pool_path, start_index=iter_start, end_index=iter_end))

class MyIterableDataset(IterableDataset):
    def __init__(self, start, end):
        super(MyIterableDataset).__init__()
        assert end > start, "this example code only works with end >= start"
        self.start = start
        self.end = end

    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:  # single-process data loading, return the full iterator
            iter_start = self.start
            iter_end = self.end
        else:  # in a worker process
            # split workload
            per_worker = int(math.ceil((self.end - self.start) / float(worker_info.num_workers)))
            worker_id = worker_info.id
            iter_start = self.start + worker_id * per_worker
            iter_end = min(iter_start + per_worker, self.end)
        return iter(range(iter_start, iter_end))

In [62]:
ds = MyIterableDataset(start=0, end=7)

# >>> # Single-process loading
train_loader = DataLoader(ds, num_workers=10, batch_size=4)
# [3, 4, 5, 6]

res = []
for batch in train_loader:
    res.append(batch)

res

RuntimeError: DataLoader worker (pid(s) 47472, 47474) exited unexpectedly

In [48]:
torch.set_num_threads(16)

train_dataset = DSSMDataset('{"table": "//home/verticals/realty/ranking/dev_dssm_1000","cluster": "hahn"}')
test_dataset = DSSMDataset('{"table": "//home/verticals/realty/ranking/dev_dssm_1000","cluster": "hahn"}')

train_loader = DataLoader(train_dataset,
                          batch_size=3,
                          shuffle=False,
                          num_workers=1
#                           collate_fn=collate_batch,
#                           nu
                         )
test_loader = DataLoader(test_dataset,
                         batch_size=16,
                         shuffle=False,)
#                          collate_fn=collate_batch)

In [49]:
res = []
for batch in train_loader:
    res.append(batch)

RuntimeError: DataLoader worker (pid(s) 47248) exited unexpectedly

In [31]:
res

[tensor([0, 1, 2]),
 tensor([3, 4, 5]),
 tensor([6, 7, 8]),
 tensor([ 9, 10, 11]),
 tensor([12, 13, 14]),
 tensor([15, 16, 17]),
 tensor([18, 19, 20]),
 tensor([21, 22, 23]),
 tensor([24, 25, 26]),
 tensor([27, 28, 29]),
 tensor([30, 31, 32]),
 tensor([33, 34, 35]),
 tensor([36, 37, 38]),
 tensor([39, 40, 41]),
 tensor([42, 43, 44]),
 tensor([45, 46, 47]),
 tensor([48, 49, 50]),
 tensor([51, 52, 53]),
 tensor([54, 55, 56]),
 tensor([57, 58, 59]),
 tensor([60, 61, 62]),
 tensor([63, 64, 65]),
 tensor([66, 67, 68]),
 tensor([69, 70, 71]),
 tensor([72, 73, 74]),
 tensor([75, 76, 77]),
 tensor([78, 79, 80]),
 tensor([81, 82, 83]),
 tensor([84, 85, 86]),
 tensor([87, 88, 89]),
 tensor([90, 91, 92]),
 tensor([93, 94, 95]),
 tensor([96, 97, 98]),
 tensor([ 99, 100, 101]),
 tensor([102, 103, 104]),
 tensor([105, 106, 107]),
 tensor([108, 109, 110]),
 tensor([111, 112, 113]),
 tensor([114, 115, 116]),
 tensor([117, 118, 119]),
 tensor([120, 121, 122]),
 tensor([123, 124, 125]),
 tensor([126, 12

In [23]:
yt.create_temp_table(path=None, prefix=None, expiration_timeout=12*60*60*1000)

'//home/verticals/realty/ranking/tmp/SdjWhKmQa9'

In [27]:
string = '{"table": "//home/verticals/realty/ranking/dev_dssm_1000", "cluster": "hahn"}'

In [29]:
json.loads(string)['table']

'//home/verticals/realty/ranking/dev_dssm_1000'

In [34]:
# schema = [
#         {"name": "universal_update", "type": "uint64"},
#     ]

# yt.create("table", '//home/verticals/realty/ranking/tmp/uint64test', attributes={"schema": schema})



yt.write_table('//home/verticals/realty/ranking/tmp/uint64test', 
               ({"universal_update": _} for _ in range(100)))

#     table_path = f"//home/verticals/__private_export/bigb-realty-embedding/{current_time}"
#     yt.copy(temp_table, table_path, recursive=True)