In [1]:
# Environment report: Python / PyTorch / CUDA / cuDNN versions and visible GPUs.
# Adapted from
# https://discuss.pytorch.org/t/i-have-3-gpu-why-torch-cuda-device-count-only-return-1/7245/4
import shutil
import subprocess
import sys
from subprocess import call  # unused in this cell; kept so the name stays defined for any cell that relies on it

import torch


def show_command_output(cmd):
    """Run an external command and print its combined stdout/stderr.

    The output is captured and re-emitted with ``print`` so that it shows up
    in the notebook cell output; a bare ``subprocess.call`` writes to the
    kernel process's own stdout, which the notebook does not display.

    Args:
        cmd: list of str -- the executable followed by its arguments.

    Returns:
        True if the executable was found and exited with status 0,
        False otherwise (a "<name>: not found" line is printed when the
        executable is missing from PATH).
    """
    if not cmd or shutil.which(cmd[0]) is None:
        print('{}: not found'.format(cmd[0] if cmd else '<empty command>'))
        return False
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr so error messages are visible too
        universal_newlines=True,
    )
    print(completed.stdout, end='')
    return completed.returncode == 0


print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION')
show_command_output(["nvcc", "--version"])
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
show_command_output(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])

# torch.cuda.current_device() raises when no CUDA device is usable, so only
# query it when CUDA is actually available.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    print('Active CUDA Device: GPU', torch.cuda.current_device())
else:
    print('Active CUDA Device: none (CUDA is not available)')

print ('Available devices ', torch.cuda.device_count())
if cuda_ready:
    print ('Current cuda device ', torch.cuda.current_device())

__Python VERSION: 3.8.3 (default, May 19 2020, 18:47:26) 
[GCC 7.3.0]
__pyTorch VERSION: 1.7.0
__CUDA VERSION
/usr/bin/sh: 1: nvcc: not found
__CUDNN VERSION: 8003
__Number CUDA Devices: 2
__Devices
Active CUDA Device: GPU 0
Available devices  2
Current cuda device  0


In [2]:
%load_ext autoreload
%autoreload 2

import json
import os
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

import pytorch_lightning as pl
import torch
import wandb
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import DataLoader

wandb.login()

from dataraw_sampling import sample_dataset, plot_sampled_distribution
from dataraw_full_matrix import gen_card_data, report_gamedata_distribution
from util_distribution import plot_distribution
from dataset import GameDatasetFromFullMatrix, GameTestFullDataset, GameDatasetTrainDataset
from datamodule import GameDataModule
from model import construct_full_model
from trainmodule import TrainModule

[34m[1mwandb[0m: Currently logged in as: [33mchucooleg[0m (use `wandb login --relogin` to force relogin)


ModuleNotFoundError: No module named 'model'

In [None]:
# arguments
# Every option is declared on the parser, so its default and help text live in
# one place and `args` always carries all three attributes.
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument(
    '--generate_full_matrix',
    action='store_true',
    help='generate the full game matrix and its ground-truth distributions',
)
parser.add_argument(
    '--data_PATH',
    type=str,
    default='../Raw_Datasets/3Attr-4Vals-2hat23Train-2hat12Val.json',
    help='JSON file the game data is read from',
)
parser.add_argument(
    '--config_PATH',
    type=str,
    default='config.json',
    help='JSON file the hyperparameters are read from',
)

# An explicit empty argument list keeps argparse from reading the kernel's
# own sys.argv; only the defaults declared above are used.
args = parser.parse_args([])

In [None]:
# Alternative: sample the dataset on the fly -- be careful, sampling can take a while.
# num_attributes = 2
# num_attr_vals = 3
# game_data = sample_dataset(num_attributes, num_attr_vals, N_train=10000, N_val=1000)

# Default: load the pre-generated dataset from the JSON file named in args.
with open(args.data_PATH, 'r') as data_file:
    game_data = json.load(data_file)

# Summarise the dataset: fields whose name contains 'datapoints' are reported
# by length only, every other field is printed in full.
print('----------data----------')
for field, value in game_data.items():
    if 'datapoints' in field:
        print(field, 'length :', len(value))
    else:
        print(field, ':', value)
print('------------------------')

In [5]:
# Load the hyperparameters from the JSON config file named in args.
with open(args.config_PATH, 'r') as f:
    hparams = json.load(f)

# Copy the dataset-dependent sizes from the loaded game data into hparams.
hparams['key_support_size'] = game_data['key_support_size']
hparams['query_support_size'] = game_data['query_support_size']
hparams['num_attributes'] = game_data['num_attributes']
hparams['num_attr_vals'] = game_data['num_attr_vals']
hparams['populate_logits_matrix'] = args.generate_full_matrix
# The membership test must look at `hparams`; the name `params` is not defined
# in this notebook and raised NameError on a fresh kernel.
# Without an 'embedding_by_property' entry the logits matrix is never populated.
if 'embedding_by_property' not in hparams:
    hparams['populate_logits_matrix'] = False

print('----------hparams----------')
for k in hparams:
    print(k, ':', hparams[k])
print('---------------------------')

----------hparams----------
seed : 42
batch_size : 128
max_epochs : 1
d_model : 128
embed_dropout : 0.0
dotproduct_bottleneck : True
loss_temperature_const : 1.0
lr : 0.001
adam_beta1 : 0.9
adam_beta2 : 0.999
adam_epsilon : 1e-08
warmup_steps : 12000
adam_weight_decay : 0
gradient_clip_val : 0
debug : False
key_support_size : 4097
query_support_size : 16777216
num_attributes : 4
num_attr_vals : 8
populate_logits_matrix : False
---------------------------


In [6]:
# main() -- only build the full matrix when the data is small.
# `gt` ends up as None unless full-matrix generation was requested; otherwise
# it is a dict holding the pieces returned by report_gamedata_distribution.
if not args.generate_full_matrix:
    gt = None
else:
    print('Generating Full Matrix')
    game_data_full = gen_card_data(
        hparams['num_attributes'],
        hparams['num_attr_vals'],
        num_unseen_cardpairs=0,
        debug=False,
    )
    (count_table,
     xy,
     xyind,
     xy_div_xyind,
     distribution) = report_gamedata_distribution(game_data_full, distribution_epsilon=0.0)
    gt = dict(
        count_table=count_table,
        xy=xy,
        xyind=xyind,
        xy_div_xyind=xy_div_xyind,
        distribution=distribution,
    )
    print(distribution)

In [7]:
# main()

# Seed before any object below is constructed.
pl.seed_everything(hparams['seed'])

# model -- the ground-truth distributions are passed on only when the logits
# matrix is populated; otherwise an empty dict is used.
gt_distributions = gt if hparams['populate_logits_matrix'] else {}
trainmodule = TrainModule(hparams, gt_distributions=gt_distributions)
model_summary = pl.core.memory.ModelSummary(trainmodule, mode='full')
print(model_summary, '\n')

# dataset
game_datamodule = GameDataModule(
    batch_size=hparams['batch_size'],
    raw_data=game_data,
    debug=hparams['debug'],
)

# testloader -- the test dataset is iterated in order (no shuffling)
test_dataset = GameTestFullDataset(raw_data=game_data, debug=hparams['debug'])
test_loader = DataLoader(test_dataset, batch_size=hparams['batch_size'], shuffle=False)

# logger -- the run name encodes the main hyperparameters and the largest
# per-layer parameter count (in thousands) from the model summary
bottleneck_tag = 'dot-product' if hparams['dotproduct_bottleneck'] else ''
largest_param_count_k = round(max(model_summary.param_nums) / 1000, 2)
run_name = (
    f"CardGame:OR;attr{hparams['num_attributes']}-val{hparams['num_attr_vals']};"
    f"d_model{hparams['d_model']};{bottleneck_tag};params{largest_param_count_k}K"
)
project_name = 'ContrastiveLearning-cardgame-Scaling-SecondPass'
wd_logger = WandbLogger(name=run_name, project=project_name)
print('RUN NAME :\n', run_name)

# checkpoint directory: checkpoints/<project>/<run>, created if missing
ckpt_dir_PATH = os.path.join('checkpoints', project_name, run_name)
os.makedirs(ckpt_dir_PATH, exist_ok=True)

Global seed set to 42


   | Name                                         | Type               | Params
-------------------------------------------------------------------------------------
0  | model                                        | EncoderPredictor   | 2.1 B 
1  | model.inp_query_layer                        | Sequential         | 2.1 B 
2  | model.inp_query_layer.scaled_embed           | ScaledEmbedding    | 2.1 B 
3  | model.inp_query_layer.scaled_embed.embedding | Embedding          | 2.1 B 
4  | model.inp_query_layer.embed_dropout          | Dropout            | 0     
5  | model.inp_key_layer                          | Sequential         | 524 K 
6  | model.inp_key_layer.scaled_embed             | ScaledEmbedding    | 524 K 
7  | model.inp_key_layer.scaled_embed.embedding   | Embedding          | 524 K 
8  | loss_criterion                               | InfoCELoss         | 0     
9  | loss_criterion.CE_loss                       | CrossEntropyLoss   | 0     
10 | metrics                      

## run training

In [10]:
# main()
# Start training, passing in the parsed args, the hyperparameters, the train
# module, the data module, the checkpoint directory and the W&B logger.
# NOTE(review): `run_train` is neither defined nor imported in the visible
# cells -- confirm where it comes from before relying on Restart & Run All.
# NOTE(review): the recorded output of this cell ends in "CUDA out of memory"
# with the query embedding summarised as 2.1 B parameters -- presumably the
# embedding table is too large for one GPU at this query_support_size; verify.
run_train(args, hparams, trainmodule, game_datamodule, ckpt_dir_PATH, wd_logger)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
  with torch.autograd.detect_anomaly():
[34m[1mwandb[0m: wandb version 0.10.19 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



   | Name                                         | Type               | Params
-------------------------------------------------------------------------------------
0  | model                                        | EncoderPredictor   | 2.1 B 
1  | model.inp_query_layer                        | Sequential         | 2.1 B 
2  | model.inp_query_layer.scaled_embed           | ScaledEmbedding    | 2.1 B 
3  | model.inp_query_layer.scaled_embed.embedding | Embedding          | 2.1 B 
4  | model.inp_query_layer.embed_dropout          | Dropout            | 0     
5  | model.inp_key_layer                          | Sequential         | 524 K 
6  | model.inp_key_layer.scaled_embed             | ScaledEmbedding    | 524 K 
7  | model.inp_key_layer.scaled_embed.embedding   | Embedding          | 524 K 
8  | loss_criterion                               | InfoCELoss         | 0     
9  | loss_criterion.CE_loss                       | CrossEntropyLoss   | 0     
10 | metrics                     

Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]



Validation sanity check:  50%|█████     | 1/2 [00:20<00:20, 20.26s/it]



Epoch 0:   0%|          | 0/67616 [00:00<?, ?it/s]                    



Epoch 0:   0%|          | 0/67616 [00:24<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 8.00 GiB (GPU 0; 23.70 GiB total capacity; 16.00 GiB already allocated; 5.80 GiB free; 16.02 GiB reserved in total by PyTorch)

## resume training

In [None]:
# Resume a previous run, passing the same objects as the training call plus the
# project name and a run identifier.
# NOTE(review): '1ih8yza5' looks like a W&B run id -- confirm it is the run to resume.
# NOTE(review): `resume_train` is neither defined nor imported in the visible
# cells -- confirm where it comes from.
resume_train(args, hparams, project_name, '1ih8yza5', trainmodule, game_datamodule, ckpt_dir_PATH, wd_logger)