This notebook explores the pop_synth and job_synth data 

In [1]:
import os


def append_env_path(var_name, new_entry):
    """Append ``new_entry`` to the path-list environment variable ``var_name``.

    Parameters
    ----------
    var_name : str
        Name of the environment variable holding a separator-delimited list.
    new_entry : str
        Directory to add to the end of that list.

    The entry is only added when it is not already listed, so re-running the
    cell does not keep growing the variable. When the variable is unset or
    empty, the entry becomes the whole value: no leading separator is written,
    because an empty element in ``PATH`` / ``LD_LIBRARY_PATH`` is treated as
    the current directory.
    """
    current = os.environ.get(var_name, '')
    if new_entry in current.split(os.pathsep):
        return
    os.environ[var_name] = os.pathsep.join([current, new_entry]) if current else new_entry


# Make the CUDA 9.0 library directory visible through both search variables.
# NOTE(review): the dynamic loader normally reads LD_LIBRARY_PATH at process
# start-up, so this may only affect child processes - confirm it has the
# intended effect for the running kernel.
CUDA_LIB_DIR = '/usr/local/cuda-9.0/lib64/'
append_env_path('LD_LIBRARY_PATH', CUDA_LIB_DIR)
append_env_path('PATH', CUDA_LIB_DIR)

In [2]:
### Importing the basic modules

#### First part: Analysis and data pre-processing
import os           # Working directory
import pandas as pd # Data processing
import matplotlib
import matplotlib.pyplot as plt # Common graphing interface (check also plotly and plotnine)

#### Second part: models and Hyperparameter optimization
import tensorflow as tf
import keras 
import numpy as np
from functools import partial

from keras.activations import relu, softmax 
from keras.callbacks import LearningRateScheduler, EarlyStopping
from keras.layers import Activation, BatchNormalization, Concatenate, concatenate, Dense, Dropout, Input, InputLayer, Lambda, LeakyReLU
#from keras.layers.merge import _Merge
from keras.losses import mse, binary_crossentropy, categorical_crossentropy, mean_squared_error
import keras.metrics as metrics
from keras.models import Model, Sequential
from keras.utils import plot_model
from keras import backend as K
from keras import metrics

#### Third part: model validation and graphs
import geopandas as gpd

# Bayesian optimization modules
#import GPy, GPyOpt

#### Extra
import random as rn

# Personal modules
import TUutils
import validationUtils
import plotUtils
from wganModel import WGAN

2023-10-29 15:36:31.897705: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-29 15:36:31.930503: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-29 15:36:31.930545: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-29 15:36:31.930564: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-29 15:36:31.936471: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-29 15:36:31.937236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

In [3]:
# Lift pandas' display limits so that frames are shown with every column and
# every row (None disables the corresponding truncation).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [4]:
# Fix the seed of every random number generator used in the notebook.
NUMPY_SEED, PYTHON_SEED, TENSORFLOW_SEED = 42, 12345, 1234

np.random.seed(NUMPY_SEED)           # NumPy global generator
rn.seed(PYTHON_SEED)                 # Python's built-in `random` module
tf.random.set_seed(TENSORFLOW_SEED)  # TensorFlow

In [5]:
# Keep TensorFlow from reserving the whole GPU memory up front: build a
# TF1-style session configuration and register that session with Keras.
config = tf.compat.v1.ConfigProto()
gpu_options = config.gpu_options
gpu_options.allow_growth = True          # allocate GPU memory on demand
gpu_options.visible_device_list = "0"    # restrict the session to device 0
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

2023-10-29 15:36:37.639939: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-29 15:36:37.640577: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [6]:
### Changing the working directory
# The data directory can be overridden without editing the notebook by setting
# the WGAN_DATA_DIR environment variable before starting the kernel; otherwise
# the original default location is used.
# Locations used previously on other machines:
#   'C:/Users/shgm/Desktop/projects/job_VAE'  (PC)
#   '/mnt/sdb1/data_shgm/'                    (Server 32)
DATA_DIR = os.environ.get(
    'WGAN_DATA_DIR',
    '/home/s212945/snap/snapd-desktop-integration/current/Documents/Thesis/codeSergio/data',
)
os.chdir(DATA_DIR)

# Basic tasks
Now I will lay out the process for the VAE. I will work through these steps and check them off accordingly.
- One-hot encoding categorical variables (OK) and standardizing numerical variables (OK).
- Loss functions.
    - Cost. (OK)
    - KL.   (OK)
- Encoder/Decoder architecture (remember they are usually symmetric)
- Model trainer (Including methodology)
- Hyper-parameter tuning (consider Bayesian optimization; note that for this purpose the steps mentioned above need to be wrapped in a function that returns a cost!)


# Extra tasks
- After working this out, ideally we could also try the fine-tuning approach
- Also try the conditional VAE (CVAE)

### Dataset constructed in merge_check

In [7]:
# Column -> dtype mapping handed to pd.read_csv when loading the data.
# Columns are grouped by target dtype: plain integers first, then the
# variables that are read as pandas categoricals.
df_dtypes = {
    **dict.fromkeys(
        [
            'IncRespondent2000', 'TotalLenExclComTrans', 'TotalMotorLen', 'TotalBicLen',
            'TotalMin', 'TotalMotorMin', 'WorkHoursPw', 'GISdistHW', 'HousehNumPers1084',
            'TotalNumTrips', 'NumTripsCorr', 'NumTripsExclComTrans', 'SessionWeight',
            'HomeAdrCitySize', 'HomeAdrDistNearestStation', 'HwDayspW', 'WorkatHomeDayspM',
            'JstartDistNearestStation', 'NightsAway', 'SessionId', 'RespAgeCorrect',
        ],
        'int32',
    ),
    **dict.fromkeys(
        [
            'Sector', 'MunicipalityOrigin', 'edu', 'PopSocio', 'Gender', 'Year',
            'InterviewType', 'PseudoYear', 'DiaryMonth', 'DiaryWeekday', 'DiaryDaytype',
            'HomeAdrNUTS', 'HomeAdrCityCode', 'HomeAdrNearestStation', 'HomeParkPoss',
            'MunicipalityDest', 'WorkHourType', 'WorkPubPriv', 'WorkParkPoss',
            'RespHasBicycle', 'RespHasSeasonticket', 'ResphasDrivlic', 'RespDrivlicYear',
            'RespIsmemCarshare', 'HousehNumcars', 'HousehCarOwnership', 'Handicap',
            'HousehAccomodation', 'HousehAccOwnorRent', 'NuclFamType', 'PosInFamily',
            'NuclFamNumAdults', 'NuclFamNumDrivLic', 'FamNumAdults', 'FamNumDrivLic',
            'DayStartNUTS', 'DayStartCityCode', 'DayStartJourneyRole', 'DayStartPurp',
            'PrimModeDay', 'ModeChainTypeDay', 'DayNumJourneys', 'JstartType',
            'JstartNearestStation', 'DayJourneyType', 'DayPrimTargetPurp',
        ],
        'category',
    ),
}
# Read the comma-separated sampling table with the dtypes declared above.
samp_df = pd.read_csv('sampling_df_no_nan.txt', sep=',', dtype=df_dtypes)

# Columns removed before modelling. Author's rationale:
# - the unique-value counts of several of these variables are
#   [1470, 501, 319, 1472, 502], hence they are dropped;
# - 'SessionId' and 'SessionWeight' contain no information about the
#   individuals, so they are dropped as well.
vars_drop = [
    'HomeAdrCityCode',
    'HomeAdrNearestStation',
    'DayStartNUTS',
    'DayStartCityCode',
    'JstartNearestStation',
    'SessionId',
    'SessionWeight',
    'GISdistHW',
    'HomeAdrDistNearestStation',
    'JstartDistNearestStation',
    'InterviewType',
    'RespDrivlicYear',
]
samp_df = samp_df.drop(columns=vars_drop)

# Global column bookkeeping used for pre-processing and estimation:
#   numerical / numerical_col_n     - numerical variables and their count
#   categorical / categorical_col_n - categorical variables and their count
#   categories_n                    - number of levels of each categorical variable
#   categories_cum                  - cumulative level counts with a leading 0
#                                     (presumably the boundaries of each variable's
#                                     one-hot block - confirm against WGAN/TUutils)


def _categorical_layout(df, categorical_cols):
    """Return ``(categories_n, categories_cum)`` for ``categorical_cols`` of ``df``.

    ``categories_n`` holds the number of categories of every listed column
    (the columns must already have the pandas ``category`` dtype).
    ``categories_cum`` is the running total of those counts, prefixed with 0,
    so it has one more element than ``categories_n``.
    """
    level_counts = [len(df[col].cat.categories) for col in categorical_cols]
    offsets = [0] + list(np.cumsum(level_counts))
    return level_counts, offsets


# Numerical variables; every other remaining column is treated as categorical.
numerical = ['IncRespondent2000', 'TotalLenExclComTrans', 'TotalMotorLen', 'TotalBicLen', 'TotalMin', 'TotalMotorMin', 'WorkHoursPw', 'HousehNumPers1084',
            'TotalNumTrips', 'NumTripsCorr', 'NumTripsExclComTrans', 'HomeAdrCitySize', 'HwDayspW', 'WorkatHomeDayspM', 'NightsAway', 'RespAgeCorrect']
numerical_col_n = len(numerical) # Number of numerical variables

categorical = [col for col in list(samp_df) if col not in numerical]
categorical_col_n = len(categorical) # Number of categorical variables

# Order the columns (numerical first, then categorical) and make sure every
# categorical column carries the 'category' dtype. Done once; the result is a
# new frame rather than an in-place modification.
samp_df = samp_df[numerical + categorical].astype({col: 'category' for col in categorical})
categories_n, categories_cum = _categorical_layout(samp_df, categorical)

# Single switch for binning the numerical variables: it is passed to
# data_creator AND controls the re-definition of the bookkeeping lists below,
# so the two can no longer disagree.
BINNING = True
train, test, validation, pre_one_hot_df, one_hot_df, col_names = TUutils.data_creator(
    samp_df, numerical=numerical, train_prop=0.812, val_prop=0.5,
    binning=BINNING, condition_on=None) # , quantiles=5 if dim too big

n_inputs = train.shape[1]

# Re-definition of the global lists. Only needed when the numerical variables
# were binned: after binning no numerical variable is left, and every column
# of pre_one_hot_df is handled as categorical.
if BINNING:
    numerical = []
    numerical_col_n = len(numerical) # Number of numerical variables

    categorical = [col for col in list(pre_one_hot_df) if col not in numerical]
    pre_one_hot_df[categorical] = pre_one_hot_df[categorical].astype('category')
    categorical_col_n = len(categorical) # Number of categorical variables
    categories_n, categories_cum = _categorical_layout(pre_one_hot_df, categorical)

TotalBicLen dropped
WorkatHomeDayspM dropped
Train shape is: (43531, 680)
Validation shape is: (43530, 680)
Test shape is: (20157, 680)


In [None]:
# Sanity check: every categorical variable must have fewer than 100 levels.
assert all(n_levels < 100 for n_levels in categories_n)

# List each categorical variable together with its number of levels,
# printed as a (name, count) tuple.
for var_name, n_levels in zip(categorical, categories_n):
    print((var_name, n_levels))

# WGAN main

In [None]:
# WGAN run with learning rate 7.51986904e-04 for both generator and critic.
epochs_WGAN = 10000
wgan_latent_dim = 100
wgan_learn_rate = 7.51986904e-04

wgan_settings = dict(
    # Data
    train=train,
    validation=validation,
    numerical_col_n=numerical_col_n,
    categorical_col_n=categorical_col_n,
    categories_n=categories_n,
    categories_cum=categories_cum,
    eval_set=['MunicipalityOrigin', 'IncRespondent2000'],
    col_names=col_names,
    original_df=samp_df,
    pre_one_hot_df=pre_one_hot_df,
    # Generator architecture
    intermediate_dim_gen=1024,
    latent_dim=wgan_latent_dim,
    n_hidden_layers_gen=1,
    # Critic architecture
    intermediate_dim_crit=1024,
    n_hidden_layers_crit=1,
    # Optimisation settings
    batch_size=256,
    epochs=epochs_WGAN,
    gen_learn_rate=wgan_learn_rate,
    crit_learn_rate=wgan_learn_rate,
    clip_value=0.01,
    nCritic=5,
    # Not passed here (left at the class defaults): drop_rate_g=0., drop_rate_c=0.25
)
_WGAN = WGAN(**wgan_settings)
# NOTE(review): presumably trains the model and evaluates it on eval_set;
# the cell output below is the loss log it prints - confirm in wganModel.
_WGAN.wgan_evaluate()



0 [D loss: -0.102705, acc.: 0.00%] [G loss: 0.182222]
100 [D loss: -0.084792, acc.: 0.00%] [G loss: 0.009491]
200 [D loss: -0.070808, acc.: 0.00%] [G loss: 0.005792]
300 [D loss: -0.064357, acc.: 0.00%] [G loss: 0.004102]
400 [D loss: -0.055730, acc.: 0.00%] [G loss: 0.007309]
500 [D loss: -0.067559, acc.: 0.00%] [G loss: 0.002076]
600 [D loss: -0.061318, acc.: 0.00%] [G loss: 0.006647]
700 [D loss: -0.057892, acc.: 0.20%] [G loss: 0.003955]
800 [D loss: -0.059718, acc.: 0.00%] [G loss: -0.001528]
900 [D loss: -0.056706, acc.: 0.00%] [G loss: 0.002873]
1000 [D loss: -0.058395, acc.: 0.00%] [G loss: -0.003309]
1100 [D loss: -0.053401, acc.: 0.00%] [G loss: -0.000031]
1200 [D loss: -0.050173, acc.: 0.00%] [G loss: 0.000381]
1300 [D loss: -0.053276, acc.: 0.20%] [G loss: -0.003076]
1400 [D loss: -0.048759, acc.: 0.00%] [G loss: -0.001267]
1500 [D loss: -0.052517, acc.: 0.00%] [G loss: 0.002058]
1600 [D loss: -0.056020, acc.: 0.78%] [G loss: -0.004974]
1700 [D loss: -0.049307, acc.: 0.00%]

In [None]:
# WGAN run with learning rate 1.51986904e-04 for both generator and critic.
epochs_WGAN = 10000
wgan_latent_dim = 100
wgan_learn_rate = 1.51986904e-04

wgan_settings = dict(
    # Data
    train=train,
    validation=validation,
    numerical_col_n=numerical_col_n,
    categorical_col_n=categorical_col_n,
    categories_n=categories_n,
    categories_cum=categories_cum,
    eval_set=['MunicipalityOrigin', 'IncRespondent2000'],
    col_names=col_names,
    original_df=samp_df,
    pre_one_hot_df=pre_one_hot_df,
    # Generator architecture
    intermediate_dim_gen=1024,
    latent_dim=wgan_latent_dim,
    n_hidden_layers_gen=1,
    # Critic architecture
    intermediate_dim_crit=1024,
    n_hidden_layers_crit=1,
    # Optimisation settings
    batch_size=256,
    epochs=epochs_WGAN,
    gen_learn_rate=wgan_learn_rate,
    crit_learn_rate=wgan_learn_rate,
    clip_value=0.01,
    nCritic=5,
)
_WGAN = WGAN(**wgan_settings)
_WGAN.wgan_evaluate()

# Experiment notes (author's log; "no" presumably marks variants that did not help):
#latent 150 batch 128 no
#latent 50 inter 512 no
#change in dropout or optimizer no

In [None]:
# Plot the recorded generator and critic losses of the last WGAN run.
# The losses are drawn against their own index instead of range(epochs_WGAN),
# so the plot still works when the run stopped before epochs_WGAN iterations
# (the x values are identical whenever the lengths match).
fig, ax = plt.subplots()
ax.plot(_WGAN.gen_loss, 'k', label='Generator loss')
ax.plot(_WGAN.crit_loss, label='Critic loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('WGAN training losses')
ax.legend();

In [None]:
# Draw 5000 synthetic records and compare them with the test set on a few
# aggregation variables.
_WGAN.n_samples = 5000
wgan_n  = TUutils.samples_to_df(_WGAN.sampler(), print_duplicates=False, col_names=col_names, original_df=samp_df, pre_one_hot_df=pre_one_hot_df)

# Variables to aggregate on. Kept in its own name so the notebook-wide
# `categorical` list (all categorical columns, paired with categories_n) is
# not overwritten when this cell runs.
# Alternative set tried before: ['IncRespondent2000', 'ResphasDrivlic', 'RespAgeCorrect', 'PopSocio']  wgan50ndo
agg_vars = ['MunicipalityOrigin', 'MunicipalityDest', 'Sector', 'IncRespondent2000']
_ = validationUtils.evaluate(real=test, model=wgan_n, agg_vars=agg_vars, col_names=col_names, original_df=samp_df, pre_one_hot_df=pre_one_hot_df, n_samples=80000, plot=True)

In [None]:
# Draw a smaller batch of 500 synthetic records and display it as a DataFrame.
_WGAN.n_samples = 500
generated_batch = _WGAN.sampler()
wgan_n = TUutils.samples_to_df(
    generated_batch,
    print_duplicates=False,
    col_names=col_names,
    original_df=samp_df,
    pre_one_hot_df=pre_one_hot_df,
)
wgan_n

In [None]:
# Draw 5000 synthetic records and write them to a comma-separated text file.
_WGAN.n_samples = 5000
# The call is completed the same way as the identical calls in the cells above
# (the original line was cut off before the closing parenthesis).
wgan_n  = TUutils.samples_to_df(_WGAN.sampler(), print_duplicates=False, col_names=col_names, original_df=samp_df, pre_one_hot_df=pre_one_hot_df)
# Export the frame that was just created (`wgan_n`; the original referenced
# the undefined name `wagn_n`).
wgan_n.to_csv('wgan_samples.txt', sep=',', index=False)