In [23]:
import pandas as pd
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] ='3'
import tensorflow as tf
import itertools
import pyarrow.parquet as pq
import pyarrow as pa

# Import data and models

In [2]:
# Load the survey data that every later cell predicts on.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the
# CSV was saved together with its index; that column is kept and flows into all
# later steps (model inputs and the exported results) — confirm this is intended.
data = pd.read_csv('../Data/LT_DU_data_for_ML.csv')
data.head()

Unnamed: 0,nace,esize_class,gender,age_class,lpk,education,experience,target
0,C,1_49,M,40-49,p721,G2,13,8.2
1,C,1_49,F,40-49,p334,G2,0,2.51
2,M,50_249,F,40-49,p522,G2,18,2.19
3,M,50_249,F,40-49,p522,G2,12,2.19
4,M,50_249,F,14-29,p522,G2,0,2.19


In [3]:
# NOTE(review): this import sits outside the notebook's import cell.
import joblib
# Load the persisted model objects used by apply_models().
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading; only load model files that come from a trusted source.
xgb_model = joblib.load('./Models/XGBoost_model.joblib')
xgb_mc_model = joblib.load('./Models/XGBoost_manual_cat_model.joblib')
grb_model = joblib.load('./Models/GRBoost_model.joblib')
lm_model = joblib.load('./Models/LM_model.joblib')
rf_model = joblib.load('./Models/RF_model.joblib')
svr_model = joblib.load('./Models/SVR_model.joblib')
sk_voting_model = joblib.load('./Models/SklearnVoting_model.joblib')
# The TensorFlow model is loaded through Keras from its own path rather than joblib.
tf_model = tf.keras.models.load_model("./Models/tf_model")

## Functions to create TensorFlow datasets and apply models

In [9]:
def df_to_dataset(data, shuffle=False, batch_size=256):
  """
  Create a batched TensorFlow dataset from a pandas data frame.

  Every column of `data` becomes one entry of a feature dictionary keyed by
  the column name, with its values reshaped to a column vector of shape
  (n_rows, 1). The dataset is sliced row-wise from that dictionary.

  Parameters
  ----------
  data : pandas.DataFrame
    Frame to convert. It is only read, never modified.
  shuffle : bool, default False
    If true, shuffle the rows using a buffer as large as the frame.
  batch_size : int, default 256
    Number of rows per batch.

  Returns
  -------
  tf.data.Dataset
    Dataset yielding dictionaries of batched feature tensors.
  """
  # Add a trailing axis so each feature value is sliced as a length-1 vector.
  features = {key: value.to_numpy()[:, tf.newaxis] for key, value in data.items()}
  ds = tf.data.Dataset.from_tensor_slices(features)
  if shuffle:
    ds = ds.shuffle(buffer_size=len(data))
  ds = ds.batch(batch_size)
  # NOTE(review): prefetch() follows batch(), so its buffer counts batches, not
  # rows; up to `batch_size` batches are buffered. Kept as in the original.
  ds = ds.prefetch(batch_size)
  return ds

def apply_models(data, melt=False):
  """
  Apply every loaded model to `data` and return the predictions.

  Relies on the module-level model objects (`xgb_model`, `xgb_mc_model`,
  `grb_model`, `rf_model`, `lm_model`, `svr_model`, `sk_voting_model`,
  `tf_model`) and on `df_to_dataset`.

  Parameters
  ----------
  data : pandas.DataFrame
    Rows to predict on. The frame is passed unchanged to each model's
    `predict`, so any extra columns it carries are passed along too.
    NOTE(review): presumably each model selects the feature columns it
    needs — confirm against how the models were built.
  melt : bool, default False
    If true, pivot the result to long format with the seven feature columns
    as identifiers, a 'model' column and a 'value' column.

  Returns
  -------
  pandas.DataFrame
    A copy of `data` with one added column per model ('xgb', 'xgb_mc', 'grb',
    'rf', 'lm', 'svr', 'voting', 'tf'), or its long-format version when
    `melt` is true.
  """
  df_ml = data.copy()

  # Models that predict directly on the data frame; the order here fixes the
  # order of the prediction columns in the result.
  frame_models = (
      ('xgb', xgb_model),
      ('xgb_mc', xgb_mc_model),
      ('grb', grb_model),
      ('rf', rf_model),
      ('lm', lm_model),
      ('svr', svr_model),
      ('voting', sk_voting_model),
  )
  for column, model in frame_models:
    df_ml[column] = model.predict(data)

  # df_to_dataset only reads the frame, so no defensive copy is needed.
  df_tf = df_to_dataset(data, shuffle=False, batch_size=256)
  # Flatten the prediction array so a 1-D column is assigned explicitly.
  # NOTE(review): assumes the model emits one value per row — confirm.
  df_ml['tf'] = np.ravel(tf_model.predict(df_tf))

  if melt:
    return pd.melt(
        df_ml,
        id_vars=['nace', 'esize_class', 'gender', 'age_class', 'lpk', 'education', 'experience'],
        var_name='model'
    )

  return df_ml

# Apply models to survey data

In [10]:
# Add one prediction column per model to the survey data (wide format, melt=False)
# and preview the first rows.
data_with_predictions = apply_models(data, melt=False)
data_with_predictions.head()



Unnamed: 0,nace,esize_class,gender,age_class,lpk,education,experience,target,xgb,xgb_mc,grb,rf,lm,svr,voting,tf
0,C,1_49,M,40-49,p721,G2,13,8.2,4.503678,4.408886,4.17635,4.778167,4.061958,4.009178,4.390707,4.049001
1,C,1_49,F,40-49,p334,G2,0,2.51,3.051344,3.037208,2.980969,3.041897,2.992907,3.122998,3.26677,3.292751
2,M,50_249,F,40-49,p522,G2,18,2.19,3.425669,3.228116,3.172908,3.256087,3.945913,2.907824,3.278419,3.245212
3,M,50_249,F,40-49,p522,G2,12,2.19,3.298834,3.212965,3.066702,3.219982,3.849712,2.80248,3.200731,3.176937
4,M,50_249,F,14-29,p522,G2,0,2.19,2.991804,3.021976,2.920913,2.993366,3.531646,2.741929,3.123216,2.880577


In [12]:
# Save the survey rows together with all model predictions; index=False leaves
# the frame's row index out of the file.
data_with_predictions.to_csv('../Data/LT_DU_ML_results.csv', index=False)

# Apply models to all combinations of values

## Unique categorical feature values

In [13]:
# Candidate values per categorical feature, used as the default arguments of
# combination_of_feature_values().
# nace and lpk: sorted distinct values found in the survey data.
nace = np.sort(data['nace'].unique())
# gender: distinct values in order of first appearance (not sorted).
gender = data['gender'].unique()
# esize_class, age_class, education: fixed, hand-written level lists.
esize_class = np.array(['1_49', '50_249','GT_250'], dtype=object)
age_class = np.array(['14-29','30-39', '40-49', '50-59','60+'], dtype=object)
education = np.array([ 'G1', 'G2', 'G3', 'G4'], dtype=object)
lpk = np.sort(data['lpk'].unique())

In [15]:
def combination_of_feature_values(nace=nace,
                                esize_class=esize_class,
                                gender=gender,
                                age_class=age_class,
                                lpk=lpk,
                                education=education,
                                experience=[3]):
    """
    Build a data frame holding every combination of the given feature values.

    The defaults for the categorical features are the module-level arrays as
    they exist when this function is defined. The default `experience` list is
    never modified inside the function.

    Parameters
    ----------
    nace, esize_class, gender, age_class, lpk, education, experience : iterable or scalar
        Values to combine for each feature. A bare scalar (for example a
        single string or number) is treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        One row per combination, with columns 'nace', 'esize_class', 'gender',
        'age_class', 'lpk', 'education', 'experience'. Rows follow
        `itertools.product` order, so `experience` varies fastest.
    """
    columns = ['nace', 'esize_class', 'gender', 'age_class', 'lpk', 'education', 'experience']

    # Wrap bare scalars so a string is not split into characters and a number
    # does not raise when passed to itertools.product.
    feature_values = [
        [values] if np.isscalar(values) else values
        for values in (nace, esize_class, gender, age_class, lpk, education, experience)
    ]

    return pd.DataFrame(list(itertools.product(*feature_values)), columns=columns)

In [21]:
# Build the grid of all feature combinations for four experience values; the
# other features use the function's default value lists.
combination_of_values = combination_of_feature_values(experience=[0, 2, 5, 20])

In [22]:
# Predict with every model on the grid and return long format (melt=True): one
# row per feature combination and model, as shown in the preview.
combination_of_values_with_predictions = apply_models(combination_of_values, melt=True)
combination_of_values_with_predictions.head()



Unnamed: 0,nace,esize_class,gender,age_class,lpk,education,experience,model,value
0,B,1_49,M,14-29,p111,G1,0,xgb,5.017599
1,B,1_49,M,14-29,p111,G1,2,xgb,5.196927
2,B,1_49,M,14-29,p111,G1,5,xgb,5.08733
3,B,1_49,M,14-29,p111,G1,20,xgb,5.074407
4,B,1_49,M,14-29,p111,G2,0,xgb,4.862865


In [25]:
# Convert the long-format predictions to an Arrow table for writing to Parquet.
table = pa.Table.from_pandas(combination_of_values_with_predictions)

In [27]:
# Write the Arrow table of all grid predictions to a Parquet file.
pq.write_table(table,'../Data/LT_DU_all_feature_combinations_all_model_predictions.parquet')