In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,plot_roc_curve,confusion_matrix
from normalize_data import normalize_data
from sklearn.model_selection import StratifiedKFold
from kerastuner.tuners import RandomSearch

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [4]:
df = normalize_data(df)
df_test = normalize_data(df_test)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21765 entries, 0 to 22004
Data columns (total 26 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   job_level                              21765 non-null  object 
 1   job_duration_in_current_job_level      21765 non-null  float64
 2   person_level                           21765 non-null  object 
 3   job_duration_in_current_person_level   21765 non-null  float64
 4   job_duration_in_current_branch         21765 non-null  float64
 5   Employee_type                          21765 non-null  object 
 6   gender                                 21765 non-null  object 
 7   age                                    21765 non-null  int64  
 8   marital_status_maried(Y/N)             21765 non-null  object 
 9   number_of_dependences                  21765 non-null  int64  
 10  Education_level                        21765 non-null  object 
 11  GP

In [6]:
col = df.pop("Best Performance")
df.insert(25,col.name,col)
obj_columns_train = df.select_dtypes(['object']).columns
df[obj_columns_train] = df[obj_columns_train].astype('category')
#merubah category menjadi int column
cat_columns_train = df.select_dtypes(['category']).columns
df[cat_columns_train] = df[cat_columns_train].apply(lambda x:x.cat.codes)
#menaruh kolum Best Performance ke index terakhir 
df[cat_columns_train]

Unnamed: 0,job_level,person_level,Employee_type,gender,marital_status_maried(Y/N),Education_level,achievement_target_1,achievement_target_2,achievement_target_3
0,2,5,0,0,1,4,2,3,1
1,1,3,0,1,1,4,2,2,1
2,1,2,0,1,1,5,1,3,0
3,1,3,0,1,1,3,2,2,0
4,1,2,0,1,1,4,3,4,0
...,...,...,...,...,...,...,...,...,...
22000,1,2,1,1,1,4,2,2,1
22001,0,0,1,1,0,4,2,3,1
22002,1,2,0,1,1,4,2,2,1
22003,1,2,0,1,1,4,1,2,0


In [7]:
obj_columns_test = df_test.select_dtypes(['object']).columns
df_test[obj_columns_test] = df_test[obj_columns_test].astype('category')
#merubah category menjadi int column
cat_columns_test = df_test.select_dtypes(['category']).columns
df_test[cat_columns_test] = df_test[cat_columns_test].apply(lambda x:x.cat.codes)
df_test[cat_columns_test]

Unnamed: 0,job_level,person_level,Employee_type,gender,marital_status_maried(Y/N),Education_level,achievement_target_1,achievement_target_2,achievement_target_3
0,2,4,2,0,1,3,2,3,1
1,1,2,0,1,0,4,3,3,1
2,1,2,0,1,1,4,2,3,1
3,2,5,0,1,1,4,2,3,0
4,1,2,0,0,1,3,2,4,1
...,...,...,...,...,...,...,...,...,...
5995,0,0,1,1,0,4,2,3,0
5996,1,2,1,1,1,3,2,3,1
5997,1,2,0,1,1,4,1,1,1
5998,1,2,0,1,1,4,1,2,0


In [8]:
train= reduce_mem_usage(df)
test = reduce_mem_usage(df_test)
print("Shape of train set: ",train.shape)
print("Shape of train set: ",test.shape)

Mem. usage decreased to  0.89 Mb (71.9% reduction)
Mem. usage decreased to  0.23 Mb (71.7% reduction)
Shape of train set:  (21765, 26)
Shape of train set:  (6000, 25)


In [9]:
scaler = MinMaxScaler()
data = train.values
ix = [i for i in range(data.shape[1]) if i != 25]
X, y = data[:, ix], data[:, 25]
X = scaler.fit_transform(X)

In [10]:
cat_columns_train = cat_columns_train.to_list()
cat_columns_test = cat_columns_test.to_list()

In [11]:
def build_model(hp):
    model = keras.Sequential()
    #Hidden layers 2- 20
    for i in range(hp.Int('num_layers',2,100)):
        model.add(layers.Dense(units=hp.Int('units_' + str(i),min_value=32, max_value=512, step=32),activation='relu'))
        model.add(layers.Dense(1, activation = "sigmoid"))
        model.compile(optimizer=keras.optimizers.Adam(hp.Choice('learning_rate',[1e-2,1e-3,1e-4,1e-5,1e-6])), loss='auc', metrics=['auc'])
    return model

In [12]:
tuner = RandomSearch(build_model, objective='val_accuracy', max_trials=5, executions_per_trial=3,directory="Project People Analytics", project_name="BRI People Analytics")

INFO:tensorflow:Reloading Oracle from existing project Project People Analytics\BRI People Analytics\oracle.json


In [13]:
tuner.search_space_summary()

Search space summary
Default search space size: 4
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 100, 'step': 1, 'sampling': None}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': None}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001, 1e-05, 1e-06], 'ordered': True}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': None}


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state = 0)

In [17]:
tuner.search(X_train, y_train, epochs=250, validation_data=(X_test,y_test))


Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
num_layers        |82                |?                 
units_0           |416               |?                 
learning_rate     |0.001             |?                 
units_1           |128               |?                 

Epoch 1/250


ValueError: in user code:

    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:747 train_step
        y_pred = self(x, training=True)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\sequential.py:372 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\functional.py:386 call
        inputs, training=training, mask=mask)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\functional.py:508 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:976 __call__
        self.name)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:216 assert_input_compatibility
        ' but received input with shape ' + str(shape))

    ValueError: Input 0 of layer dense is incompatible with the layer: expected axis -1 of input shape to have value 625 but received input with shape [None, 25]


In [None]:
tuner.results_summary()

In [16]:
# folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=31416)
# target=train['Best Performance']
# features= [c for c in train.columns if c not in ['Best Performance']]
# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
#     print("Fold {}".format(fold_))
#     X_trn_data, y_trn_data = train.iloc[trn_idx][features], target.iloc[trn_idx],
#     X_val_data, y_val_data = train.iloc[val_idx][features], target.iloc[val_idx]
    

Fold 0

Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
num_layers        |82                |?                 
units_0           |416               |?                 
learning_rate     |0.001             |?                 
units_1           |128               |?                 

Epoch 1/100


ValueError: in user code:

    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:747 train_step
        y_pred = self(x, training=True)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\sequential.py:372 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\functional.py:386 call
        inputs, training=training, mask=mask)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\functional.py:508 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:976 __call__
        self.name)
    C:\Users\Asus\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:216 assert_input_compatibility
        ' but received input with shape ' + str(shape))

    ValueError: Input 0 of layer dense is incompatible with the layer: expected axis -1 of input shape to have value 625 but received input with shape [None, 25]
