# ENIGMA TECH TATVA 2022 - KAGGLE COMPETITIONS
### PERSONAL SUBMISSION
- [1. DOWNLOAD DATA](#1)
- [2. CREATE TRAIN/VAL DATASET](#2)
- [3. DATA NORMALIZATION](#3)
- [4. ONE-HOT ENCODING](#4)
- [5. CREATE DATASET (TensorFlow)](#5)
- [6. CREATE PERSONAL CALLBACK](#6)
- [7. BUILD AND COMPILE THE MODEL](#7)
- [8. FIT THE MODEL](#8)
- [9. SAVE THE BEST RESULT](#9)
- [10. GENERATE ASSIGNMENT](#10)

In [1]:
# Install this package to use Colab's GPU for training.
# The libcudnn8 version is pinned to 8.4.1.50-1+cuda11.6; the
# --allow-change-held-packages flag lets apt change the package even though
# it is marked as held on the machine.
!apt install --allow-change-held-packages libcudnn8=8.4.1.50-1+cuda11.6

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following packages will be REMOVED:
  libcudnn8-dev
The following held packages will be changed:
  libcudnn8
The following packages will be upgraded:
  libcudnn8
1 upgraded, 0 newly installed, 1 to remove and 10 not upgraded.
Need to get 420 MB of archives.
After this operation, 1,622 MB disk space will be freed.
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  libcudnn8 8.4.1.50-1+cuda11.6 [420 MB]
Fetched 420 MB in 13s (32.9 MB/s)
(Reading database ... 123934 files and directories currently installed.)
Removing libcudnn8-dev (8.1.1.33-1+cuda11.2) ...
update-alternatives: removing manually selected alternative - switching libcudnn to auto mode
(Reading database ... 123911 files and directories currently installed.)
Prepar

## You can also download the train.csv file from:
https://www.kaggle.com/competitions/enigma-tech-tatva-2022/data

<a name='1'></a>
## 1. DOWNLOAD DATA

In [2]:
# Mount the personal Google Drive at /content/drive. The CSV files used below
# are read from (and the submission file is written to) paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import csv

# Column-wise container for the training data. Cells are matched to keys by
# POSITION, so the key order must mirror the column order of train.csv.
columns = {'carat': [], 'cut': [], 'color': [], 'clarity': [], 'depth': [],
           'table': [], 'x': [], 'y': [], 'z': [], 'price': []}

# newline='' is what the csv module documentation asks for when a file is handed
# to csv.reader, so that line breaks inside quoted fields are parsed correctly.
with open('/content/drive/MyDrive/Colab Notebooks/train.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')  # Initialize reader
    next(reader)                                 # Skip the header line
    for row in reader:
        if not row:                              # Blank lines are returned as [] -> ignore them
            continue
        # Append the cell at each position to the column list at the same
        # position. Iterating the dict view avoids rebuilding the key list for
        # every single cell; a row that is too short still raises IndexError.
        for position, values in enumerate(columns.values()):
            values.append(row[position])

In [4]:
import pandas as pd
import numpy as np

# Turn the 'columns' dictionary into a DataFrame (the values come straight from
# csv.reader, i.e. they are strings) and cast the measurement columns plus the
# target to float64. 'cut', 'color' and 'clarity' are not cast.
df = pd.DataFrame.from_dict(columns).astype(
    {name: np.float64
     for name in ("carat", "depth", "table", "x", "y", "z", "price")}
)

<a name='2'></a>
## 2. CREATE TRAIN/VAL DATASET

In [5]:
n = len(df)                          # Total number of rows
num_features = df.shape[1]           # Number of columns so far (the 'price' column is included)
train_df = df.iloc[:int(n*0.8)]      # First 80% of the rows -> training set
val_df = df.iloc[int(n*0.8):]        # Remaining 20% of the rows -> validation set
df.head()                            # Display the first five rows of data

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.544971,Ideal,E,SI1,63.911017,52.575829,5.447452,4.053076,3.701485,3.558
1,2.073682,Premium,J,SI1,67.642925,57.660288,6.266665,7.661507,6.177051,24.738
2,0.606198,Premium,G,VS1,55.61731,60.388787,6.800151,5.096966,3.480326,3.63
3,0.695397,Premium,G,SI2,60.587811,71.325721,5.366931,6.616767,4.454435,4.218
4,0.282651,Ideal,E,VVS2,68.970056,54.372123,4.920104,4.409408,4.361358,1.578


<a name='3'></a>
## 3. DATA NORMALIZATION

In [10]:
# Numeric part of the training data; only its column names are used below
df_num = train_df.select_dtypes('number')

# Per-column minimum / maximum, computed on the TRAINING split only
train_min = train_df.min(numeric_only=True)
train_max = train_df.max(numeric_only=True)

# Remember the price bounds: they are needed later to de-normalize predictions
min_price = train_min['price']
max_price = train_max['price']

# Min-max scale the numeric columns of both splits with the training statistics
# and put the untouched categorical columns back next to them
norm_train = (
    train_df[df_num.columns]
    .sub(train_min)
    .div(train_max - train_min)
    .join(train_df[['cut', 'color', 'clarity']])
)
norm_val = (
    val_df[df_num.columns]
    .sub(train_min)
    .div(train_max - train_min)
    .join(val_df[['cut', 'color', 'clarity']])
)

# Check the first rows of the normalized training DataFrame
norm_train.head()

Unnamed: 0,carat,depth,table,x,y,z,price,cut,color,clarity
0,0.118571,0.583269,0.325378,0.470349,0.30098,0.461505,0.078672,Ideal,E,SI1
1,0.464025,0.673428,0.431509,0.546659,0.667567,0.77282,0.652066,Premium,J,SI1
2,0.132407,0.382902,0.488462,0.596354,0.407031,0.433694,0.080622,Premium,G,VS1
3,0.152564,0.502984,0.716755,0.462849,0.56143,0.556193,0.09654,Premium,G,SI2
4,0.059293,0.705491,0.362873,0.421227,0.33718,0.544488,0.025069,Ideal,E,VVS2


<a name='4'></a>
## 4. ONE-HOT ENCODING

In [11]:
# One-hot encode the string columns ('cut', 'color', 'clarity') with
# "get_dummies" and drop the target from the feature frames.
# The indicator dtype is given explicitly: the default differs between pandas
# releases, and a float dtype keeps every feature column numeric so the frame
# converts cleanly to a single tensor.
train_features = pd.get_dummies(norm_train, dtype=np.float64).drop('price', axis=1)
val_features = pd.get_dummies(norm_val, dtype=np.float64).drop('price', axis=1)

# Both splits are encoded independently, so a category that never occurs in the
# validation rows would produce fewer columns than the training frame.
# Force the validation frame onto the training layout (same columns, same
# order); indicators that were absent are filled with 0.
val_features = val_features.reindex(columns=train_features.columns, fill_value=0.0)

# Separate the (normalized) price column and keep it as the labels
train_labels = norm_train['price']
val_labels = norm_val['price']

<a name='5'></a>
## 5. CREATE DATASET (TensorFlow)

In [14]:
import tensorflow as tf

# Build each input pipeline in one chain:
#   (features, labels with an extra axis at position 1) -> batches of 32 -> prefetch
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_features, tf.expand_dims(train_labels, axis=1)))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_features, tf.expand_dims(val_labels, axis=1)))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

<a name='6'></a>
## 6. CREATE PERSONAL CALLBACK

In [16]:
def Personal_callback(model_name, metrics, threshold_metric, ep, lr_i=0.001):
    """Generate the list of callbacks used in the training process.

    Args:
        model_name (str): Name of the model; used as the sub-directory of
            ``checkpoint_path/`` where the checkpoints are written.
        metrics (str): Name of the metric to monitor as it appears in the
            Keras logs (e.g. ``'mae'``). The validation counterpart is looked
            up as ``'val_' + metrics``.
        threshold_metric (float): Training is cancelled once BOTH the training
            and the validation value of ``metrics`` are below this threshold.
        ep (int): Number of epochs of the training process; it sets the slope
            of the exponential learning-rate decay.
        lr_i (float): Initial learning rate for the decay schedule.

    Returns:
        list: ``[stop_train, lr_decay, reduce_lr, early_stopping, chk_train,
        chk_val]`` ready to be passed to ``model.fit(callbacks=...)``.
    """

    # ---------------------------------------------------------------
    # Stop training
    # ---------------------------------------------------------------
    class stop_training(tf.keras.callbacks.Callback):
        """Cancel training when train and validation metric are under the threshold."""

        def on_epoch_end(self, epoch, logs=None):
            logs = logs or {}
            train_value = logs.get(metrics)
            val_value = logs.get('val_' + metrics)
            # A metric can be missing from the logs (e.g. no validation data
            # was given to fit); comparing None with a number would raise.
            if train_value is None or val_value is None:
                return
            if train_value < threshold_metric and val_value < threshold_metric:
                print("\nCancelling training!")
                self.model.stop_training = True     # Ask Keras to stop the training loop
    stop_train = stop_training()

    # ---------------------------------------------------------------
    # Learning Rate Decay
    # ---------------------------------------------------------------
    global LR_init        # Module-level reference value shared with the callback below
    LR_init = lr_i        # Start from the requested initial learning rate

    class learning_decay(tf.keras.callbacks.Callback):
        """Lower the optimizer learning rate at the end of every epoch."""

        def on_epoch_end(self, epoch, logs=None):
            global LR_init
            lr = self.model.optimizer.lr                                      # Current learning rate of the model
            new_lr = (LR_init) * 10.**(-(epoch+1.)/(ep*10))                   # Exponential decay of the reference value
            if lr > new_lr:                                                   # Only ever lower the learning rate
                tf.keras.backend.set_value(self.model.optimizer.lr, new_lr)   # Update the learning rate
            else:
                # The rate is already below the schedule (the ReduceLROnPlateau
                # callback also changes it), so it becomes the new reference.
                # NOTE(review): `lr` is whatever object `optimizer.lr` returns,
                # not a float copy. If that is the live optimizer variable,
                # LR_init tracks the current rate from here on and the decay
                # compounds epoch after epoch - confirm this is intended.
                LR_init = lr

    lr_decay = learning_decay()

    # ---------------------------------------------------------------
    # Reduce Learning Rate
    # ---------------------------------------------------------------
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor=metrics,    # Metric to watch
                                                     factor=0.2,         # new_lr = lr * factor
                                                     patience=2,         # Epochs without improvement before the rate is reduced
                                                     verbose=1,          # Print a message when the rate goes down
                                                     min_lr=1e-15)       # Lower bound on the learning rate

    # ---------------------------------------------------------------
    # Early Stopping
    # ---------------------------------------------------------------
    # NOTE(review): this watches `metrics` (the training metric), not
    # 'val_' + metrics - confirm which one is wanted.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=metrics,               # Metric to watch
                                                      patience=10,                   # Stop after 10 epochs without improvement
                                                      min_delta=0.00001,             # Minimum change that counts as an improvement
                                                      restore_best_weights=False,    # Best weights are saved by the ModelCheckpoint callbacks instead
                                                      verbose=1)                     # Display a message

    # ---------------------------------------------------------------
    # Model Checkpoint - Train metric
    # ---------------------------------------------------------------
    chk_train_path = "checkpoint_path/"+ model_name +"/train/chk_train.ckpt"
    chk_train = tf.keras.callbacks.ModelCheckpoint(filepath=chk_train_path,
                                                   monitor=metrics,         # Keep the weights with the best training metric
                                                   save_weights_only=True,  # Weights only (False would save the entire model)
                                                   save_best_only=True,     # Overwrite only when the monitored metric improves
                                                   save_freq="epoch",       # Check at the end of every epoch
                                                   verbose=0)               # Do not print when a checkpoint is saved

    # ---------------------------------------------------------------
    # Model Checkpoint - Validation metric
    # ---------------------------------------------------------------
    chk_val_path = "checkpoint_path/"+ model_name +"/val/chk_val.ckpt"
    chk_val = tf.keras.callbacks.ModelCheckpoint(filepath=chk_val_path,
                                                 monitor='val_'+metrics,   # Keep the weights with the best validation metric
                                                 save_weights_only=True,   # Weights only (False would save the entire model)
                                                 save_best_only=True,      # Overwrite only when the monitored metric improves
                                                 save_freq="epoch",        # Check at the end of every epoch
                                                 verbose=0)                # Do not print when a checkpoint is saved

    return [stop_train, lr_decay, reduce_lr, early_stopping, chk_train, chk_val]

<a name='7'></a>
## 7. BUILD AND COMPILE THE MODEL

In [29]:
#BUILD THE MODEL (layers are appended one at a time, in forward order)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(26,)))                                                # Input: 26 features per sample
model.add(tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=1)))                       # Insert an axis at position 1
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(182, return_sequences=True)))   # Bidirectional LSTM, 182 units
model.add(tf.keras.layers.Dropout(0.4))                                                      # Dropout to reduce overfitting
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)))   # Bidirectional LSTM, 150 units
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)))   # Bidirectional LSTM, 128 units
model.add(tf.keras.layers.Dense(units=32, activation="relu"))                                # Dense layer
model.add(tf.keras.layers.Dense(units=16, activation="relu"))                                # Dense layer
model.add(tf.keras.layers.Dense(units=1, activation="relu"))                                 # Dense OUTPUT layer (single value)
model.add(tf.keras.layers.Reshape([1, -1]))                                                  # Reshape the output

#COMPILE THE MODEL: Adam optimizer, Mean Squared Error loss, MAE reported as metric
model.compile(optimizer=tf.optimizers.Adam(),
              loss='mse',
              metrics=['mae'])

#KEEP A COPY OF THE INITIAL WEIGHTS (lets later cells restart training from the same point)
InitialW = model.get_weights()

<a name='8'></a>
## 8. FIT THE MODEL

In [30]:
# Start from the weights captured right after the model was built
model.set_weights(InitialW)

# Put the optimizer's learning rate back to its starting value
tf.keras.backend.set_value(model.optimizer.lr, 0.001)

# Train the model; the personal callbacks are created for this run
history = model.fit(
    x=train_dataset,
    validation_data=valid_dataset,
    epochs=100,
    callbacks=Personal_callback(
        model_name='model/FIT_1',
        metrics='mae',
        threshold_metric=0.02,
        ep=100,
    ),
)

# Keep the weights reached by this run (use them when training again)
FIT_1 = model.get_weights()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 20: ReduceLROnPlateau reducing learning rate to 0.00019099852070212365.
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 27: ReduceLROnPlateau reducing learning rate to 2.723077195696533e-05.
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 34: ReduceLROnPlateau reducing learning rate to 3.30437978846021e-06.
Epoch 35/100
Epoch 36/100
Epoch 36: ReduceLROnPlateau reducing learning rate to 5.61202978133224e-07.
Epoch 37/100
Epoch 38/100
Epoch 38: ReduceLROnPlateau reducing learning rate to 9.44386897572258e-08.
Epoch 39/100
Epoch 40/100
Epoch 40: ReduceLROnPlateau reducing learning rate to 1.5746351778034295e-08.
Epoch 41/100
Epoch 42/100
Ep

<a name='9'></a>
## 9. SAVE THE BEST RESULT

In [41]:
#Clone the model architecture (the trained weights are loaded from the checkpoint below)
model_load = tf.keras.models.clone_model(model)

#Re-define: Compile Model (same loss / optimizer / metric as the original model)
model_load.compile(loss='mse',
                  optimizer=tf.optimizers.Adam(),
                  metrics=['mae'])

#Load the weights from the best results in the train metrics
model_load.load_weights("checkpoint_path/model/FIT_1/train/chk_train.ckpt")

#Evaluate the model with the validation dataset.
#valid_dataset is already batched (32), and Keras documents that `batch_size`
#must not be given when the input is a tf.data.Dataset, so it is not passed.
load_evaluate=model_load.evaluate(valid_dataset)



In [51]:
#Evaluate the model with the training dataset.
#train_dataset is already batched (32), and Keras documents that `batch_size`
#must not be given when the input is a tf.data.Dataset, so it is not passed.
load_evaluate=model_load.evaluate(train_dataset)



<a name='10'></a>
## 10. GENERATE ASSIGNMENT

In [33]:
import csv

# Column-wise container for the test data (no 'price' column here). Cells are
# matched to keys by POSITION, so the key order must mirror the column order of
# test.csv.
columns = {'carat': [], 'cut': [], 'color': [], 'clarity': [], 'depth': [],
           'table': [], 'x': [], 'y': [], 'z': []}

# newline='' is what the csv module documentation asks for when a file is handed
# to csv.reader, so that line breaks inside quoted fields are parsed correctly.
with open('/content/drive/MyDrive/Colab Notebooks/test.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')      # Initialize reader
    next(reader)                                     # Skip the header line
    for row in reader:
        if not row:                                  # Blank lines are returned as [] -> ignore them
            continue
        # Append the cell at each position to the column list at the same
        # position. Iterating the dict view avoids rebuilding the key list for
        # every single cell; a row that is too short still raises IndexError.
        for position, values in enumerate(columns.values()):
            values.append(row[position])

In [34]:
# Turn the 'columns' dictionary of the test file into a DataFrame and cast the
# measurement columns to float64; 'cut', 'color' and 'clarity' are not cast.
subm = pd.DataFrame.from_dict(columns).astype(
    {name: np.float64
     for name in ("carat", "depth", "table", "x", "y", "z")}
)

In [42]:
#Numeric columns of the training data; the last one ('price') is skipped below
df_num = train_df.select_dtypes('number')

#Min / max of the training features (the 'price' column is dropped because the
#test data has no such column)
train_max = train_df.drop('price', axis=1).max(numeric_only=True)
train_min = train_df.drop('price', axis=1).min(numeric_only=True)

#Normalize the test data with the min / max values of the TRAINING dataset
norm_subm = (subm[df_num.columns[:-1]] - train_min) / (train_max-train_min)

#Put the categorical columns back next to the normalized numeric ones
norm_subm = norm_subm.join(subm[['cut','color','clarity']])

#One-hot encode the string columns. The indicator dtype is given explicitly so
#every feature column is numeric regardless of the pandas version.
subm_features = pd.get_dummies(norm_subm, dtype=np.float64)

#The test file is encoded independently of the training data, so its columns
#could be fewer or in a different order. Force the exact layout the model was
#trained on; indicators that were absent are filled with 0.
subm_features = subm_features.reindex(columns=train_features.columns, fill_value=0.0)

# Turn our data into TensorFlow Datasets
subm_dataset = tf.data.Dataset.from_tensor_slices(subm_features)
subm_dataset = subm_dataset.batch(1, drop_remainder=True).prefetch(1)

#Predict the target values using the previous test dataset and the loaded model
x=model_load.predict(subm_dataset)



In [43]:
#Flatten the predictions to one value per row. reshape(-1) is used instead of
#squeeze(): with a single prediction squeeze() returns a 0-d array and the
#len() call below would fail, while reshape(-1) always gives a 1-D array.
x=np.reshape(x, -1)

#De-normalize the output values (using the min/max 'price' values in the training dataset)
subm_prices=x*(max_price-min_price)+min_price

#Row ids: 0, 1, ..., number of predictions - 1
id_subm=np.arange(start=0, stop=len(subm_prices))

#Define a dictionary with the corresponding values
dict_subm={'id':id_subm, 'price':subm_prices}

#Convert the dictionary to DataFrame
df_subm = pd.DataFrame.from_dict(dict_subm)

#Set the index column
df_subm=df_subm.set_index('id')

#Save the DataFrame as a zipped CSV. The member name inside the archive is set
#explicitly; without it pandas derives the name from the archive path, which
#does not give a '.csv' file name.
df_subm.to_csv('/content/drive/MyDrive/Colab Notebooks/subm.zip',
               compression={'method': 'zip', 'archive_name': 'subm.csv'})