### Starting from the preprocessed data, we scale continuous features and separate out categorical and continuous features. Then we split the data into training and test sets.

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


def process_data(file_path, scaler=None, fit_scaler=False, drop_columns=None):
    
    
    # Load the dataset in chunks
    chunk_size = 50000
    chunks = pd.read_csv(file_path, chunksize=chunk_size)

    # Concatenate chunks into a single dataframe
    data = pd.concat([chunk for chunk in chunks], axis=0)
    original_data = data.copy()  # Preserve original data for key retention
    
    # print column names
    print(f"Data columns b4 processing: {data.columns}")

    if drop_columns:
        data = data.drop(columns=drop_columns)

    continuous_features = [
        'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
        'distance_km', 'pickup_distance_to_JFK_Airport', 'pickup_distance_to_LaGuardia_Airport',
        'pickup_distance_to_Central_Park', 'pickup_distance_to_Times_Square',
        'dropoff_distance_to_JFK_Airport', 'dropoff_distance_to_LaGuardia_Airport',
        'dropoff_distance_to_Central_Park', 'dropoff_distance_to_Times_Square', 'dropoff_lat_lon_interaction',
        'pickup_lat_lon_interaction'
    ]

    # Scale continuous features
    continuous_data = data[continuous_features]
    if fit_scaler:
        scaler.fit(continuous_data)
    continuous_data = scaler.transform(continuous_data)

    # Extract categorical features excluding non-numeric ones
    categorical_features = ['is_peak_hour', 'day_of_week', 'month', 'part_of_day', 'day', 'passenger_count', 'year', 'is_weekend', 'hour']

    # Cast categorical features to int64 IF they are not already
    for cat in categorical_features:
        if data[cat].dtype != 'int64':
            data[cat] = data[cat].astype('int64')
    categorical_data = data[categorical_features].values

    # Offset categorical features to ensure they are non-negative
    for i, cat in enumerate(categorical_features):
        min_val = min(0, data[cat].min())
        categorical_data[:, i] -= min_val


    if 'fare_amount' in data.columns:
        y = data['fare_amount']
        return original_data, continuous_data, categorical_data, y
    else:
        return original_data, continuous_data, categorical_data

# Initialize the scaler
scaler = StandardScaler()

# Process training dataset
original_train_data, X_train_continuous, X_train_categorical, y_train = process_data('train_fe_no_IQR.csv', scaler, fit_scaler=True)

# Load the test data for testing the model
original_data, X_test_continuous, X_test_categorical = process_data('test_fe.csv', scaler, fit_scaler=True)

# Splitting the processed data into training and validation sets
X_train_continuous, X_val_continuous, X_train_categorical, X_val_categorical, y_train, y_val = train_test_split(
    X_train_continuous, X_train_categorical, y_train, test_size=0.2, random_state=42
)

###############################################################################################################
continuous_features = [
        'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
        'distance_km', 'pickup_distance_to_JFK_Airport', 'pickup_distance_to_LaGuardia_Airport',
        'pickup_distance_to_Central_Park', 'pickup_distance_to_Times_Square',
        'dropoff_distance_to_JFK_Airport', 'dropoff_distance_to_LaGuardia_Airport',
        'dropoff_distance_to_Central_Park', 'dropoff_distance_to_Times_Square', 'dropoff_lat_lon_interaction',
        'pickup_lat_lon_interaction'
    ]
categorical_features = ['is_peak_hour', 'day_of_week', 'month', 'part_of_day', 'day', 'passenger_count', 'year', 'is_weekend', 'hour']



Data columns b4 processing: Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'distance_km', 'year', 'month', 'day', 'hour',
       'day_of_week', 'is_weekend', 'is_peak_hour', 'pickup_cluster',
       'dropoff_cluster', 'pickup_distance_to_JFK_Airport',
       'dropoff_distance_to_JFK_Airport',
       'pickup_distance_to_LaGuardia_Airport',
       'dropoff_distance_to_LaGuardia_Airport',
       'pickup_distance_to_Central_Park', 'dropoff_distance_to_Central_Park',
       'pickup_distance_to_Times_Square', 'dropoff_distance_to_Times_Square',
       'pickup_lat_lon_interaction', 'dropoff_lat_lon_interaction',
       'part_of_day'],
      dtype='object')
Data columns b4 processing: Index(['key', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'distance_km', 'year', 'month', 'day', 'hour', 'day_of

### Neural Network Training and Evaluation:

- Next, we build an FNN with three hidden layers using Keras. We use the Adam optimizer and mean squared error (MSE) as the loss function.

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, concatenate, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, LearningRateScheduler
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K
from sklearn.metrics import mean_squared_error

def lr_schedule(epoch, lr):
    # Reducing the learning rate every 10 epochs
    if epoch % 15 == 0 and epoch != 0:
        lr = lr / 2
    return lr

# RMSE as the custom loss function
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))


# Function to train the model
def train_model(architecture_type):
    
    # Define embedding input for categorical features
    
    embedding_inputs = []
    embeddings = []
    for cat in range(X_train_categorical.shape[1]):
        vocab_size = int(np.max(X_train_categorical[:, cat]) + 1)
        embed_dim = min(50, int(np.ceil(vocab_size / 2)))  # Set embedding size based on the heuristic
        inp = Input(shape=(1,))
        emb = Embedding(vocab_size, embed_dim, input_length=1)(inp)
        flat = Flatten()(emb)
        embedding_inputs.append(inp)
        embeddings.append(flat)

    # Continuous input
    continuous_input = Input(shape=(X_train_continuous.shape[1],))
    embedding_inputs.append(continuous_input)
    
    x = concatenate(embeddings + [continuous_input])

    if architecture_type == 'sigmoid_64_neurons_1_layer':
        x = Dense(64, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
    elif architecture_type == 'sigmoid_128_neurons_1_layer':
        x = Dense(128, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
    elif architecture_type == 'sigmoid_128_neurons_2_layers':
        x = Dense(128, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
        x = Dense(64, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
    elif architecture_type == 'sigmoid_2_layers':
        x = Dense(64, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
        x = Dense(32, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
    elif architecture_type == 'relu_2_layers':
        x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
        x = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(x)
    elif architecture_type == 'relu_1_layer':
        x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
    elif architecture_type == 'sigmoid_3_layers':
        x = Dense(64, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
        x = Dense(32, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
        x = Dense(16, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
    elif architecture_type == 'relu_3_layers':
        x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
        x = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(x)
        x = Dense(16, activation='relu', kernel_regularizer=l2(0.01))(x)
    elif architecture_type == 'relu_4_layers':
        x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
        x = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(x)
        x = Dense(16, activation='relu', kernel_regularizer=l2(0.01))(x)
        x = Dense(8, activation='relu', kernel_regularizer=l2(0.01))(x)
    elif architecture_type == 'sigmoid_4_layers':
        x = Dense(64, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
        x = Dense(32, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
        x = Dense(16, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
        x = Dense(8, activation='sigmoid', kernel_regularizer=l2(0.01))(x)

    x = BatchNormalization()(x)
    output = Dense(1)(x)

    model = Model(inputs=embedding_inputs, outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
    model.compile(optimizer=optimizer, loss=root_mean_squared_error)

    # Callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    checkpoint = ModelCheckpoint(f'best_model_{architecture_type}', monitor='val_loss', save_best_only=True, save_format='tf')
    csv_logger = CSVLogger(f'training_log_{architecture_type}.csv')
    lr_scheduler = LearningRateScheduler(lr_schedule) # Learning Rate Scheduler Callback
       

    model.fit(
    [X_train_categorical[:, i] for i in range(X_train_categorical.shape[1])] + [X_train_continuous],
    y_train, validation_data=(
        [X_val_categorical[:, i] for i in range(X_val_categorical.shape[1])] + [X_val_continuous], y_val),
    epochs=200, batch_size=512, callbacks=[early_stop, checkpoint, csv_logger, lr_scheduler]
        )
    
    # predict on validation data
    y_train_pred = model.predict([X_train_categorical[:, i] for i in range(X_train_categorical.shape[1])] + [X_train_continuous])
    y_val_pred = model.predict([X_val_categorical[:, i] for i in range(X_val_categorical.shape[1])] + [X_val_continuous])

    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    # Predict on test data
    y_test_pred = model.predict([X_test_categorical[:, i] for i in range(X_test_categorical.shape[1])] + [X_test_continuous])
    
    # Create a submission DataFrame with the required format using the preserved 'key' column
    submission = pd.DataFrame({ 'key': original_data['key'], 'fare_amount': y_test_pred.flatten()})
    # Save the submission file
    submission.to_csv(f'submissions_{architecture_type}.csv', index=False)
    print(f"Submission for {architecture_type} saved.")

    with open(f'rmse_log_{architecture_type}.txt', 'w') as f:
        f.write(f"Model {architecture_type}\n")
        f.write(f"Training RMSE: {train_rmse:.2f}\n")
        f.write(f"Validation RMSE: {val_rmse:.2f}\n")
    

# for arch in ['relu_1_layer', 'relu_2_layers', 'relu_3_layers', 'sigmoid_128_neurons_1_layer', 'sigmoid_128_neurons_2_layers', 'relu_4_layers', 'sigmoid_4_layers', 'sigmoid_3_layers']:
#     train_model(arch)

for arch in [ 'relu_2_layers', 'relu_1_layer']:
    train_model(arch)
    

Epoch 1/200


INFO:tensorflow:Assets written to: best_model_relu_2_layers\assets


Epoch 2/200


INFO:tensorflow:Assets written to: best_model_relu_2_layers\assets


Epoch 3/200


INFO:tensorflow:Assets written to: best_model_relu_2_layers\assets


Epoch 4/200


INFO:tensorflow:Assets written to: best_model_relu_2_layers\assets


Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Submission for relu_2_layers saved.
Epoch 1/200


INFO:tensorflow:Assets written to: best_model_relu_1_layer\assets


Epoch 2/200


INFO:tensorflow:Assets written to: best_model_relu_1_layer\assets


Epoch 3/200


INFO:tensorflow:Assets written to: best_model_relu_1_layer\assets


Epoch 4/200
Epoch 5/200


INFO:tensorflow:Assets written to: best_model_relu_1_layer\assets


Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Submission for relu_1_layer saved.


In [4]:
continuous_features = [
        'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
        'distance_km', 'pickup_distance_to_JFK_Airport', 'pickup_distance_to_LaGuardia_Airport',
        'pickup_distance_to_Central_Park', 'pickup_distance_to_Times_Square',
        'dropoff_distance_to_JFK_Airport', 'dropoff_distance_to_LaGuardia_Airport',
        'dropoff_distance_to_Central_Park', 'dropoff_distance_to_Times_Square', 'dropoff_lat_lon_interaction',
        'pickup_lat_lon_interaction'
    ]
categorical_features = ['is_peak_hour', 'day_of_week', 'month', 'part_of_day', 'day', 'passenger_count', 'year', 'is_weekend', 'hour']


###############################################################################
# Compare train and test data structures
# Cast categorical features to int64 in the NumPy arrays
X_train_categorical = X_train_categorical.astype('int64')

print("Comparing train and test data structures for discrepancies:")
train_structures = [(X_train_continuous.shape, X_train_continuous.dtype),
                    (X_train_categorical.shape, X_train_categorical.dtype)]
test_structures = [(X_test_continuous.shape, X_test_continuous.dtype),
                (X_test_categorical.shape, X_test_categorical.dtype)]

for i, (train_struct, test_struct) in enumerate(zip(train_structures, test_structures)):
    if train_struct != test_struct:
        print(f"Discrepancy in feature set {i}:")
        print(f"Training set structure: {train_struct}")
        print(f"Test set structure: {test_struct}")

# Print columns in test data continuous and categorical
print("Test data continuous features:")
for i, feature in enumerate(continuous_features):
    print(f"Column {i} - {feature}:")
    print(f"Data Type: {X_test_continuous[:, i].dtype}, First row value: {X_test_continuous[0, i]}")

print("Test data categorical features:")
for i, feature in enumerate(categorical_features):
    print(f"Column {i} - {feature}:")
    print(f"Data Type: {X_test_categorical[:, i].dtype}, First row value: {X_test_categorical[0, i]}")

# Print columns in train data continuous and categorical
print("Train data continuous features:")
for i, feature in enumerate(continuous_features):
    print(f"Column {i} - {feature}:")
    print(f"Data Type: {X_train_continuous[:, i].dtype}, First row value: {X_train_continuous[0, i]}")

print("Train data categorical features:")
for i, feature in enumerate(categorical_features):
    print(f"Column {i} - {feature}:")
    print(f"Data Type: {X_train_categorical[:, i].dtype}, First row value: {X_train_categorical[0, i]}")

# Verification step to ensure test data is in the correct format
test_embedding_inputs = []
for i, feature_name in enumerate(categorical_features):
    test_feature = X_test_categorical[:, i].reshape(-1, 1)
    test_embedding_inputs.append(test_feature)

    # Check if test data indices are within the training range
    training_max_index = np.max(X_train_categorical[:, i])  # Max index from training
    test_max_index = np.max(X_test_categorical[:, i])       # Max index from test
    if test_max_index > training_max_index:
        print(f"Test data for feature '{feature_name}' (Column {i}) contains new categories not seen in training.")
    else:
        print(f"Test feature '{feature_name}' (Column {i}) is within the trained range.")
###############################################################################

Comparing train and test data structures for discrepancies:
Discrepancy in feature set 0:
Training set structure: ((11956383, 15), dtype('float64'))
Test set structure: ((9914, 15), dtype('float64'))
Discrepancy in feature set 1:
Training set structure: ((11956383, 9), dtype('int64'))
Test set structure: ((9914, 9), dtype('int64'))
Test data continuous features:
Column 0 - pickup_longitude:
Data Type: float64, First row value: 0.03278367469600283
Column 1 - pickup_latitude:
Data Type: float64, First row value: 0.38058296221652993
Column 2 - dropoff_longitude:
Data Type: float64, First row value: -0.198965435598109
Column 3 - dropoff_latitude:
Data Type: float64, First row value: -0.22316024981144808
Column 4 - distance_km:
Data Type: float64, First row value: -0.2799753841281503
Column 5 - pickup_distance_to_JFK_Airport:
Data Type: float64, First row value: 0.12650626204728618
Column 6 - pickup_distance_to_LaGuardia_Airport:
Data Type: float64, First row value: -0.3608875347432132
Colu

In [5]:
import os
import pandas as pd
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.utils import custom_object_scope

# RMSE as the custom loss function
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# List all model files in the model directory saved as TensorFlow SavedModel format
model_files = [file for file in os.listdir(".") if os.path.isdir(file) and file.startswith('best_model')]

# Define your test data file path
test_data_file_path = 'test_fe.csv'

# Iterate over the model files and perform inference
for model_dir in model_files:
    with custom_object_scope({'root_mean_squared_error': root_mean_squared_error}):
        model = load_model(model_dir)
        print(model.summary())

    # Process the test data
    # Assuming process_data is already defined and available in your environment
    scaler = StandardScaler()
    original_data, X_test_continuous, X_test_categorical = process_data(
        test_data_file_path, scaler=scaler, fit_scaler=True, drop_columns=['pickup_datetime']
    )   
    
    # Printing input tensors
    print("Inspecting Input Tensors:")
    for i in range(X_test_categorical.shape[1]):
        print(f"Categorical Input {i} Shape: {X_test_categorical[:, i].shape}")
        print(f"Sample Data: {X_test_categorical[:5, i]}")  # Prints first 5 samples

    print(f"Continuous Input Shape: {X_test_continuous.shape}")
    print(f"Sample Data: {X_test_continuous[:5, :]}")  # Prints first 5 samples of continuous data
    
    # Convert each categorical input to float32 and reshape
    categorical_inputs = [X_test_categorical[:, i].astype('float32').reshape(-1, 1) for i in range(X_test_categorical.shape[1])]

    # Print the number of categorical inputs
    print(f"Number of categorical inputs: {len(categorical_inputs)}")

    # Make predictions
    test_predictions = model.predict(categorical_inputs + [X_test_continuous])

    # Create a DataFrame for the submission
    submission = pd.DataFrame({
        'key': original_data['key'],
        'fare_amount': test_predictions.flatten()
    })

    # Define the output file name without the '.h5' extension
    output_file_name = f'submissions_{model_dir}.csv'

    # Save the submission file
    submission.to_csv(output_file_name, index=False)
    print(f"Submission saved to {output_file_name}.")


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                              