# Importing packages 

In [None]:
# Data wrangling 
import pandas as pd 

# Deep learning 
import tensorflow as tf
import keras 

# Array math 
import numpy as np

# One hot encoding
from sklearn.preprocessing import OneHotEncoder

# Min-max scaler
from sklearn.preprocessing import MinMaxScaler

# Loading the memory profile extension
from memory_profiler import profile
import sys 

# Plotting
import matplotlib.pyplot as plt

# Iteration tracking
from tqdm import tqdm

# Metrics 
from sklearn.metrics import mean_squared_error

# Reading the data 

In [None]:
d = pd.read_csv('data/train.csv')

print(f"Shape of the data: {d.shape}")
print(d.describe().round(2))

In [None]:
d.dtypes

In [None]:
# Getting the size of the object in memory
print(f"The object takes: {sys.getsizeof(d) / 10**6} MB in memory")

The data in memory uses ~484 MB of RAM.

# Feature engineering 

## Date variables 

In [None]:
def create_date_vars(d):
    """
    Creates the datetime variables

    Parses ``pickup_datetime`` and derives the day of the week, the hour of
    the day and the day of the year from it. The hour and the day of the year
    are additionally encoded as sine/cosine pairs so that values at the two
    ends of a cycle (e.g. 23:00 and 00:00) end up next to each other.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    d : pd.DataFrame
        Frame with a ``pickup_datetime`` column that ``pd.to_datetime`` can
        parse.

    Returns
    -------
    pd.DataFrame
        The same frame with ``pickup_dayofweek``, ``pickup_hour``,
        ``pickup_dayofyear``, ``pickup_hour_sin``, ``pickup_hour_cos``,
        ``pickup_dayofyear_sin`` and ``pickup_dayofyear_cos`` added.
    """
    # Parsing the timestamp once and reusing the datetime accessor
    d['pickup_datetime'] = pd.to_datetime(d['pickup_datetime'])
    pickup = d['pickup_datetime'].dt

    # Inferring the day of the week, hour of the day and day of the year
    d['pickup_dayofweek'] = pickup.dayofweek
    d['pickup_hour'] = pickup.hour
    d['pickup_dayofyear'] = pickup.dayofyear

    # Cyclical encoding of the hour. Hours take the 24 values 0..23, so the
    # period is 24; dividing by 23 would give 00:00 and 23:00 the exact same
    # encoding.
    d['pickup_hour_sin'] = np.sin(2 * np.pi * d['pickup_hour']/24.0)
    d['pickup_hour_cos'] = np.cos(2 * np.pi * d['pickup_hour']/24.0)

    # Cyclical encoding of the day of the year (day 366 of a leap year lands
    # just past one full turn)
    d['pickup_dayofyear_sin'] = np.sin(2 * np.pi * d['pickup_dayofyear']/365.0)
    d['pickup_dayofyear_cos'] = np.cos(2 * np.pi * d['pickup_dayofyear']/365.0)

    return d

## Dummy variables

The features that will be one-hot encoded: 

* store_and_fwd_flag
* vendor_id 
* pickup_dayofweek

In [None]:
# Columns that are replaced by dummy (indicator) columns
dummy_features = ['vendor_id', 'store_and_fwd_flag', 'pickup_dayofweek']


# Dummy creation for a list of columns
def create_dummy(df, dummy_var_list):
    """
    Replaces each listed column with its dummy (indicator) columns

    For every column in ``dummy_var_list`` the dummies are built with
    ``pd.get_dummies`` (first level dropped, column name used as prefix),
    appended to the right of the frame, and the source column is removed.

    Returns the resulting dataframe and the list of added column names,
    in the order they were created.
    """
    added_features = []

    for var in dummy_var_list:
        encoded = pd.get_dummies(df[var], prefix=var, drop_first=True)
        added_features += encoded.columns.tolist()

        # concat builds a new frame, so the frame passed in by the caller
        # keeps its original columns
        df = pd.concat([df, encoded], axis=1).drop(columns=var)

    return df, added_features

## Distance of travel 

In [None]:
# Haversine distance between the pickup and the dropoff point
def distance_calculation(df):
    """
    Adds the pickup-to-dropoff distance to the dataframe.

    The coordinates in ``pickup_latitude``, ``pickup_longitude``,
    ``dropoff_latitude`` and ``dropoff_longitude`` are converted to radians
    and plugged into the haversine formula with an earth radius of 6373 km.

    The result is written to the ``distance`` column in meters. The frame is
    modified in place and returned.
    """
    earth_radius_km = 6373.0

    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = (
        np.radians(df[column])
        for column in (
            'pickup_latitude',
            'pickup_longitude',
            'dropoff_latitude',
            'dropoff_longitude',
        )
    )

    delta_lat = dropoff_lat - pickup_lat
    delta_lon = dropoff_lon - pickup_lon

    # Haversine term and the central angle it corresponds to
    haversine = (
        np.sin(delta_lat / 2)**2
        + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(delta_lon / 2)**2
    )
    central_angle = 2 * np.arctan2(np.sqrt(haversine), np.sqrt(1 - haversine))

    # Kilometers -> meters
    df['distance'] = earth_radius_km * central_angle * 1000
    return df

## Final feature list and the feature engineering pipeline


In [None]:
# Defining the final feature list 
numeric_features = [
    'distance',
    'passenger_count', 
    'pickup_hour_sin',
    'pickup_hour_cos',
    'pickup_dayofyear_sin',
    'pickup_dayofyear_cos',
]

# Defining the target variable
target = 'trip_duration'

# Defining the feature engineering pipeline 
def ft_engineering_pipeline(
    df, 
    numeric_features, 
    dummy_features,
    target):
    """
    Applies the feature engineering pipeline to the data

    Steps, in order: date variables, dummy variables, distance. The date
    variables are added to the frame passed in by the caller (in place); the
    dummy step continues on a new frame.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data holding the pickup datetime, the coordinates, the columns
        in ``dummy_features`` and the ``target`` column.
    numeric_features : list of str
        Columns used as they are (they may be created by the steps above).
    dummy_features : list of str
        Columns that are replaced by dummy variables.
    target : str
        Name of the target column.

    Returns
    -------
    x : np.ndarray
        float64 matrix with one column per entry of ``final_features``.
    y : np.ndarray
        Target as a column vector of shape (n_rows, 1), min-max scaled.
    final_features : list of str
        ``numeric_features`` followed by the created dummy column names.
    """
    # Creating the date variables
    df = create_date_vars(df)

    # Creating the dummy variables
    df, new_features = create_dummy(df, dummy_features)

    # Appending the distance
    df = distance_calculation(df) 

    # Appending the new features to the numeric features
    final_features = numeric_features + new_features

    # Creating the x matrix. The dtype is forced because the dummy columns
    # can be boolean (the pandas >= 2.0 default), and mixing them with the
    # numeric columns would otherwise give an object array that the model
    # cannot consume.
    x = df[final_features].to_numpy(dtype=np.float64)

    # Creating the y vector
    y = df[target].values

    # Min max scaling the y vector (the scaler wants a 2-D input).
    # NOTE: the fitted scaler is local and not returned, so predictions
    # cannot be mapped back to the original scale through this function.
    y = y.reshape(-1, 1)
    scaler = MinMaxScaler()
    y = scaler.fit_transform(y)

    # Returning the x and y matrices
    return x, y, final_features

# Creating the input for model 

In [None]:
x, y, features = ft_engineering_pipeline(d, numeric_features, dummy_features, target)

In [None]:
print(f"Shape of x: {x.shape} | Shape of y: {y.shape}")
print(features)

# Deep learning 

In [None]:
# Defining the batch size and number of epochs 
batch_size = 512
epochs = 10

In [None]:
# Builds and fits the network on data that is fully loaded in memory
def train(x, y, epochs: int = 10, batch_size: int = 128): 
    """
    Fits a small feed forward network on the x matrix and the y vector.

    The network has two hidden dense layers with 128 relu units each and a
    single linear output unit. It is compiled with the adam optimizer and
    the mean squared error as both loss and metric.

    Returns the fitted model and the history object produced by ``fit``.
    """
    n_inputs = x.shape[1]

    # Layer stack of the feed forward network
    layers = [
        keras.layers.Dense(128, activation=tf.nn.relu, input_shape=(n_inputs,)),
        keras.layers.Dense(128, activation=tf.nn.relu),
        keras.layers.Dense(1),
    ]
    model = keras.Sequential(layers)

    model.compile(
        optimizer='adam',
        loss='mean_squared_error',
        metrics=['mean_squared_error'],
    )

    history = model.fit(x, y, epochs=epochs, batch_size=batch_size)

    return model, history

In [None]:
# Training the model with RAM usage 
model, history = train(x, y, epochs=epochs, batch_size=batch_size)

In [None]:
plt.plot(history.history['mean_squared_error'])

# Using an iterator to load the data

In [None]:
# One pass over the csv in chunks to collect everything the batch generator
# needs up front: the number of batches, the min/max of the target and the
# unique values of every categorical feature.

# Iterating over the chunks to get the final number of batches 
n_batches = 0

# Creating the min-max constants for y.
# NOTE: these names shadow the builtins min() and max() for the rest of the
# session. They are kept as they are because DataGenerator.__getitem__ reads
# them as globals; this cell therefore avoids calling the builtins.
min = np.inf
max = -np.inf

# Unique values seen so far per categorical feature
seen_values = {}

# Creating an iterator over the csv; the context manager closes the file
# even when the loop below fails part-way
with pd.read_csv('data/train.csv', chunksize=batch_size, iterator=True) as d:
    # Getting the size of the object in memory
    print(f"The object takes: {sys.getsizeof(d) / 10**6} MB in memory")

    for chunk in tqdm(d):
        # Searching for the min and max values of y
        chunk_min = chunk['trip_duration'].min()
        chunk_max = chunk['trip_duration'].max()
        if chunk_min < min:
            min = chunk_min
        if chunk_max > max:
            max = chunk_max

        # Creating the date variables (pickup_dayofweek is one of the
        # categorical features and only exists after this step)
        chunk = create_date_vars(chunk)

        # Iterating over the cat features and merging in the unique values
        for cat in dummy_features:
            seen_values.setdefault(cat, set()).update(chunk[cat].unique())

        n_batches += 1

# Creating a dictionary for the categorical features that stores the unique
# values as lists (the encoder fitting turns them into numpy arrays)
cat_dict = {cat: list(values) for cat, values in seen_values.items()}

print(f"The number of batches is: {n_batches}")

In [None]:
# One fitted one hot encoder per categorical feature, keyed by feature name
cat_encoders = {}
for cat_feature, categories in cat_dict.items():
    # The encoder is fitted on a single column, hence the (n_values, 1) shape
    column = np.array(categories).reshape(-1, 1)

    # fit() returns the encoder itself, so it can be stored directly
    cat_encoders[cat_feature] = OneHotEncoder(categories='auto').fit(column)

In [None]:
# Defining the final feature list 
final_features = [
    'distance',
    'passenger_count', 
    'pickup_hour_sin',
    'pickup_hour_cos',
    'pickup_dayofyear_sin',
    'pickup_dayofyear_cos',
]

# Adding the final features from the one hot encoders.
# The naming rule (feature name + the part of the encoder's output name after
# the last underscore) has to stay identical to the one in custom_transform:
# DataGenerator selects columns by these names, and a name it cannot find is
# silently filled with zeros.
for cat_feature, encoder in cat_encoders.items():
    # Getting the output names of the encoder
    out_values = encoder.get_feature_names_out().tolist()

    # Adding the name of the feature as a prefix
    final_features.extend(
        f"{cat_feature}_{value.split('_')[-1]}" for value in out_values
    )

In [None]:
# One hot encoding of a single column with an already fitted encoder
def custom_transform(enc, x, prefix):
    """
    One hot encodes the values in x and returns them as a dataframe

    ``enc`` is a fitted encoder, ``x`` a 1-D array of raw values and
    ``prefix`` the feature name. Each output column is named
    ``{prefix}_{suffix}``, where the suffix is the part of the encoder's
    output feature name after the last underscore. The values are stored
    as uint8.
    """
    # The encoder works on a single column, so x becomes (n_rows, 1)
    encoded = enc.transform(x.reshape(-1, 1))

    # Swapping the encoder's own prefix for the feature name
    column_names = [
        f"{prefix}_{name.split('_')[-1]}"
        for name in enc.get_feature_names_out().tolist()
    ]

    # Dense dataframe with the smallest integer type
    return pd.DataFrame(encoded.toarray(), columns=column_names).astype('uint8')

# Defining a list of dummy features 
dummy_features = ['vendor_id', 'store_and_fwd_flag', 'pickup_dayofweek']

In [None]:
# Defining the class for the batches creation 
class DataGenerator(keras.utils.Sequence):
    """
    Keras Sequence that turns consecutive chunks of a csv reader into batches

    Every batch is built from the next chunk of ``csv_generator``: date
    variables, distance and one hot encoded categorical features are added
    and the target is min-max scaled.

    The class relies on names defined at notebook level: ``create_date_vars``,
    ``distance_calculation``, ``custom_transform``, ``cat_encoders``,
    ``dummy_features``, ``final_features``, ``target`` and the target
    bounds ``min`` / ``max``.

    The reader can only move forward, so batches always come in file order
    and ``idx`` cannot be used to jump to an arbitrary position. The one
    case that is honoured is a repeated request for the index that was just
    served, which is answered from memory.

    Parameters
    ----------
    csv_generator
        Chunked csv reader exposing ``get_chunk()`` (e.g. the object returned
        by ``pd.read_csv(..., chunksize=..., iterator=True)``).
    n_batches : int
        Number of batches reported by ``len()``.
    """
    def __init__(
        self, 
        csv_generator,
        n_batches
        ):
        # Letting the keras base class set up its own state
        super().__init__()

        self.csv_generator = csv_generator
        self.n_batches = n_batches

        # Most recently served batch and its index (see __getitem__)
        self._last_idx = None
        self._last_batch = None

    def __len__(self):
        """
        The total length of the iterator
        """
        return self.n_batches

    def __getitem__(self, idx):
        """
        The batch generator 

        Returns the tuple (x, y) built from the next chunk of the reader.
        Raises IndexError when the reader has no chunks left.
        """
        # Keras can ask for the same index twice in a row (presumably when it
        # inspects the first batch before training - this depends on the
        # keras version). Reading a fresh chunk for the repeated request
        # would skip data and exhaust the reader one batch early.
        if self._last_batch is not None and idx == self._last_idx:
            return self._last_batch

        # Getting the batch
        try:
            chunk = self.csv_generator.get_chunk()
        except StopIteration:
            # IndexError is the conventional "out of range" signal for
            # __getitem__
            raise IndexError(
                f"No data left in the csv reader for batch {idx}"
            ) from None

        # Resetting the index so the encoded frames (which start at 0) line
        # up row by row in the concat below
        chunk = chunk.reset_index(drop=True)

        # Creating the date variables
        chunk = create_date_vars(chunk)

        # Creating the distance variable
        chunk = distance_calculation(chunk) 

        # Creating the dummy variables and appending them in a single concat
        encoded = [
            custom_transform(
                cat_encoders[cat_feature],
                chunk[cat_feature].values,
                cat_feature
            )
            for cat_feature in dummy_features
        ]
        chunk = pd.concat([chunk, *encoded], axis=1)
        del encoded

        # Getting the target var 
        y = chunk[target].values

        # Min max transforming the y with the bounds found in the first pass
        # over the file
        y = (y - min) / (max - min)

        # If any of the final features are missing we fill them with 0
        missing_cols = set(final_features) - set(chunk.columns)
        for c in missing_cols:
            chunk[c] = 0

        # Extracting the final features
        x = chunk[final_features].values

        # Remembering the batch for a repeated request of the same index
        self._last_idx = idx
        self._last_batch = (x, y)

        return x, y

In [None]:
def train_generator(
    path_to_csv,
    n_batches,
    final_features,
    epochs: int = 10,
    batch_size: int = 128
    ): 
    """
    Fits the feed forward network on batches streamed from a csv file

    The file is re-read from the start for every epoch and fed to the model
    through a DataGenerator, so only one chunk has to be parsed at a time.

    Parameters
    ----------
    path_to_csv : str
        Path to the training csv.
    n_batches : int
        Number of batches per epoch. This has to be the number of chunks of
        ``batch_size`` rows in the file; a count obtained with a different
        chunk size makes an epoch cover only part of the file or run past
        its end.
    final_features : list of str
        Names of the model inputs; only the length is used here, to size the
        input layer.
    epochs : int
        Number of passes over the file.
    batch_size : int
        Number of csv rows per batch.

    Returns
    -------
    The fitted keras model.
    """
    # Defining a simple feed forward network 
    model = keras.Sequential([
        keras.layers.Dense(128, activation=tf.nn.relu, input_shape=(len(final_features),)),
        keras.layers.Dense(128, activation=tf.nn.relu),
        keras.layers.Dense(1)
    ])

    # Compiling the model
    model.compile(
        optimizer='adam',
        loss='mean_squared_error',
        metrics=['mean_squared_error']
    )

    for epoch in range(epochs):
        # Creating a fresh reader per epoch; the context manager closes the
        # file once the epoch is done, also when fit raises
        with pd.read_csv(path_to_csv, chunksize=batch_size, iterator=True) as d:
            generator = DataGenerator(d, n_batches)

            # Fitting the model for one pass over the file.
            # batch_size is not passed to fit: the generator already yields
            # complete batches. Shuffling is switched off because the
            # batches come from a forward-only reader.
            model.fit(generator, epochs=1, verbose=1, shuffle=False)

    # Returning the model
    return model

In [None]:
# Training the model.
# n_batches was counted with chunks of `batch_size` rows, so the same chunk
# size has to be used here; with the function default (128) an epoch would
# stop after only a fraction of the file. `epochs` is passed as well so this
# run uses the same settings as the in-memory training.
model = train_generator(
    path_to_csv='data/train.csv',
    n_batches=n_batches,
    final_features=final_features,
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
model.weights