# Importing packages 

In [None]:
# Data wrangling 
import pandas as pd 

# Deep learning 
import tensorflow as tf
import keras 

# Array math 
import numpy as np

# One hot encoding
from sklearn.preprocessing import OneHotEncoder

# Min-max scaler
from sklearn.preprocessing import MinMaxScaler

# Memory profiling decorator and object size inspection
from memory_profiler import profile
import sys 

# Plotting
import matplotlib.pyplot as plt

# Iteration tracking
from tqdm import tqdm

# Metrics 
from sklearn.metrics import mean_squared_error

# Importing the feature engineering functions 
from utils import distance_calculation, create_date_vars, create_dummy

# Training on CPU
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Reading the data 

In [None]:
# Defining the number of chunks to read (each chunk holds up to 512 rows)
n_chunks = 1000

# Collecting the chunks in a list and concatenating them once at the end:
# calling pd.concat inside the loop copies every previously read row again
# on each iteration, which makes the loading quadratic in the number of rows
chunks = []

# The chunked reader is used as a context manager so that the underlying
# file handle is closed as soon as the wanted chunks have been collected
with pd.read_csv('data/train.csv', chunksize=512) as d:
    for i, chunk in tqdm(enumerate(d)):
        if i == n_chunks:
            break
        chunks.append(chunk)

df = pd.concat(chunks, axis=0)

# Keeping only the rows with a strictly positive fare_amount
# (this drops the zero fares as well as the negative ones)
df = df[df['fare_amount'] > 0].copy()

# Dropping the rows with missing values
df.dropna(inplace=True)

# Resetting the index
df.reset_index(inplace=True, drop=True)

print(f"Shape of the data: {df.shape}")
print(df.describe().round(2))

In [None]:
# Previewing the first rows of the loaded data
df.head()

In [None]:
# Size of the dataframe object as reported by sys.getsizeof, converted from bytes to MB
size_in_mb = sys.getsizeof(df) / 10**6
print(f"The object takes: {size_in_mb} MB in memory")

# Feature engineering 

## Date variables 

In [None]:
# Running the date feature helper imported from utils
# NOTE(review): presumably this adds the pickup_* date columns used further below - confirm in utils
df = create_date_vars(df)

In [None]:
# Previewing the data after the date feature step
df.head()

## Dummy variables

The features that will be one-hot encoded: 

* pickup_dayofweek

In [None]:
# Columns that will be one-hot encoded
dummy_features = ['pickup_dayofweek']

# Creating the dummy variables; the helper returns the updated dataframe
# together with a second value that is kept as new_features
df, new_features = create_dummy(df, dummy_features)

In [None]:
# Previewing the data after the dummy variable step
df.head()

## Distance of travel 

In [None]:
# Running the distance helper imported from utils
# NOTE(review): presumably this adds the 'distance' column plotted in the EDA section - confirm in utils
df = distance_calculation(df)

# EDA

In [None]:
# Scatter plot of the fare amount (y axis) against the distance (x axis)
plt.scatter(x=df['distance'], y=df['fare_amount'])

In [None]:
# Mean fare amount for each passenger count, drawn as a bar chart
fare_by_passengers = df.groupby('passenger_count')['fare_amount'].mean()
fare_by_passengers.plot(kind='bar')

In [None]:
# Mean fare amount for each combination of the six day-of-week dummy columns
dayofweek_columns = [f'pickup_dayofweek_{day}' for day in range(1, 7)]
df.groupby(dayofweek_columns)['fare_amount'].mean().plot(kind='bar', figsize=(10, 5))

In [None]:
# Mean fare amount for each pickup hour, drawn as a bar chart
fare_by_hour = df.groupby('pickup_hour')['fare_amount'].mean()
fare_by_hour.plot(kind='bar', figsize=(10, 5))

## Final feature list and the feature engineering pipeline


In [None]:
# Numeric columns that are fed to the model as they are
numeric_features = [
    'distance', 'passenger_count',
    'pickup_hour_sin', 'pickup_hour_cos',
    'pickup_dayofyear_sin', 'pickup_dayofyear_cos',
]

# Column that the model is trained to predict
target = 'fare_amount'

# Defining the ft engineering pipeline 
def ft_engineering_pipeline(
    df,
    numeric_features,
    dummy_features,
    target):
    """
    Runs the feature engineering steps and builds the model inputs

    Arguments
    ---------
    df: dataframe passed through create_date_vars, create_dummy and
        distance_calculation, in that order
    numeric_features: list of numeric column names used as features
    dummy_features: list of column names handed to create_dummy
    target: name of the target column

    Returns
    -------
    x: feature matrix built from the final feature columns
    y: target reshaped to a single column and min-max scaled
    final_features: numeric_features followed by the features
        returned by create_dummy

    Note: the MinMaxScaler is fitted inside this function and is not
    returned to the caller.
    """
    # Date variables -> dummy variables -> distance
    frame = create_date_vars(df)
    frame, dummy_columns = create_dummy(frame, dummy_features)
    frame = distance_calculation(frame)

    # The numeric features come first, the dummy features are appended after them
    final_features = numeric_features + dummy_columns

    # Feature matrix
    x = frame[final_features].values

    # Target as a column vector, min-max scaled
    y = MinMaxScaler().fit_transform(frame[target].values.reshape(-1, 1))

    return x, y, final_features

# Creating the input for the model

In [None]:
# Building the feature matrix, the scaled target and the final feature name list
x, y, features = ft_engineering_pipeline(df, numeric_features, dummy_features, target)

In [None]:
# Printing the shapes of the model inputs and the names of the features used
print(f"Shape of x: {x.shape} | Shape of y: {y.shape}")
print(features)

# Deep learning 

In [None]:
# Defining the batch size and number of epochs 
# Batch size and number of epochs, set together
batch_size, epochs = 512, 10

In [None]:
# Defining the model function 
def train(x, y, epochs: int = 10, batch_size: int = 128):
    """
    Builds, compiles and fits a simple feed forward network

    Arguments
    ---------
    x: 2D feature array; x.shape[1] defines the width of the input layer
    y: target values passed to model.fit
    epochs: number of epochs passed to model.fit
    batch_size: batch size passed to model.fit

    Returns
    -------
    model: the fitted keras.Sequential model
    history: the object returned by model.fit
    """
    # Defining a simple feed forward network
    # The input shape is declared with an explicit Input layer; passing
    # input_shape to the first Dense layer is discouraged in current Keras
    model = keras.Sequential([
        keras.Input(shape=(x.shape[1],)),
        keras.layers.Dense(128, activation=tf.nn.relu),
        keras.layers.Dense(128, activation=tf.nn.relu),
        keras.layers.Dense(1)
    ])

    # Compiling the model
    model.compile(
        optimizer='adam',
        loss='mean_squared_error',
        metrics=['mean_squared_error']
    )

    # Fitting the model
    history = model.fit(x, y, epochs=epochs, batch_size=batch_size)

    # Returning the model and the training history
    return model, history

# Auxiliary plots

## RAM usage of the whole-dataset method

In [None]:
# Hard-coded RAM usage readings (MB) keyed by the number of training rows (thousands)
ram_by_nrows = {
    10: 490,
    100: 516,
    1000: 772,
    2000: 1117,
    5000: 1304,
    10000: 2554,
    20000: 5252,
    54000: 12000,
}

# Splitting the readings back into the two parallel lists
nrows = list(ram_by_nrows)
ram_usage = list(ram_by_nrows.values())

# Plotting the relationship
plt.figure(figsize=(13, 8))
plt.plot(nrows, ram_usage, '-o')
plt.title('RAM usage vs number of rows in the training dataset')
plt.xlabel('Number of rows in the training dataset (thousands)')
plt.ylabel('RAM usage (MB)')
plt.grid()
plt.show()