In [1]:
import pandas as pd
import numpy as np
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler,PolynomialFeatures
from keras.preprocessing.sequence import TimeseriesGenerator
import tensorflow as tf
from sklearn.linear_model import LinearRegression
import xgboost as xg
from sklearn.model_selection import GridSearchCV
import time
from sklearn.svm import SVR
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt




In [2]:
region = 478
train_df = pd.read_csv(f'myfiles/csv/{region}.csv', index_col=0)
test_df = pd.read_csv(f'myfiles/csv2/{region}.csv', index_col=0)

results = {}

In [3]:
def rmse(y_true,y_pred):
    return np.sqrt(np.mean(np.square(y_pred-y_true)))

def mape(y_true, y_pred):
    mask = y_true != 0  # Create a mask for non-zero true values
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(numerator, denominator))

def getTT(region):
    return round((((region-100)//50) * -5.2222 ) + 147.11),round((((region%50)-2) * 5.2083)+20.458 )

def mean_absolute_percentage_error_revise(y_true, y_pred):
    # RMSE
    rmse = K.sqrt(K.mean(K.square(y_pred - y_true)))

    # SMAPE
    smape_numerator = K.abs(y_pred - y_true)
    smape_denominator = (K.abs(y_pred) + K.abs(y_true)) / 2
    smape = K.mean(smape_numerator / smape_denominator)

    # Combine RMSE and SMAPE with a weight factor (you can adjust this factor)
    alpha = 0.5
    combined_loss = alpha * rmse + (1 - alpha) * smape

    return combined_loss
def custom_metric(y_true, y_pred):
    return mean_absolute_percentage_error_revise(y_true, y_pred)

In [4]:
train = train_df.drop(['freetaxi'], axis=1)
test = test_df.drop(['freetaxi'], axis=1)

In [5]:
# polynomial + scale CSV

# create polynomial features
poly = PolynomialFeatures(2,include_bias=False)
train_poly = pd.DataFrame(poly.fit_transform(train),columns=poly.get_feature_names_out(train.columns))
test_poly = pd.DataFrame(poly.fit_transform(test),columns=poly.get_feature_names_out(test.columns))

# initialise scalers
minmaxscaler = MinMaxScaler(feature_range=(0.1,1))
minmaxscaler.fit(train_poly)
minmaxscalerY = MinMaxScaler(feature_range=(0.1,1))
minmaxscalerY.fit((train[[str(region)]]))

# scale train and test using scalers
scaled_train = pd.DataFrame(minmaxscaler.transform(train_poly),columns=train_poly.columns)
scaled_test = pd.DataFrame(minmaxscaler.transform(test_poly),columns=test_poly.columns)

In [6]:
# Create time series for CSV
## For testing
data = scaled_test.values
scaled_data = data

X2, y2 = [], []
sequence_length = 16  # X = 16 previous time steps

for i in range(len(scaled_data) - sequence_length):
    X2.append(scaled_data[i:i+sequence_length, :])  # Use all columns
    y2.append(scaled_data[i+sequence_length, 0])  # Assuming prediction is based on the first column

X2, y2 = np.array(X2), np.array(y2)

## For training
data = scaled_train.values
scaled_data = data

X, y = [], []
sequence_length = 16  # X = 16 previous time steps

for i in range(len(scaled_data) - sequence_length):
    X.append(scaled_data[i:i+sequence_length, :])  # Use all columns
    y.append(scaled_data[i+sequence_length, 0])  # Assuming prediction is based on the first column

X, y = np.array(X), np.array(y)

In [7]:
train_size = int(len(X) * 0.90)  # 90% training, 10% val
X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y[:train_size], y[train_size:]

In [8]:
X_train.shape

(2664, 16, 324)

### MLP

In [9]:
X = X_train
Y = y_train
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(16, 324), name='input_layer'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid', name='output_layer')
])
model.compile(optimizer='adam', loss=mean_absolute_percentage_error_revise, metrics=[custom_metric])

history = model.fit(X, Y, epochs=5, verbose=1)



Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
predictions = model.predict(X2)
print(rmse(y2,predictions))
print(mape(y2,predictions))
print(smape(y2,predictions))

0.07636758848280956
0.2238681245992073
0.24321077042676556


### XG Boost

In [11]:
X_flattened = X_train.reshape((2664, -1))

In [12]:
xgb_r = xg.XGBRegressor(objective ='reg:logistic', 
                  n_estimators = 5, max_depth=3) 
xgb_r.fit(X_flattened, y_train) 

In [13]:
predictions = xgb_r.predict(X2.reshape((2960, -1)))
print(rmse(y2,predictions))
print(mape(y2,predictions))
print(smape(y2,predictions))

0.056896521087173774
0.2898819877940771
0.2620966814403083


### Support Vector Regression

In [14]:
X_flattened = X_train.reshape((2664, -1))

In [15]:
svr_model = SVR(kernel='poly', degree=3, gamma=0.1,C=10,epsilon=0.1) 
svr_model.fit(X_flattened, y_train)

In [16]:
predictions = svr_model.predict(X2.reshape((2960, -1)))
print(rmse(y2,predictions))
print(mape(y2,predictions))
print(smape(y2,predictions))

0.08320697084779863
0.4962923373069024
0.40376288450618497


### ARIMA

In [17]:
arima_train_df = train_df[str(region)]
arima_test_df = test_df[str(region)]
arima_train_df_reshaped = arima_train_df.values.reshape(-1, 1)
arima_test_df_reshaped = arima_test_df.values.reshape(-1, 1)

In [18]:
# initialise scalers
scaler = MinMaxScaler(feature_range=(0.1,1))
scaler.fit(arima_train_df_reshaped)

arima_scaled_train = scaler.transform(arima_train_df_reshaped)
arima_scaled_test = scaler.transform(arima_test_df_reshaped)

In [19]:
p = 1
d = 1
q = 1

model = ARIMA(arima_scaled_train, order=(p, d, q))
model_fit = model.fit()

In [20]:
predictions = model_fit.predict(start=len(arima_scaled_train), end=len(arima_scaled_train) + len(arima_scaled_test) - 1, typ='levels')
print(rmse(arima_scaled_test.flatten(),predictions))
print(mape(arima_scaled_test.flatten(),predictions))
print(smape(arima_scaled_test.flatten(),predictions))

0.06714223792109074
0.2608799294825222
0.26342639483154745


