In [1]:
import numpy as np
import pandas as pd

In [2]:
# To run the experiments, download the following dataset and put it in the Datasets/ folder:
#     - NY_train.csv -
#       https://www.kaggle.com/c/nyc-taxi-trip-duration/data?select=train.zip
#       (extract the .csv file and rename it to NY_train.csv)

filename = "Datasets/NY_train.csv"

# Select the columns by name rather than by position, so that a CSV with a
# different column order cannot silently load the wrong fields.
columns = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
           'dropoff_longitude', 'dropoff_latitude', 'trip_duration']
df = pd.read_csv(filename, header=0, usecols=columns)
df = df.dropna()

print(list(df.columns.values))

# Turn the pickup timestamp into "minutes since midnight". Unparseable
# timestamps become NaT (errors='coerce'), which yields NaN minutes; those rows
# are removed by the final dropna().
pickup_time = pd.to_datetime(df['pickup_datetime'], errors='coerce')
df['pickup_mins_of_the_day'] = pickup_time.dt.hour*60 + pickup_time.dt.minute
df = df.drop(['pickup_datetime'], axis=1)
df = df.dropna()

print(df.shape)
print(df.dtypes)

['pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'trip_duration']
(1458644, 6)
pickup_longitude          float64
pickup_latitude           float64
dropoff_longitude         float64
dropoff_latitude          float64
trip_duration               int64
pickup_mins_of_the_day      int64
dtype: object


In [3]:
# Split the frame into the regressors (x) and the target (y) as NumPy arrays.
feature_columns = [
    'pickup_mins_of_the_day',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
x = df[feature_columns].to_numpy()
y = df['trip_duration'].to_numpy()

# Displayed as the cell output: (number of rows, number of features).
x.shape

(1458644, 5)

In [4]:
import CaGD_ls

# Make the feature matrix C-contiguous before handing it to CaGD_ls.
x = np.ascontiguousarray(x)
print(np.shape(x))

# Feature expansion with power 3. The recorded output goes from 5 to 155
# columns (= 5 + 5**2 + 5**3), which looks like the stacked tensor powers up to
# order 3 -- TODO confirm against CaGD_ls.tens_pow.
# NOTE: this cell overwrites `x`, so it is meant to be run once after the
# previous cell.
x = CaGD_ls.tens_pow(x, 3)
print(np.shape(x))

(1458644, 5)
(1458644, 155)


In [5]:
from sklearn.preprocessing import StandardScaler

# Standardise every expanded feature; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [6]:
from sklearn.decomposition import PCA

# Keep as many principal components as needed to explain 99.9% of the variance
# of the standardised features (full SVD solver).
pca = PCA(n_components=0.999, svd_solver='full').fit(x_scaled)
x_scaled_pca = pca.transform(x_scaled)
print(x_scaled_pca.shape)

# Cell output: total fraction of variance explained by the retained components.
pca.explained_variance_ratio_.sum()

(1458644, 8)


1.0

In [None]:
# Let CaGD_ls augment the design matrix with a bias term (presumably one extra
# constant column -- confirm in CaGD_ls.add_bias).
# NOTE: `x_scaled_pca` is overwritten, so run this cell only once per PCA fit.
x_scaled_pca = CaGD_ls.add_bias(x_scaled_pca)
print(x_scaled_pca.shape)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_duration_distribution(durations, kde_linewidth):
    """Show a density-normalised histogram of `durations` with a KDE overlay.

    `sns.distplot` is deprecated, so the figure is drawn with `sns.histplot`.
    `stat='density'`, `alpha=0.4` and `kde_kws={'cut': 3}` are set to mimic the
    look of the old `distplot` call.

    Parameters
    ----------
    durations : array-like
        Values whose distribution is plotted.
    kde_linewidth : float
        Line width of the KDE curve.
    """
    sns.histplot(durations, bins=int(180/5), stat='density', kde=True,
                 color='darkblue', edgecolor='black', alpha=0.4,
                 kde_kws={'cut': 3}, line_kws={'linewidth': kde_linewidth})
    plt.show()


# Distribution of the raw target.
print(y.std())
print(len(y))
plot_duration_distribution(y, kde_linewidth=4)

# Boolean mask of the samples whose target is at most 10000; it is reused by
# the next cell to drop the long-duration outliers.
idx = y<=10000
plot_duration_distribution(y[idx], kde_linewidth=2)
print(y[idx].std())
print(len(y[idx]))

In [None]:
# Keep only the samples selected by `idx` in both the design matrix and the
# target. `idx` was computed from the unfiltered `y`, so this cell is meant to
# be run once, right after the cell that defines `idx`.
x_scaled_pca, y = x_scaled_pca[idx], y[idx]

In [None]:
# Hyper-parameters shared by the optimizer runs below.
lr = 1e-3
block_dim = 2
loss_accepted = 1e-5
max_iter = 5e1
lambda_LASSO = 1e-2
batch_size = 256
seed = 0  # fixes the initial point so that a re-run reproduces the same start

n = np.shape(x_scaled_pca)[1]
print("number of parameters ", n)
print("number of points ", x_scaled_pca.shape[0])

# Common starting point for every optimizer, drawn uniformly in
# [-1/sqrt(n), 1/sqrt(n)] from a seeded generator (the global NumPy random
# state is left untouched).
rng = np.random.default_rng(seed)
theta_0 = rng.uniform(-1/n**0.5, 1/n**0.5, n)

In [None]:
# Run CaGD_ls.mom_CA_BCD_GS_ls from the shared starting point. The trailing
# positional arguments (2, 0.75) are defined by CaGD_ls -- the 2 presumably
# matches `block_dim`; confirm against the library signature.
momCA_GS_run = CaGD_ls.mom_CA_BCD_GS_ls(
    x_scaled_pca, y, theta_0, lambda_LASSO, lr, loss_accepted, max_iter, 2, 0.75)
loss_momCA_GS, iteration_momCA_GS, theta_momCA_GS, t_momCA_GS = momCA_GS_run

In [None]:
# Run CaGD_ls.mom_CA_BCD_random_ls from the shared starting point. The trailing
# positional arguments (2, 0.5) are defined by CaGD_ls -- the 2 presumably
# matches `block_dim`; confirm against the library signature.
momCA_random_run = CaGD_ls.mom_CA_BCD_random_ls(
    x_scaled_pca, y, theta_0, lambda_LASSO, lr, loss_accepted, max_iter, 2, 0.5)
loss_momCA_random, iteration_momCA_random, theta_momCA_random, t_momCA_random = momCA_random_run

In [None]:
# Run CaGD_ls.CA_BCD_GS_ls from the shared starting point. The trailing
# positional arguments (2, 0.75) are defined by CaGD_ls -- the 2 presumably
# matches `block_dim`; confirm against the library signature.
CABCD_stand_GS_run = CaGD_ls.CA_BCD_GS_ls(
    x_scaled_pca, y, theta_0, lambda_LASSO, lr, loss_accepted, max_iter, 2, 0.75)
loss_CABCD_stand_GS, iteration_CABCD_stand_GS, theta_CABCD_stand_GS, t_CABCD_stand_GS = CABCD_stand_GS_run

In [None]:
# Baseline: CaGD_ls.ADAM_ls with the shared hyper-parameters and mini-batches of
# `batch_size` samples.
ADAM_run = CaGD_ls.ADAM_ls(
    x_scaled_pca, y, theta_0, lambda_LASSO, batch_size, lr, loss_accepted, max_iter)
loss_ADAM, iteration_ADAM, theta_ADAM, t_ADAM = ADAM_run

In [None]:
# Baseline: CaGD_ls.SAG_ls. SAG is run with its own, smaller step size; it is
# stored under a dedicated name so that the shared `lr` used by the other
# optimizer cells is not overwritten when those cells are re-run afterwards.
lr_SAG = 1e-6
loss_SAG,iteration_SAG,theta_SAG,t_SAG = CaGD_ls.SAG_ls(
    x_scaled_pca,y,theta_0,lambda_LASSO,batch_size,lr_SAG,loss_accepted,max_iter)

In [None]:
# Report the smallest loss value recorded by each optimizer.
min_loss_report = (
    ("loss min mom GS CA = ", loss_momCA_GS),
    ("loss min mom random CA = ", loss_momCA_random),
    ("loss min standard GS CA = ", loss_CABCD_stand_GS),
    ("loss min ADAM = ", loss_ADAM),
    ("loss min SAG = ", loss_SAG),
)
for caption, losses in min_loss_report:
    print(caption, min(losses))

In [None]:
from matplotlib import pyplot as plt

# One entry per optimizer: (legend label, time axis, iteration axis, losses).
curves = [
    ("CaBCD mom GS", t_momCA_GS, iteration_momCA_GS, loss_momCA_GS),
    ("CaBCD mom random", t_momCA_random, iteration_momCA_random, loss_momCA_random),
    ("CaBCD GS", t_CABCD_stand_GS, iteration_CABCD_stand_GS, loss_CABCD_stand_GS),
    ("ADAM", t_ADAM, iteration_ADAM, loss_ADAM),
    ("SAG", t_SAG, iteration_SAG, loss_SAG),
]

# Both figures share a y-range anchored on the ADAM losses.
loss_window = [min(loss_ADAM)*0.5, max(loss_ADAM)]

# Figure 1: loss against time.
for label, elapsed, _, losses in curves:
    plt.plot(elapsed, losses, label=label)
plt.legend()
plt.title('Loss MSE vs time NY')
plt.xlabel('time')
plt.ylabel('loss')
plt.xlim([0,10])
plt.ylim(loss_window)
# plt.savefig('CaBCD_vs_all_time_NY.pdf', bbox_inches='tight')
plt.show()

# Figure 2: loss against iteration count.
for label, _, iterations, losses in curves:
    plt.plot(iterations, losses, label=label)
plt.legend()
plt.title('Loss MSE vs iteration NY')
plt.xlabel('iteration (over the entire dataset)')
plt.ylabel('loss')
plt.xlim([0,30])
plt.ylim(loss_window)
# plt.savefig('CaBCD_vs_all_iteration_NY.pdf', bbox_inches='tight')
plt.show()