# Examining the data

In [34]:
import pandas as pd

# Check november and december
trips = pd.read_parquet('data/trips/green_tripdata_2019-01.parquet')
print(trips.head())

   VendorID lpep_pickup_datetime lpep_dropoff_datetime store_and_fwd_flag  \
0         2  2018-12-21 15:17:29   2018-12-21 15:18:57                  N   
1         2  2019-01-01 00:10:16   2019-01-01 00:16:32                  N   
2         2  2019-01-01 00:27:11   2019-01-01 00:31:38                  N   
3         2  2019-01-01 00:46:20   2019-01-01 01:04:54                  N   
4         2  2019-01-01 00:19:06   2019-01-01 00:39:43                  N   

   RatecodeID  PULocationID  DOLocationID  passenger_count  trip_distance  \
0         1.0           264           264              5.0           0.00   
1         1.0            97            49              2.0           0.86   
2         1.0            49           189              2.0           0.66   
3         1.0           189            17              2.0           2.68   
4         1.0            82           258              1.0           4.53   

   fare_amount  extra  mta_tax  tip_amount  tolls_amount  ehail_fee  \
0  

In [2]:
import pandas as pd

# Check november and december
trips = pd.read_parquet('data/trips/yellow_tripdata_2019-11.parquet')
print(trips)

         VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0               1  2019-11-01 00:30:41   2019-11-01 00:32:25              1.0   
1               1  2019-11-01 00:34:01   2019-11-01 00:34:09              1.0   
2               2  2019-11-01 00:41:59   2019-11-01 00:42:23              1.0   
3               2  2019-11-01 00:02:39   2019-11-01 00:02:51              1.0   
4               2  2019-11-01 00:18:30   2019-11-01 00:18:39              2.0   
...           ...                  ...                   ...              ...   
6878106         2  2019-11-30 23:30:00   2019-11-30 23:43:00              NaN   
6878107         2  2019-11-30 23:49:00   2019-12-01 00:06:00              NaN   
6878108         2  2019-11-30 23:33:59   2019-11-30 23:45:46              NaN   
6878109         2  2019-11-30 23:47:00   2019-11-30 23:51:00              NaN   
6878110         2  2019-11-30 23:05:00   2019-11-30 23:19:00              NaN   

         trip_distance  Rat

In [12]:
# read all zones
zones = pd.read_csv("data/zones/taxi_zone_lookup.csv")
# extract all zone codes
zone_codes = zones[["LocationID"]]
print(zone_codes)

     LocationID
0             1
1             2
2             3
3             4
4             5
..          ...
260         261
261         262
262         263
263         264
264         265

[265 rows x 1 columns]


## List some faults in data

In [114]:
bad_zone_count = 0
bad_trip_count = 0
long_trip_count = 0
bad_fare_amounts = 0
invalid_months = 0

directory = "data/trips"
for file in os.listdir(directory):
    yellow = file.startswith("yellow")
    pickupTime = "tpep_pickup_datetime" if yellow else "lpep_pickup_datetime"
    dropoffTime = "tpep_dropoff_datetime" if yellow else "lpep_dropoff_datetime"
    
    if file.endswith(".parquet") and not file.startswith("all_trips"):
        # print(f"Reading {file}")
        trips = pd.read_parquet(f"{directory}/{file}")

        # rename pickupTime and dropoffTime to "pickup_time" and "dropoff_time" respectively
        trips = trips.rename(columns={pickupTime: "pickup_time", dropoffTime: "dropoff_time"})

        # count how many rows that have PULocationID and DOLocationID as a zone code that is not in the zone_codes
        bad_zone_count += trips[~trips.PULocationID.isin(zone_codes.LocationID)].shape[0] + trips[~trips.DOLocationID.isin(zone_codes.LocationID)].shape[0]

        # count how many trips have distance of 0
        bad_trip_count += trips[trips.trip_distance <= 0].shape[0]

        # count how many trips have distance of > 100
        long_trip_count += trips[trips.trip_distance > 100].shape[0]

        # count how many trips have fare amount of less than 2.50
        bad_fare_amounts += trips[trips.fare_amount < 2.50].shape[0]

        # extract the month from the file name
        month = int(file[-10:-8])

        # count months that does not match with the files month
        invalid_months += trips[trips.pickup_time.dt.month != month].shape[0]
        invalid_months += trips[trips.dropoff_time.dt.month != month].shape[0]

print("Bad zone count:", bad_zone_count)
print("Bad trip count:", bad_trip_count)
print("Long trip count:", long_trip_count)
print("Bad fare amounts:", bad_fare_amounts)
print("Invalid months:", invalid_months)

Bad zone count: 0
Bad trip count: 910108
Large trip count: 344
Bad fare amounts: 253718
Invalid months: 40170


# Clean data

In [54]:
import pandas as pd

def readAndCleanData(filename):

    data = pd.read_parquet(filename)

    # type 1 if yellow, 2 if green
    type = 1 if filename.startswith("data/trips/yellow") else 2
    pickupColumn = "tpep_pickup_datetime" if type == 1 else "lpep_pickup_datetime"
    dropoffColumn = "tpep_dropoff_datetime" if type == 1 else "lpep_dropoff_datetime"

    data = data.rename(columns={pickupColumn: "pickup_time", dropoffColumn: "dropoff_time"})

    data = data[data.get("fare_amount") > 2.5]
    data = data[['pickup_time','dropoff_time','trip_distance','PULocationID','DOLocationID']]
    data['pickup_time'] = pd.to_datetime(data['pickup_time'])
    data['dropoff_time'] = pd.to_datetime(data['dropoff_time'])
    data = data.drop(data[data['pickup_time'] >= data['dropoff_time']].index)

    data['trip_time'] = (data['dropoff_time']-data['pickup_time']).dt.total_seconds()
    
    # data['pickup_time'] = (data['pickup_time']-data['pickup_time'].dt.normalize()).dt.total_seconds()
    # data['dropoff_time'] = (data['dropoff_time']-data['dropoff_time'].dt.normalize()).dt.total_seconds()

    # pickup_month
    data['pickup_month'] = data['pickup_time'].dt.month
    # pickup day
    data['pickup_day'] = data['pickup_time'].dt.day
    # pickup time of day numeric
    data['pickup_time'] = data['pickup_time'].dt.hour * 3600 + data['pickup_time'].dt.minute * 60 + data['pickup_time'].dt.second

    data = data[(data.PULocationID < 264) & (data.PULocationID > 0)]
    data = data[(data.DOLocationID < 264) & (data.DOLocationID > 0)]
    data = data[data.trip_time != 0]
    data = data[data.trip_time < 20000]
    data = data[data.trip_distance != 0.0]

    # remove dropoff_time column
    data = data.drop(columns=["dropoff_time"])

    return data

trips = readAndCleanData("data/trips/yellow_tripdata_2019-12.parquet")
print(f"Yellow trip data: \n{trips.dtypes}\n \n{trips}\n")

Yellow trip data: 
pickup_time        int64
trip_distance    float64
PULocationID       int64
DOLocationID       int64
trip_time        float64
pickup_month       int64
pickup_day         int64
dtype: object
 
         pickup_time  trip_distance  PULocationID  DOLocationID  trip_time  \
0               1618           4.20           142           116      887.0   
3                723           9.40           138            25     1276.0   
4                327           1.60           161           237      665.0   
5               3531           1.00           161           230      586.0   
6                859           1.70           164           163      767.0   
...              ...            ...           ...           ...        ...   
6896312        86189           2.82           143           141      888.0   
6896313        83513           3.75           148           246     1143.0   
6896314        86241           6.46           197           205     1573.0   
6896315   

In [55]:
import os

# read all parquet files into one pandas dataframe
trips = pd.DataFrame()
directory = "data/trips"
for file in os.listdir(directory):
    # skip if data is from november and december
    # if file.endswith("2019-11.parquet") or file.endswith("2019-12.parquet") or file.startswith("all_trips"):
    if file.startswith("all_trips"):
        continue

    if file.endswith(".parquet"):
        print(f"Reading {file}")
        trips = pd.concat([trips, readAndCleanData(f"{directory}/{file}")])

# Preview
print(trips)

# save all trips into one parquet
print(f"Saving all trips into one parquet")
trips.to_parquet("data/trips/all_trips.parquet")
print(f"Saved")

Reading green_tripdata_2019-01.parquet
Reading green_tripdata_2019-02.parquet
Reading green_tripdata_2019-03.parquet
Reading green_tripdata_2019-04.parquet
Reading green_tripdata_2019-05.parquet
Reading green_tripdata_2019-06.parquet
Reading green_tripdata_2019-07.parquet
Reading green_tripdata_2019-08.parquet
Reading green_tripdata_2019-09.parquet
Reading green_tripdata_2019-10.parquet
Reading green_tripdata_2019-11.parquet
Reading green_tripdata_2019-12.parquet
Reading yellow_tripdata_2019-01.parquet
Reading yellow_tripdata_2019-02.parquet
Reading yellow_tripdata_2019-03.parquet
Reading yellow_tripdata_2019-04.parquet
Reading yellow_tripdata_2019-05.parquet
Reading yellow_tripdata_2019-06.parquet
Reading yellow_tripdata_2019-07.parquet
Reading yellow_tripdata_2019-08.parquet
Reading yellow_tripdata_2019-09.parquet
Reading yellow_tripdata_2019-10.parquet
Reading yellow_tripdata_2019-11.parquet
Reading yellow_tripdata_2019-12.parquet
         pickup_time  trip_distance  PULocationID  D

# Prepare training of the model

## Read all data from combined parquet file and split training and test sets


### Local version

In [25]:
import pandas as pd, numpy as np

# Read all_trips.parquet
trips = pd.read_parquet("data/trips/all_trips.parquet")

# trips[trips.select_dtypes(np.int64).columns] = trips.select_dtypes(np.int64).astype(np.float32)
# trips[trips.select_dtypes(np.float64).columns] = trips.select_dtypes(np.float64).astype(np.float32)

# convert trip_time to int
# trips["trip_time"] = trips["trip_time"].astype(int)

print(trips.dtypes)
print(trips.shape)

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    trips[["pickup_month", "pickup_day", "pickup_time", "trip_distance", "PULocationID", "DOLocationID"]],
    trips[["trip_time"]],
    train_size=0.01,
    test_size=0.0025
    # test_size=0.33
    # test_size=0.25
)

del trips

# x_train = x_train.values.ravel()
# y_train = y_train.values.ravel()
# x_test = x_test.values.ravel()
# y_test = y_test.values.ravel()

# print(x_train)
# print(y_train)
# print(x_test)
# print(y_test)

pickup_time        int64
trip_distance    float64
PULocationID       int64
DOLocationID       int64
trip_time        float64
pickup_month       int64
pickup_day         int64
dtype: object
(88439287, 7)


### Colab version

In [6]:
# Mount Google Drive for saving model files to
from google.colab import drive
drive.mount('/content/gdrive')
gdrive = "/content/gdrive/My Drive/ML"

import pandas as pd, numpy as np

# Read all_trips.parquet
trips = pd.read_parquet("/content/gdrive/My Drive/ML/all_trips.parquet")

# # get ["pickup_time", "trip_distance", "PULocationID", "DOLocationID"] from trips as X values
# x_values = trips[["pickup_time", "trip_distance", "PULocationID", "DOLocationID"]].to_numpy()
# print(f"{len(x_values)} X values: \n{x_values}\n")

# # get ["trip_time"] from trips as y values
# y_values = trips[["trip_time"]].to_numpy()#.ravel()
# print(f"{len(y_values)} Y values: \n{y_values}\n")

trips = trips.iloc[: , 1:]

# trips[trips.select_dtypes(np.int64).columns] = trips.select_dtypes(np.int64).astype(np.float32)
# trips[trips.select_dtypes(np.float64).columns] = trips.select_dtypes(np.float64).astype(np.float32)

# remove all rows with months november and december
trips = trips[trips.pickup_time.dt.month != 11]
trips = trips[trips.pickup_time.dt.month != 12]

print(trips.dtypes)

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    trips[["pickup_time", "trip_distance", "PULocationID", "DOLocationID"]],
    trips[["trip_time"]],
    train_size=0.001,
    test_size=0.0001
)

del trips

# x_train = x_train.values.ravel()
# y_train = y_train.values.ravel()
# x_test = x_test.values.ravel()
# y_test = y_test.values.ravel()

# print(x_train)
# print(y_train)
# print(x_test)
# print(y_test)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
pickup_time        int64
trip_distance    float64
PULocationID       int64
DOLocationID       int64
trip_time        float64
dtype: object


# Training the model and finding the best parameters
The data was perceived to be most well suited for regression.
Due to time and computational constraints, some models were trained on a limited set of the data.
The best model chosen for the API ended up being the Multi-layer Perceptron (MLP) regression model.

## Linear Regression

In [26]:
from sklearn import linear_model

linReg = linear_model.LinearRegression()
linReg.fit(x_train, y_train)
print(f"Score: {linReg.score(x_test, y_test)}")

# random index between 0 and len(x_test)
sample_index = np.random.randint(0, len(x_test))
sample_x = x_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]["trip_time"]
print("Prediction:", linReg.predict(sample_x), "Actual:", sample_y)

Score: 0.6276667577253036
Prediction: [[537.62681455]] Actual: 786.0


## Ridge Regression

In [27]:
ridgeReg = linear_model.RidgeCV()
ridgeReg.fit(x_train, y_train)
print(f"Score: {ridgeReg.score(x_test, y_test)}")

sample_index = np.random.randint(0, len(x_test))
sample_x = x_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]["trip_time"]
print("Prediction:", ridgeReg.predict(sample_x)[0][0], "Actual:", sample_y)

Score: 0.6275057234564818
Prediction: 704.155424207627 Actual: 472.00000000000006


## ElasticNetCV

In [9]:
enReg = linear_model.ElasticNetCV(n_jobs=-1, verbose=1)
enReg.fit(x_train, y_train)
print(f"Score: {enReg.score(x_test, y_test.values.ravel())}")

sample_index = np.random.randint(0, len(x_test))
sample_x = x_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]["trip_time"]
print("Prediction:", enReg.predict(sample_x), "Actual:", sample_y)

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Score: 0.04450204675955771
Prediction: [882.73458109] Actual: 516.0


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


In [3]:
from sklearn import linear_model
menReg = linear_model.MultiTaskElasticNetCV(n_jobs=-1, verbose=1)
menReg.fit(x_train, y_train)
print(f"Score: {menReg.score(x_test, y_test)}")

sample_index = np.random.randint(0, len(x_test))
sample_x = x_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]["trip_time"]
print("Prediction:", menReg.predict(sample_x), "Actual:", sample_y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
...................................................................................................................................................................................................................................................................................................................

## KNeighborsRegressor
This was too slow and it was not possible to train within the time and computation constraints.

In [6]:
from sklearn import neighbors
knReg = neighbors.KNeighborsRegressor(n_jobs=-1)
knReg.fit(x_train, y_train)
print(f"Score: {knReg.score(x_test, y_test)}")

sample_index = np.random.randint(0, len(x_test))
sample_x = x_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]["trip_time"]
print("Prediction:", knReg.predict(sample_x), "Actual:", sample_y)

## Poisson Regression

In [None]:
from sklearn import linear_model
menReg = linear_model.PoissonRegressor(verbose=1)
menReg.fit(x_train, y_train)
print(f"Score: {menReg.score(x_test, y_test)}")

sample_index = np.random.randint(0, len(x_test))
sample_x = x_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]["trip_time"]
print("Prediction:", menReg.predict(sample_x), "Actual:", sample_y)

## Multi-layer Perceptron Regression

In [28]:
from sklearn import neural_network
mlpReg = neural_network.MLPRegressor(
    verbose=1,
    learning_rate='adaptive',
    # max_iter=100,
    learning_rate_init=0.001,
    alpha=0.001,
)
mlpReg.fit(x_train, y_train.values.ravel())
print(f"Score: {mlpReg.score(x_test, y_test.values.ravel())}")

sample_index = np.random.randint(0, len(x_test))
sample_x = x_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]["trip_time"]
print("Prediction:", mlpReg.predict(sample_x), "Actual:", sample_y)

Iteration 1, loss = 603902.23649265
Iteration 2, loss = 114575.46234489
Iteration 3, loss = 108612.96321159
Iteration 4, loss = 107229.74394620
Iteration 5, loss = 108529.61181290
Iteration 6, loss = 106795.44123530
Iteration 7, loss = 104867.11804955
Iteration 8, loss = 108215.66058823
Iteration 9, loss = 107493.96079316
Iteration 10, loss = 104902.14337044
Iteration 11, loss = 104808.91324947
Iteration 12, loss = 103377.79043832
Iteration 13, loss = 107880.52672698
Iteration 14, loss = 105032.30673870
Iteration 15, loss = 105573.82625707
Iteration 16, loss = 104894.59142544
Iteration 17, loss = 102270.74201284
Iteration 18, loss = 101082.01679041
Iteration 19, loss = 98707.32115689
Iteration 20, loss = 100762.65509327
Iteration 21, loss = 97588.84205214
Iteration 22, loss = 103801.77537778
Iteration 23, loss = 102802.22701319
Iteration 24, loss = 103817.21683867
Iteration 25, loss = 103824.66804022
Iteration 26, loss = 102855.91723405
Iteration 27, loss = 98875.21886620
Iteration 28,

In [31]:
sample_index = np.random.randint(0, len(x_test))
sample_x = x_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]["trip_time"]
print("Prediction:", mlpReg.predict(sample_x), "Actual:", sample_y)


Prediction: [601.69966873] Actual: 457.00000000000006


In [33]:
# Save the model into file
import pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(mlpReg, file)

## SGDRegressor

In [10]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

sgdReg = linear_model.SGDRegressor(
    learning_rate='adaptive',
    # eta0=0.005,
    # fit_intercept=True,
    # shuffle=False,
    tol=0.0,
    # penalty='none',
    verbose=5,
)
sgdReg.fit(x_train, y_train.values.ravel())
sgdReg.score(x_test, y_test.values.ravel())

# random index between 0 and len(x_test)
sample_index = np.random.randint(0, len(x_test))
sample_x = x_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]["trip_time"]
print("Prediction:", sgdReg.predict(sample_x)[0], "Actual:", sample_y)

-- Epoch 1
Norm: 166359269140685.22, NNZs: 6, Bias: 929253359007.812012, T: 88439, Avg. loss: 209928981852802224863827726776559206400.000000
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 344503627844128.25, NNZs: 6, Bias: 1519253359007.812012, T: 176878, Avg. loss: 210386830200109396387626688519126122496.000000
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 488647064581890.19, NNZs: 6, Bias: 1449253359007.812012, T: 265317, Avg. loss: 210495233601530957594338919881428172800.000000
Total training time: 0.04 seconds.
-- Epoch 4
Norm: 770074021668948.50, NNZs: 6, Bias: 119253359007.812012, T: 353756, Avg. loss: 210335124516733050429285075277693583360.000000
Total training time: 0.06 seconds.
-- Epoch 5
Norm: 436936759733664.00, NNZs: 6, Bias: 129253359007.812012, T: 442195, Avg. loss: 211172396561981067723942919729888362496.000000
Total training time: 0.07 seconds.
-- Epoch 6
Norm: 209261755928090.34, NNZs: 6, Bias: -80746640992.187988, T: 530634, Avg. loss: 2103721735799127881

## Random Forest
Too time and computationally expensive to train

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

ranForReg = RandomForestRegressor(n_jobs=-1)
ranForReg.fit(x_train, y_train.values.ravel())
print(f"{ranForReg.score(x_test, y_test)}")
print("Prediction:", ranForReg.predict(x_test.head(1))[0], "Actual:", y_test.iloc[0]['trip_time'])

### Finding best depth for random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

maxScore = 0
maxDepth = 1
depthLimit = 12
estimatorsLimit = 1000

for x in range(2, depthLimit):
  # for y in range(2, estimatorsLimit):
  ranForRegOpt = RandomForestRegressor(max_depth=x, random_state=0, n_jobs=-1)
  ranForRegOpt.fit(x_train, y_train.values.ravel())
  score = ranForRegOpt.score(x_test, y_test)
  if score > maxScore:
    maxScore = score
    maxDepth = x

print(f"{maxDepth}: {maxScore}")
print("Prediction:", ranForReg.predict(x_test.head(1)), "Actual:", y_test.iloc[0]['trip_time'])

# Misc testing when training

## PyTorch

In [None]:
import torch
from torch.autograd import Variable
import numpy as np

x_train = np.array(x_train, dtype=np.float32)
# x_train = x_train.reshape(-1, 1)

y_train = np.array(y_train, dtype=np.float32)
# y_train = y_train.reshape(-1, 1)

print(x_train.shape)
print(y_train.shape)

class linearRegression(torch.nn.Module):
    def __init__(self, inputSize, outputSize):
        super(linearRegression, self).__init__()
        self.linear = torch.nn.Linear(inputSize, outputSize)

    def forward(self, x):
        out = self.linear(x)
        return out

inputDim = 1        # takes variable 'x' 
outputDim = 1       # takes variable 'y'
learningRate = 0.01 
epochs = 100

model = linearRegression(inputDim, outputDim)
##### For GPU #######
if torch.cuda.is_available():
    model.cuda()

criterion = torch.nn.MSELoss() 
optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)

for epoch in range(epochs):
    # Converting inputs and labels to Variable
    if torch.cuda.is_available():
        inputs = Variable(torch.from_numpy(x_train).cuda())
        labels = Variable(torch.from_numpy(y_train).cuda())
    else:
        inputs = Variable(torch.from_numpy(x_train))
        labels = Variable(torch.from_numpy(y_train))

    # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
    optimizer.zero_grad()

    # get output from the model, given the inputs
    outputs = model(inputs)

    # get loss for the predicted output
    loss = criterion(outputs, labels)
    print(loss)
    # get gradients w.r.t to parameters
    loss.backward()

    # update parameters
    optimizer.step()

    print('epoch {}, loss {}'.format(epoch, loss.item()))

(80987946, 4)
(80987946, 1)


RuntimeError: ignored

## CUML
Together with the GPU instances provided from Google Colab sessions, some training with GPU was tested to see whether that would yield in faster computation.
Neural Networks would perhaps benefit the most from the GPU however there was not enough time to test that.

In [None]:
!nvidia-smi

Mon Jun  6 14:38:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/env-check.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 300, done.[K
remote: Counting objects: 100% (129/129), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 300 (delta 74), reused 99 (delta 55), pack-reused 171[K
Receiving objects: 100% (300/300), 87.58 KiB | 8.76 MiB/s, done.
Resolving deltas: 100% (136/136), done.
***********************************************************************
Woo! Your instance has the right kind of GPU, a Tesla T4!
***********************************************************************



In [None]:
# This will update the Colab environment and restart the kernel.  Don't run the next cell until you see the session crash.
!bash rapidsai-csp-utils/colab/update_gcc.sh
import os
os._exit(00)

Updating your Colab environment.  This will restart your kernel.  Don't Panic!
Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:5 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:12 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [2,799 kB]
Get:13 http:

In [None]:
# This will install CondaColab.  This will restart your kernel one last time.  Run this cell by itself and only run the next cell once you see the session crash.
import condacolab
condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/latest/download/Mambaforge-colab-Linux-x86_64.sh...
📦 Installing...


In [None]:
# you can now run the rest of the cells as normal
import condacolab
condacolab.check()

✨🍰✨ Everything looks OK!


In [3]:
# Installing RAPIDS is now 'python rapidsai-csp-utils/colab/install_rapids.py <release> <packages>'
# The <release> options are 'stable' and 'nightly'.  Leaving it blank or adding any other words will default to stable.
!python rapidsai-csp-utils/colab/install_rapids.py stable
import os
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
os.environ['CONDA_PREFIX'] = '/usr/local'

Found existing installation: cffi 1.14.5
Uninstalling cffi-1.14.5:
  Successfully uninstalled cffi-1.14.5
Found existing installation: cryptography 3.4.5
Uninstalling cryptography-3.4.5:
  Successfully uninstalled cryptography-3.4.5
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cffi==1.15.0
  Downloading cffi-1.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (427 kB)
Installing collected packages: cffi
Successfully installed cffi-1.15.0
Installing RAPIDS Stable 21.12
Starting the RAPIDS install on Colab.  This will take about 15 minutes.
Collecting package metadata (current_repodata.json): ...working... done
failed with initial frozen solve. Retrying with flexible solve.
failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): ...working... done
done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
   

In [None]:
import pandas as pd
trips = pd.read_parquet("/content/gdrive/My Drive/ML/all_trips.parquet")
trips2 = pd.read_parquet("/content/gdrive/My Drive/ML/all_trips.parquet")

In [9]:
del trips

In [10]:
del x_train, x_test, y_train, y_test

In [1]:
# Mount Google Drive for saving model files to
from pathlib import Path
from google.colab import drive
drive.mount('/content/gdrive')
gdrive = "/content/gdrive/My Drive/ML"

import cuml, cudf, pandas as pd, numpy as np, cupy as cp
trips = cudf.read_parquet("/content/gdrive/My Drive/ML/all_trips.parquet")

trips = trips.iloc[: , 1:]

trips[trips.select_dtypes(np.int64).columns] = trips.select_dtypes(np.int64).astype(np.float32)
trips[trips.select_dtypes(np.float64).columns] = trips.select_dtypes(np.float64).astype(np.float32)
print(trips.dtypes)

x_train, x_test, y_train, y_test = cuml.model_selection.train_test_split(
    trips,
    'trip_time',
    train_size=0.01,
    test_size=0.001,
)

del trips

print(f"x_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"y_test: {x_test.shape}, y_test: {y_test.shape}")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
pickup_time      float32
trip_distance    float32
PULocationID     float32
DOLocationID     float32
trip_time        float32
dtype: object
x_train: (899866, 4), y_train: (899866,)
y_test: (89986, 4), y_test: (89986,)


In [11]:
model = cuml.LinearRegression(fit_intercept=True, normalize=True, verbose=True)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.021855056285858154

In [12]:
model = cuml.ensemble.RandomForestRegressor(verbose=True)
model.fit(x_train, y_train)
model.score(x_test, y_test)
# import pickle
# pickle.dump(model, open("/content/gdrive/My Drive/ML/ranForReg", 'wb'))

[D] [16:35:15.715236] cuml/common/logger.cpp:3152 Expected column ('F') major order, but got the opposite. Converting data, this will result in additional memory utilization.
[D] [16:35:15.715913] cuml/common/logger.cpp:3152 Expected column ('F') major order, but got the opposite. Converting data, this will result in additional memory utilization.


-0.050472140312194824

In [13]:
model = cuml.linear_model.Lasso(verbose=True)
model.fit(x_train, y_train)
model.score(x_test, y_test)

[D] [16:35:15.743773] cuml/common/logger.cpp:3152 Expected column ('F') major order, but got the opposite. Converting data, this will result in additional memory utilization.


0.021855950355529785

In [4]:
model = cuml.svm.LinearSVR()
model.fit(x_train, y_train)
model.score(x_test, y_test)

[W] [15:28:59.482750] L-BFGS line search failed (code 1); stopping at the last valid step


-0.06486129760742188

In [6]:
model = cuml.linear_model.Ridge()
model.fit(x_train, y_train)
model.score(x_test, y_test)

nan

In [10]:
model = cuml.solvers.SGD(
    learning_rate='constant',
    eta0=0.005,
    epochs=2,
    fit_intercept=True,
    batch_size=1000,
    shuffle=False,
    tol=0.0,
    penalty='none',
    loss='squared_loss',
    verbose=5,
)
model.fit(x_train, y_train)
# model.score(x_test, y_test)
p = model.predict(x_test[0:2])
# p = model.predict(cp.array(x_train.as_gpu_matrix()))
print(p)
# print(y_test[0:2])
# print(y_test.shape)
# print(x_test[0:2])
# print(f"Prediction: {p}, Expected: {y_test[0]}")

55903593    <NA>
14879120    <NA>
dtype: float32


In [None]:
model = cuml.linear_model.MBSGDRegressor(
    learning_rate='constant',
    eta0=0.005,
    epochs=2,
    fit_intercept=True,
    batch_size=1000,
    shuffle=False,
    tol=0.0,
    penalty='none',
    loss='squared_loss',
    verbose=5,
)
model.fit(x_train, y_train)
model.score(x_test, y_test)
# import pickle
# pickle.dump(model, open("/content/gdrive/My Drive/ML/mbsgd", 'wb'))