In [43]:
#pip install fastparquet
#pip install pyarrow

In [44]:
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
from pathlib import Path
import datetime

First, we have to combine our data files, which are stored in the Parquet format.

In [45]:
# data_dir = Path('data/trips/tripdata')
# full_df = pd.concat(
#     pd.read_parquet(parquet_file)
#     for parquet_file in data_dir.glob('*.parquet')
# )
# full_df.to_csv('tripdata.csv')

In [46]:
def clean_data(data, type, verbose=True):
    """Filter raw trip records and reduce them to the columns used for modelling.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw trip records. Must contain ``fare_amount``, ``trip_distance``,
        ``PULocationID``, ``DOLocationID`` and the pickup/dropoff datetime
        columns selected by ``type``.
    type : int
        ``1`` reads the ``tpep_pickup_datetime`` / ``tpep_dropoff_datetime``
        columns, ``2`` reads the ``lpep_*`` equivalents. Any other value
        returns ``data`` unchanged.
    verbose : bool, default True
        When true, print ``DataFrame.info()`` for the cleaned frame.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the columns
        ``pickup_time``, ``dropoff_time``, ``trip_distance``,
        ``PULocationID``, ``DOLocationID`` and ``trip_time``.
        ``pickup_time`` / ``dropoff_time`` are seconds elapsed since midnight
        of their own calendar day; ``trip_time`` is dropoff minus pickup in
        seconds.

    Rows are kept only when ``fare_amount > 2.5``, pickup is strictly before
    dropoff, both location ids lie strictly between 0 and 264,
    ``trip_time < 20000`` and ``trip_distance != 0``.
    """
    if type == 1:
        datetime_prefix = 'tpep'
    elif type == 2:
        datetime_prefix = 'lpep'
    else:
        return data

    pickup_col = f'{datetime_prefix}_pickup_datetime'
    dropoff_col = f'{datetime_prefix}_dropoff_datetime'
    kept_cols = [pickup_col, dropoff_col, 'trip_distance', 'PULocationID', 'DOLocationID']

    # Explicit copy: the columns below are overwritten, and assigning into a
    # filtered slice of the caller's frame triggers SettingWithCopyWarning.
    trips = data.loc[data['fare_amount'] > 2.5, kept_cols].copy()

    pickup = pd.to_datetime(trips[pickup_col])
    dropoff = pd.to_datetime(trips[dropoff_col])
    trip_time = (dropoff - pickup).dt.total_seconds()

    # Replace the timestamps by "seconds since midnight" of their own day.
    trips[pickup_col] = (pickup - pickup.dt.normalize()).dt.total_seconds()
    trips[dropoff_col] = (dropoff - dropoff.dt.normalize()).dt.total_seconds()
    trips['trip_time'] = trip_time
    trips = trips.rename(columns={pickup_col: 'pickup_time', dropoff_col: 'dropoff_time'})

    # A single boolean mask instead of drop-by-index-label: label-based drops
    # also remove valid rows whenever the index contains duplicate labels.
    # Rows with unparseable/missing timestamps compare False and are removed.
    keep = (
        (pickup < dropoff)
        & (trips['PULocationID'] > 0) & (trips['PULocationID'] < 264)
        & (trips['DOLocationID'] > 0) & (trips['DOLocationID'] < 264)
        & (trip_time != 0)
        & (trip_time < 20000)
        & (trips['trip_distance'] != 0.0)
    )
    trips = trips[keep]

    if verbose:
        trips.info()
    return trips

In [47]:
# def cleanValues(df):
#     df = df[(df.PULocationID < 264) & (df.PULocationID > 0)]
#     df = df[(df.DOLocationID < 264) & (df.DOLocationID > 0)]
#     df = df[df.trip_time != 0]
#     df = df[df.trip_time < 20000]
#     df = df[df.trip_distance != 0.0]
#     return df

In [48]:
def missing_cols(df):
    '''Print one line per column that has missing values (count and percentage).

    If no column contains a missing value, a single confirmation line is
    printed instead. Nothing is returned.
    '''
    found_missing = False
    for name in df.columns:
        null_mask = df[name].isna()
        n_missing = null_mask.sum()
        if n_missing == 0:
            continue
        found_missing = True
        share = null_mask.mean() * 100
        print(f"{name} => {n_missing},{round(share,2)}%")

    if not found_missing:
        print("no missing values left")

The only algorithm that was reasonable to use for this amount of data was Linear Regression. Other algorithms, such as a Random Forest Regressor, were also tried, but they took an unreasonable amount of time to run.

In [49]:
def train(data):
    """Fit a linear regression that predicts trip duration from trip features.

    The features ``pickup_time``, ``trip_distance``, ``PULocationID`` and
    ``DOLocationID`` and the target ``trip_time`` are each z-scored with the
    mean and standard deviation of the full input frame. The rows are then
    split 67/33 (``random_state=1``), the model is fitted on the larger part,
    and R^2 and mean squared error on the smaller part are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the five columns named above. The frame itself is not
        modified.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted model. It was fitted on z-scored inputs and a z-scored
        target, so the printed error is in standardized units as well.

    Raises
    ------
    ValueError
        If one of the five columns has a zero or undefined standard
        deviation (constant column, fewer than two rows): z-scoring would
        produce only NaN values, which the estimator rejects anyway.
    """
    feature_cols = ["pickup_time", "trip_distance", "PULocationID", "DOLocationID"]
    target_col = "trip_time"

    scaled = data.copy()
    for col in feature_cols + [target_col]:
        spread = data[col].std()
        # `not spread > 0` also catches a NaN standard deviation.
        if not spread > 0:
            raise ValueError(
                f"cannot standardize column '{col}': standard deviation is {spread}"
            )
        scaled[col] = (data[col] - data[col].mean()) / spread

    # Imported only after the cheap input validation above has passed.
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.linear_model import LinearRegression

    X = np.asarray(scaled[feature_cols])
    Y = np.asarray(scaled[target_col])

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=1)

    lr = LinearRegression()
    lr = lr.fit(X_train, Y_train)
    Y_hat = lr.predict(X_test)
    print("Linear Regression:")
    # The value printed under the "Accuracy" label is the R^2 score.
    print("Accuracy: ",r2_score(Y_test, Y_hat))
    print("Error: ",mean_squared_error(Y_test, Y_hat))
    return lr

In [50]:

import os
from tqdm import tqdm
from IPython.display import clear_output

directory = 'data/trips'

# Filename prefix -> `type` code passed to clean_data
# (1 selects the tpep_* datetime columns, 2 the lpep_* ones).
trip_file_types = {
    'yellow_tripdata_2019-': 1,
    'green_tripdata_2019-': 2,
}

# Collect the cleaned monthly frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copies all previously loaded rows on every iteration.
# NOTE(review): the original clean_data calls were tagged "#Incomplete" --
# confirm whether further cleaning was still planned.
cleaned_frames = []

# os.listdir returns names in arbitrary order; sorting makes the row order
# (and with it the later seeded train/test split) reproducible.
for filename in tqdm(sorted(os.listdir(directory))):
    for prefix, trip_type in trip_file_types.items():
        if filename.startswith(prefix):
            df = pd.read_parquet(f"{directory}/{filename}")
            cleaned_frames.append(clean_data(df, trip_type))
            break
    # Replace the previous iteration's cell output instead of accumulating it.
    clear_output(wait=True)

# Same result as before when no file matched: an empty frame.
full_df = pd.concat(cleaned_frames) if cleaned_frames else pd.DataFrame()

#pd.show_versions()
# ytd = pq.read_table("data/trips/tripdata/yellow_tripdata_2019-01.parquet").to_pandas()
# gtd = pq.read_table("data/trips/tripdata/green_tripdata_2019-01.parquet").to_pandas()
# fhvtd = pq.read_table("data/trips/tripdata/fhv_tripdata_2019-01.parquet").to_pandas()


100%|██████████| 20/20 [02:01<00:00,  6.09s/it]


In [51]:
# Preview the combined frame and report any columns that still contain missing values.
display(full_df)
missing_cols(full_df)

Unnamed: 0,pickup_time,dropoff_time,trip_distance,PULocationID,DOLocationID,trip_time
1,616.0,992.0,0.86,97,49,376.0
2,1631.0,1898.0,0.66,49,189,267.0
3,2780.0,3894.0,2.68,189,17,1114.0
4,1146.0,2383.0,4.53,82,258,1237.0
5,755.0,1149.0,1.05,49,17,394.0
...,...,...,...,...,...,...
7213886,84000.0,86160.0,19.53,77,119,2160.0
7213887,85320.0,420.0,4.96,238,137,1500.0
7213888,85835.0,525.0,4.63,186,166,1090.0
7213889,84566.0,86355.0,17.49,45,122,1789.0


no missing values left


In [52]:
# Drop every row that still has a missing value, then re-check and preview the result.
full_df = full_df.dropna()
missing_cols(full_df)
display(full_df)

no missing values left


Unnamed: 0,pickup_time,dropoff_time,trip_distance,PULocationID,DOLocationID,trip_time
1,616.0,992.0,0.86,97,49,376.0
2,1631.0,1898.0,0.66,49,189,267.0
3,2780.0,3894.0,2.68,189,17,1114.0
4,1146.0,2383.0,4.53,82,258,1237.0
5,755.0,1149.0,1.05,49,17,394.0
...,...,...,...,...,...,...
7213886,84000.0,86160.0,19.53,77,119,2160.0
7213887,85320.0,420.0,4.96,238,137,1500.0
7213888,85835.0,525.0,4.63,186,166,1090.0
7213889,84566.0,86355.0,17.49,45,122,1789.0


In [53]:
# Repeats the missing-value report and preview on the frame after the dropna step.
missing_cols(full_df)
display(full_df)

no missing values left


Unnamed: 0,pickup_time,dropoff_time,trip_distance,PULocationID,DOLocationID,trip_time
1,616.0,992.0,0.86,97,49,376.0
2,1631.0,1898.0,0.66,49,189,267.0
3,2780.0,3894.0,2.68,189,17,1114.0
4,1146.0,2383.0,4.53,82,258,1237.0
5,755.0,1149.0,1.05,49,17,394.0
...,...,...,...,...,...,...
7213886,84000.0,86160.0,19.53,77,119,2160.0
7213887,85320.0,420.0,4.96,238,137,1500.0
7213888,85835.0,525.0,4.63,186,166,1090.0
7213889,84566.0,86355.0,17.49,45,122,1789.0


In [54]:
# Fit the linear model on the cleaned trips; train() prints its R^2 and MSE on a held-out split.
lr = train(full_df)


Linear Regression:
Accuracy:  0.6325533423972198
Error:  0.3671408139598347


In [58]:
import pickle

# Persist the fitted estimator. It was fitted on z-scored features and a
# z-scored target, so it expects inputs standardized the same way as in train().
with open('model.pickle', mode='wb') as model_file:
    pickle.dump(lr, model_file)