# ## Laboration 3 - Travel Time Service
Made by: Robin Jamsahar, Mohamed Osman
TIDAA3 - KTH
2022-06-10

In [502]:
#pip install fastparquet
#pip install pyarrow

In [503]:
# import all the necessary packages
import pyarrow.parquet as pq # for reading parquet files
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read.parquet)
from pathlib import Path # to manipulate paths
import datetime as dt # for datetime objects

# ## Cleaning the data
Here we clean the data. The function takes a dataframe and a type flag indicating which kind of file it came from (1 = yellow data file, 2 = green data file).
We then apply a few rules for what counts as a good data point. Lastly, we return the cleaned dataframe.

In [504]:
# Timestamp column prefix used by each taxi type (1 = yellow, 2 = green).
_DATETIME_PREFIX_BY_TYPE = {1: 'tpep', 2: 'lpep'}

_MIN_FARE = 2.5            # fares must be strictly greater than this (in $)
_MAX_LOCATION_ID = 264     # location IDs must satisfy 0 < id < 264
_MAX_TRIP_SECONDS = 20000  # trips must be shorter than this (about 5.5 hours)


def _seconds_since_midnight(timestamps):
    """Return the time of day of each timestamp as float seconds since midnight."""
    return (timestamps - timestamps.dt.normalize()).dt.total_seconds()


def clean_data(df,type):
    """Filter a raw trip dataframe and reduce it to the columns used for modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip records. Must contain ``fare_amount``, ``trip_distance``,
        ``PULocationID``, ``DOLocationID`` and the pickup/dropoff timestamp
        columns for the given type (``tpep_*`` for yellow, ``lpep_*`` for green).
    type : int
        1 for yellow data, 2 for green data. (The name shadows the builtin but is
        kept so that existing callers keep working.)

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the columns ``pickup_time``, ``dropoff_time``
        (both seconds since midnight), ``trip_distance``, ``PULocationID``,
        ``DOLocationID`` and ``trip_time`` (seconds). The original index labels
        of the surviving rows are preserved and the input is not modified.
        For any other ``type`` value the input is returned unchanged.

    A row is kept only if all of the following hold:
      * fare_amount > 2.5
      * pickup is strictly before dropoff (so trip_time > 0)
      * trip_time < 20000 seconds
      * 0 < PULocationID < 264 and 0 < DOLocationID < 264
      * trip_distance != 0
    """
    prefix = _DATETIME_PREFIX_BY_TYPE.get(type)
    if prefix is None:
        # Unknown type: nothing is cleaned, the frame is handed back as-is.
        return df

    pickup = pd.to_datetime(df[f'{prefix}_pickup_datetime'])
    dropoff = pd.to_datetime(df[f'{prefix}_dropoff_datetime'])
    trip_time = (dropoff - pickup).dt.total_seconds()

    # One boolean mask instead of a chain of filtered copies: this avoids
    # assigning into slices (chained-assignment warnings) and avoids dropping
    # rows by index label, which would over-delete if labels were duplicated.
    # NOTE(review): the strict '>' also removes fares of exactly 2.5 - confirm intended.
    keep = (
        (df['fare_amount'] > _MIN_FARE)
        & (pickup < dropoff)
        & (trip_time < _MAX_TRIP_SECONDS)
        & (df['PULocationID'] > 0) & (df['PULocationID'] < _MAX_LOCATION_ID)
        & (df['DOLocationID'] > 0) & (df['DOLocationID'] < _MAX_LOCATION_ID)
        & (df['trip_distance'] != 0.0)
    )

    cleaned = pd.DataFrame({
        'pickup_time': _seconds_since_midnight(pickup[keep]),
        'dropoff_time': _seconds_since_midnight(dropoff[keep]),
        'trip_distance': df.loc[keep, 'trip_distance'],
        'PULocationID': df.loc[keep, 'PULocationID'],
        'DOLocationID': df.loc[keep, 'DOLocationID'],
        'trip_time': trip_time[keep],
    })
    cleaned.info() # print a summary of the cleaned dataframe
    return cleaned

In [505]:
def missing_cols(df): #function to check for missing columns
    '''Report, per column, how many values are missing and what share that is.

    For every column that has at least one missing value a line of the form
    ``<column> => <count>,<percent>%`` is printed (percent rounded to two
    decimals). If no column has any missing value, a single confirmation
    line is printed instead. Nothing is returned.
    '''
    found_missing = False
    for name in df.columns:
        is_null = df[name].isna() # null mask for this column, reused for count and share
        n_missing = is_null.sum()
        if n_missing == 0:
            continue # complete column: nothing to report
        found_missing = True
        share = is_null.mean() * 100 # percentage of rows that are missing
        print(f"{name} => {n_missing},{round(share,2)}%")

    if not found_missing:
        print("no missing values left")

# ## Train Test Split
Here we first import the necessary packages. Then we copy our dataframe and standardize its columns (subtract the mean and divide by the standard deviation).
We split the data into training and testing sets with the train_test_split function from sklearn.model_selection, then train our model on the training data and test it on the testing data.
We use a few metrics to evaluate how well the model does. Lastly, we print the error metrics (MSE, MAE, RMSE) and the R² score (labelled "Accuracy" in the output), and return the Linear Regression object that will be used in the API in later stages.
Linear Regression was the only algorithm that was reasonable to use for this amount of data; anything else would take too long to run. Other algorithms were tried, such as Random Forest Regressor, Logistic Regression and SVC, but these took an unreasonable amount of time to run, and the latter two could not be used at all because they only work for classification, not regression. The Random Forest Regressor was tested on only two files; it gave pretty good accuracy, but only a fraction of a percentage point more than Linear Regression, and it took minutes to train.

In [506]:
from sklearn import preprocessing
from sklearn import utils
def train(df):
    """Fit a linear regression that predicts trip time and print its test metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``pickup_time``, ``trip_distance``,
        ``PULocationID``, ``DOLocationID`` and ``trip_time``. It is not modified.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The model fitted on the training split.

    Notes
    -----
    * Every feature AND the target are z-scored with the mean/std of the whole
      input before the split, so the printed MSE/MAE/RMSE are in standardized
      units and the returned model maps standardized features to a standardized
      trip time. The statistics used for scaling are not returned.
    * The split is 67% train / 33% test with ``random_state=1``.
    """
    # Only the sklearn pieces that are actually used are imported; the previous
    # unused imports (matplotlib, SVC, ROC helpers, ...) added needless dependencies.
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    feature_cols = ["pickup_time","trip_distance","PULocationID","DOLocationID"]
    target_col = "trip_time"

    # Standardize column by column: (x - mean) / std.
    standardized = pd.DataFrame({
        col: (df[col] - df[col].mean()) / df[col].std()
        for col in feature_cols + [target_col]
    })

    X = np.asarray(standardized[feature_cols])
    Y = np.asarray(standardized[target_col])

    X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.33,random_state = 1)

    lr = LinearRegression()
    lr.fit(X_train,Y_train)
    Y_pred = lr.predict(X_test)

    mse = mean_squared_error(Y_test, Y_pred) # computed once, reused for the RMSE
    print("Linear Regression:")
    # The value labelled "Accuracy" is the R^2 score, not a classification accuracy.
    print("Accuracy: ",r2_score(Y_test, Y_pred))
    print("MSE: ",mse)
    print("MAE: ",mean_absolute_error(Y_test, Y_pred))
    print("RMSE: ",np.sqrt(mse))

    return lr


# ## File iteration
In this segment of code we iterate through the files in the directory and read each 2019 yellow or green trip data file into a dataframe. We then clean each dataframe and combine them all into one full dataframe.

In [507]:
import os #import the os module
from tqdm import tqdm #import tqdm to show progress of the loop
from IPython.display import clear_output #import clear_output to clear the output of the loop
directory = 'data/trips' # folder that holds the monthly trip parquet files

# File-name prefix -> type code passed to clean_data (1 = yellow, 2 = green).
taxi_type_by_prefix = {
    'yellow_tripdata_2019-': 1,
    'green_tripdata_2019-': 2,
}

# Collect the cleaned frames in a list and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies all previously loaded rows on every iteration.
cleaned_frames = []
for filename in tqdm(os.listdir(directory)): # progress bar over every file in the folder
    for prefix, taxi_type in taxi_type_by_prefix.items():
        if filename.startswith(prefix):
            trips_raw = pd.read_parquet(f"{directory}/{filename}")
            cleaned_frames.append(clean_data(trips_raw, taxi_type))
            break # a file name matches at most one prefix
    clear_output(wait=True) # discard the per-file output so only the latest progress is shown

# pd.concat raises on an empty list, so fall back to an empty frame when no
# matching file was found. Index labels are kept as they are in each file.
full_df = pd.concat(cleaned_frames) if cleaned_frames else pd.DataFrame()

100%|██████████| 2/2 [00:12<00:00,  6.41s/it]


In [508]:
display(full_df) # rich display of the combined dataframe
missing_cols(full_df) # report the columns that still contain missing values, if any

Unnamed: 0,pickup_time,dropoff_time,trip_distance,PULocationID,DOLocationID,trip_time
1,616.0,992.0,0.86,97,49,376.0
2,1631.0,1898.0,0.66,49,189,267.0
3,2780.0,3894.0,2.68,189,17,1114.0
4,1146.0,2383.0,4.53,82,258,1237.0
5,755.0,1149.0,1.05,49,17,394.0
...,...,...,...,...,...,...
7696609,83400.0,84480.0,9.38,226,42,1080.0
7696610,83820.0,85140.0,6.73,136,51,1320.0
7696611,83640.0,84540.0,5.45,41,136,900.0
7696613,84480.0,85850.0,12.43,48,213,1370.0


no missing values left


In [509]:
# Remove the dropoff time (it is not one of the model's features) and any rows
# with missing values. The frame is rebound instead of mutated in place, and
# errors="ignore" is used, so this cell can be re-run safely: the in-place
# version raises a KeyError on a second run because the column is already gone.
full_df = full_df.drop(columns="dropoff_time", errors="ignore").dropna()
missing_cols(full_df) # confirm that no missing values remain
display(full_df) # show the dataframe that will be used for training

no missing values left


Unnamed: 0,pickup_time,trip_distance,PULocationID,DOLocationID,trip_time
1,616.0,0.86,97,49,376.0
2,1631.0,0.66,49,189,267.0
3,2780.0,2.68,189,17,1114.0
4,1146.0,4.53,82,258,1237.0
5,755.0,1.05,49,17,394.0
...,...,...,...,...,...
7696609,83400.0,9.38,226,42,1080.0
7696610,83820.0,6.73,136,51,1320.0
7696611,83640.0,5.45,41,136,900.0
7696613,84480.0,12.43,48,213,1370.0


In [510]:
lr = train(full_df) # fit the linear regression on the cleaned data; train() also prints the test metrics

Linear Regression:
Accuracy:  0.635517527275048
MSE:  0.36425706392251733
MAE:  0.40977587192386017
RMSE:  0.6035371272113401


# ## Pickle the Linear Regression object
In this segment of code we pickle the Linear Regression object so that we can use it later in our API.

In [511]:
import pickle # used to serialize the fitted model to disk
# NOTE: train() fits the model on z-scored features and a z-scored target, so the
# pickled model works in standardized units; the scaling statistics are not saved here.
with open('model.pickle', 'wb') as f: # binary write mode; the file is closed when the block exits
    pickle.dump(lr, f) # write the fitted Linear Regression object to model.pickle

# ## Conclusion and reflections
In this project we used Linear Regression. This algorithm was chosen because it was the most efficient and was able to predict the trip time fairly accurately and in a short amount of time. Something we noticed during the cleaning and training for the months of November and December was that the data was not usable and could not be cleaned any further. The data clearly had some issues and brought the accuracy down from 64% to a low 3%. Therefore, that data was excluded from the project. As mentioned in the previous section, Linear Regression was the most efficient and accurate algorithm that worked well, and it was able to predict the trip time within a reasonable timeframe. Many models were tried, but Linear Regression came out on top.