In [318]:
import pandas as pd
import numpy as np
from scipy.stats import t

In [319]:
def compute_trip_duration(df):
    # Initialize the new df
    result_df = df.copy()
    
    # convert the pickup_datetime and dropoff_datetime to datetime type to use their values to calculate the trip duration
    # here i used pickup_datetime and dropoff_datetime to avoid editing the datatypes in the df
    pickup_datetime = pd.to_datetime(result_df['pickup_datetime'])
    dropoff_datetime = pd.to_datetime(result_df['dropoff_datetime'])
    
    # calculate the trip duration
    result_df['trip duration'] = (dropoff_datetime -  pickup_datetime).dt.total_seconds()

    return result_df

In [320]:
def add_hour_and_day(df):
    # Initialize the new df
    result_df = df.copy()
    
    # convert the pickup_datetime to datetime type to extract the values
    pickup_datetime = pd.to_datetime(result_df['pickup_datetime'])
    
    # a map to map the numbers with day name
    # dayofweek return numbers (0-6) that represents days of the week, Monday=0 to Sunday=6
    days_mapping = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
    }
    
    # Add the hour of the day and the day of the week columns
    result_df['hour'] = pickup_datetime.dt.hour
    result_df['day of week'] = pickup_datetime.dt.dayofweek.map(days_mapping)

    return result_df

In [327]:
def predictions(df):

    data = df.groupby(['pulocationid', 'dolocationid', 'day of week', 'hour'])

    # Calculate mean for each group
    mean_duration = data['trip duration'].mean()
    # Calculate margin of error using 95% confidence interval
          # Calculate the standard error of the mean
    standard_err = data['trip duration'].sem()

    z_score = t.ppf(0.975, df=len(data) - 1)  # 0.975 corresponds to 95% confidence interval
    margin_of_error = z_score * standard_err

    # Create a new DataFrame with the calculated values
    predictions = pd.DataFrame({
        'mean trip duration': mean_duration,
        'margin of error': margin_of_error
    })
    predictions['margin of error'].fillna(0.0, inplace=True)
    return predictions

In [328]:
def generate_predictions():
    
    df = pd.read_csv("https://data.cityofnewyork.us/resource/4p5c-cbgn.csv")
    
    df = compute_trip_duration(df)
    
    df = add_hour_and_day(df)
    
    df = predictions(df)
    return df

In [329]:
generate_predictions()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean trip duration,margin of error
pulocationid,dolocationid,day of week,hour,Unnamed: 4_level_1,Unnamed: 5_level_1
1,145,Wednesday,20,2670.000000,0.000000
4,48,Wednesday,20,2658.000000,0.000000
4,68,Wednesday,20,1664.000000,0.000000
4,74,Wednesday,20,1202.000000,0.000000
4,158,Wednesday,20,1220.000000,0.000000
...,...,...,...,...,...
263,233,Wednesday,20,720.000000,0.000000
263,236,Wednesday,20,229.000000,0.000000
263,237,Wednesday,20,602.000000,0.000000
263,238,Wednesday,20,763.333333,219.205891
