In [394]:
import pandas as pd
import numpy as np
from scipy.stats import t


In [395]:
def compute_trip_duration(df):
    # Initialize the new df
    result_df = df.copy()
    
    # convert the pickup_datetime and dropoff_datetime to datetime type to use their values to calculate the trip duration
    # here i used pickup_datetime and dropoff_datetime to avoid editing the datatypes in the original df
    pickup_datetime = pd.to_datetime(result_df['pickup_datetime'])
    dropoff_datetime = pd.to_datetime(result_df['dropoff_datetime'])

    # calculate the trip duration
    result_df['trip duration'] = (dropoff_datetime -  pickup_datetime).dt.total_seconds()

    return result_df

In [396]:
def add_hour_and_day(df):
    # Initialize the new df
    result_df = df.copy()
    
    # convert the pickup_datetime to datetime type to extract the values
    pickup_datetime = pd.to_datetime(result_df['pickup_datetime'])
    
    # a map to map the numbers with day name
    # dayofweek return numbers (0-6) that represents days of the week, Monday=0 to Sunday=6
    days_mapping = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
    }
    
    # Add the hour of the day and the day of the week columns
    result_df['hour'] = pickup_datetime.dt.hour
    result_df['day of week'] = pickup_datetime.dt.dayofweek.map(days_mapping)

    return result_df

In [406]:
def predictions(df):

    data = df.groupby(['pulocationid', 'dolocationid', 'day of week', 'hour'])

    # Calculate mean for each group
    mean_duration = data['trip duration'].mean()
    
    # Calculate standard of error
    standard_err = data['trip duration'].sem()
    z_score = t.ppf(0.95, len(data) - 1)
    margin_of_error = standard_err * z_score

    # Create a new DataFrame with the calculated values
    predictions = pd.DataFrame({
        'mean trip duration': mean_duration,
        'margin of error': margin_of_error
    })
    predictions['margin of error'].fillna(0.0, inplace=True)
    return predictions

In [407]:
def generate_predictions():
    
    df = pd.read_csv("https://data.cityofnewyork.us/resource/4p5c-cbgn.csv")
    
    df = compute_trip_duration(df)
    
    df = add_hour_and_day(df)
    
    df = predictions(df)
    return df

In [414]:
generate_predictions().head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean trip duration,margin of error
pulocationid,dolocationid,day of week,hour,Unnamed: 4_level_1,Unnamed: 5_level_1
3,265,Thursday,19,990.0,0.0
4,48,Thursday,19,2188.0,0.0
4,87,Thursday,19,746.0,0.0
4,114,Thursday,19,822.0,0.0
4,193,Thursday,19,1353.0,0.0
4,209,Thursday,19,649.0,0.0
4,249,Thursday,19,996.666667,139.437902
7,95,Thursday,19,1815.0,0.0
7,145,Thursday,19,516.0,0.0
7,223,Thursday,19,746.0,144.897266
