In [1]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
all_trips = pd.read_csv("uber_tripdata_2021_labeled_days_with_dates.csv", compression="gzip")

In [3]:
print("Number of rows in dataframe:", len(all_trips)) # should be 121,645,919

Number of rows in dataframe: 121645919


In [4]:
all_trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_day,pickup_hour,PU_type_of_day
0,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights,2021-01-01,1,4,0,weekday
1,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose,2021-01-01,1,4,0,weekday
2,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East,2021-01-01,1,4,0,weekday
3,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West,2021-01-01,1,4,0,weekday
4,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont,2021-01-01,1,4,0,weekday


In [5]:
# drop pickup_day (and dropoff_day) columns since we only want to compare weekday vs weekend
all_trips.drop(columns=["pickup_day"], inplace=True)

In [6]:
all_trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_hour,PU_type_of_day
0,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights,2021-01-01,1,0,weekday
1,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose,2021-01-01,1,0,weekday
2,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East,2021-01-01,1,0,weekday
3,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West,2021-01-01,1,0,weekday
4,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont,2021-01-01,1,0,weekday


In [7]:
weekday_trips = all_trips.loc[all_trips["PU_type_of_day"]=="weekday"]
weekend_trips = all_trips.loc[all_trips["PU_type_of_day"]=="weekend"]

In [8]:
weekday_trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_hour,PU_type_of_day
0,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights,2021-01-01,1,0,weekday
1,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose,2021-01-01,1,0,weekday
2,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East,2021-01-01,1,0,weekday
3,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West,2021-01-01,1,0,weekday
4,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont,2021-01-01,1,0,weekday


In [9]:
weekend_trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_hour,PU_type_of_day
283708,244,244,Manhattan,Washington Heights South,Manhattan,Washington Heights South,2021-01-02,1,0,weekend
283709,87,152,Manhattan,Financial District North,Manhattan,Manhattanville,2021-01-02,1,0,weekend
283710,203,197,Queens,Rosedale,Queens,Richmond Hill,2021-01-02,1,0,weekend
283711,220,247,Bronx,Spuyten Duyvil/Kingsbridge,Bronx,West Concourse,2021-01-02,1,0,weekend
283712,247,247,Bronx,West Concourse,Bronx,West Concourse,2021-01-02,1,0,weekend


In [10]:
overall_num_of_weekday_dates = weekday_trips["pickup_date"].nunique()
overall_num_of_weekend_dates = weekend_trips["pickup_date"].nunique()

In [11]:
print("Number of unique dates that are weekdays:", overall_num_of_weekday_dates)
print("Number of unique dates that are weekends:", overall_num_of_weekend_dates)

Number of unique dates that are weekdays: 261
Number of unique dates that are weekends: 104


In [12]:
def get_trip_counts_by_hour(df, col_to_group_by, value_of_interest, num_of_weekday_dates, num_of_weekend_dates):
    # df: all_trips
    # col_to_group_by: a string (the column name to group by)
    # value_of_interest: a string (the value in the column col_to_group_by of interest)
    # e.g. if want to find trip counts by hour for borough Manhattan, call:
    #      trip_counts_by_hour(all_trips, "pickup_borough", "Manhattan")

    # if no given column to group by or value of interest (e.g. want to find trip counts in new york as a whole)
    if col_to_group_by == "" and value_of_interest == "":
        trip_counts = df.groupby(["PU_type_of_day", "pickup_hour"]).size().reset_index(name="trip_count")
        weekday_trip_counts = trip_counts.loc[trip_counts["PU_type_of_day"]=="weekday"].drop(columns=["PU_type_of_day"])
        weekday_trip_counts["normalized_trip_counts"] = weekday_trip_counts["trip_count"] / num_of_weekday_dates
        weekend_trip_counts = trip_counts.loc[trip_counts["PU_type_of_day"]=="weekend"].drop(columns=["PU_type_of_day"])
        weekend_trip_counts["normalized_trip_counts"] = weekend_trip_counts["trip_count"] / num_of_weekend_dates

    # if no value of interest but have column to group by (e.g. want to group by months but not select any specific month)
    elif value_of_interest == "" and col_to_group_by != "":
        trip_counts = df.groupby([col_to_group_by, "PU_type_of_day", "pickup_hour"]).size().reset_index(name="trip_count")
        weekday_trip_counts = trip_counts.loc[trip_counts["PU_type_of_day"]=="weekday"].drop(columns=["PU_type_of_day"])
        weekday_trip_counts["normalized_trip_counts"] = weekday_trip_counts["trip_count"] / num_of_weekday_dates
        weekend_trip_counts = trip_counts.loc[trip_counts["PU_type_of_day"]=="weekend"].drop(columns=["PU_type_of_day"])
        weekend_trip_counts["normalized_trip_counts"] = weekend_trip_counts["trip_count"] / num_of_weekend_dates

    # if have column and have (e.g. want to group by months and select specific month)
    else:
        trip_counts = df.groupby([col_to_group_by, "PU_type_of_day", "pickup_hour"]).size().reset_index(name="trip_count")
        weekday_trip_counts = trip_counts.loc[trip_counts["PU_type_of_day"]=="weekday"].drop(columns=["PU_type_of_day"])
        weekday_trip_counts["normalized_trip_counts"] = weekday_trip_counts["trip_count"] / num_of_weekday_dates
        weekend_trip_counts = trip_counts.loc[trip_counts["PU_type_of_day"]=="weekend"].drop(columns=["PU_type_of_day"])
        weekend_trip_counts["normalized_trip_counts"] = weekend_trip_counts["trip_count"] / num_of_weekend_dates
        weekday_trip_counts = weekday_trip_counts.loc[weekday_trip_counts[col_to_group_by]==value_of_interest]
        weekend_trip_counts = weekend_trip_counts.loc[weekend_trip_counts[col_to_group_by]==value_of_interest]
    return weekday_trip_counts, weekend_trip_counts

In [13]:
def plot_trip_count_against_hour(df1, df2, trip_counts_location):
    # df1: weekday_trip_counts
    # df2: weekend_trip_counts
    # trip_counts_location: a string (to input in title of plot)
    
    # plot weekday and weekend trip counts against hour
    plt.plot(df1["pickup_hour"], df1["normalized_trip_counts"], label="Weekday", color="red")
    plt.plot(df2["pickup_hour"], df2["normalized_trip_counts"], label="Weekend", color="blue", linestyle="dashed")
    x_ticks_list = list(set(list(df1["pickup_hour"])+list(df2["pickup_hour"]))) # label x-axis with values in hour
    plt.xticks(x_ticks_list)
    plt.grid(axis="x")
    plt.xlabel("Hour")
    plt.ylabel("Normalized trip count")
    plt.title("Trip Rate in %s in 2021 by Hour" % trip_counts_location)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    # save figure in a png file
    file_name = "trip_counts_" + trip_counts_location.replace(" ", "") + ".png"
    plt.savefig(file_name, bbox_inches="tight")
    plt.close()

In [14]:
def plot_trip_count_on_same_plot(df, col_to_group_by, colors, list_of_labels, title, filename):
    # df: a dataframe with all values that you want to create different lines for
    # col_to_group_by: a string, the column name of column with values for which you want to create different lines
    # colors: a list of colors of lines to plot (same length as number of unique values of col_to_group_by)
    # list_of_labels: a list of labels for plot legend
    # title: a string for plot title
    # filename: a string for filename when saving plot as .png file

    fig, axes = plt.subplots()
    
    linestyles = ["solid", "dashed"]
    
    for a_index, a in enumerate(np.unique(df[col_to_group_by])):
        print("a:", a)
        # print("a index:", a_index)
        for day_type_index, day_type in enumerate(np.unique(df["PU_type_of_day"])):
            temp_1 = df[df["PU_type_of_day"] == day_type]
            temp = df[df[col_to_group_by] == a]
            temp = temp[temp["PU_type_of_day"] == day_type]
            num_of_daytype_dates = temp["pickup_date"].nunique()
            # print("Num of daytype dates:", num_of_daytype_dates)
            temp = temp.groupby(["pickup_hour"])["trip_count"].sum().reset_index(name="trip_count")
            temp["normalized_trip_counts"] = temp["trip_count"] / num_of_daytype_dates
            axes.plot(temp["pickup_hour"], temp["normalized_trip_counts"], c = colors[a_index], ls = linestyles[day_type_index])
    
    #dummy lines with NO entries, just to create the black style legend
    dummy_lines = []
    for day_type_index, day_type in enumerate(np.unique(df["PU_type_of_day"])):
        dummy_lines.append(axes.plot([],[], c="black", ls = linestyles[day_type_index])[0])

    list_len = len(list_of_labels)
    list_end = 2*list_len
    list_of_i = list(range(0, list_end, 2))
    lines = axes.get_lines()
    legend1 = plt.legend([lines[i] for i in list_of_i], list_of_labels, loc='center left', bbox_to_anchor=(1, 0.4))
    legend2 = plt.legend([dummy_lines[i] for i in [0,1]], ["Weekday", "Weekend"], loc='center left', bbox_to_anchor=(1, 0.9))
    axes.add_artist(legend1)
    
    x_ticks_list = list(set(list(df["pickup_hour"]))) # label x-axis with values in hour
    plt.xticks(x_ticks_list)
    plt.grid(axis="x")
    plt.xlabel("Hour")
    plt.ylabel("Normalized trip count")
    plt.title(title)
    
    # save figure in a png file
    filename = filename + ".png"
    plt.savefig(filename, bbox_inches="tight")
    plt.close()

Trip Counts by Month

In [15]:
all_trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_hour,PU_type_of_day
0,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights,2021-01-01,1,0,weekday
1,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose,2021-01-01,1,0,weekday
2,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East,2021-01-01,1,0,weekday
3,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West,2021-01-01,1,0,weekday
4,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont,2021-01-01,1,0,weekday


In [16]:
months_trip_counts = all_trips.groupby(["pickup_month", "PU_type_of_day", "pickup_hour", "pickup_date"]).size().reset_index(name="trip_count")

In [17]:
months_trip_counts.head()

Unnamed: 0,pickup_month,PU_type_of_day,pickup_hour,pickup_date,trip_count
0,1,weekday,0,2021-01-01,21412
1,1,weekday,0,2021-01-04,5155
2,1,weekday,0,2021-01-05,4849
3,1,weekday,0,2021-01-06,5443
4,1,weekday,0,2021-01-07,5696


In [18]:
month_colors = ["powderblue", "salmon", "violet", "yellowgreen", "pink", "navajowhite", "chocolate", "maroon", "navy", "darkorchid", "goldenrod", "teal"]
month_labels = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
month_plot_title = "Monthly Trip Rate in New York City in 2021 by Hour"
month_filename = "monthly_trip_counts"
plot_trip_count_on_same_plot(months_trip_counts, "pickup_month", month_colors, month_labels, month_plot_title, month_filename)

a: 1
a: 2
a: 3
a: 4
a: 5
a: 6
a: 7
a: 8
a: 9
a: 10
a: 11
a: 12


Trip Counts by Seasons
- Spring: March, April, May
- Summer: June, July, August
- Fall: September, October, November
- Winter: December, January, February

In [19]:
months_trip_counts.head()

Unnamed: 0,pickup_month,PU_type_of_day,pickup_hour,pickup_date,trip_count
0,1,weekday,0,2021-01-01,21412
1,1,weekday,0,2021-01-04,5155
2,1,weekday,0,2021-01-05,4849
3,1,weekday,0,2021-01-06,5443
4,1,weekday,0,2021-01-07,5696


In [20]:
# label seasons
# referred to https://stackoverflow.com/a/57410089
months_trip_counts.loc[months_trip_counts["pickup_month"].isin([3,4,5]), "season"] = "Spring"
months_trip_counts.loc[months_trip_counts["pickup_month"].isin([6,7,8]), "season"] = "Summer"
months_trip_counts.loc[months_trip_counts["pickup_month"].isin([9,10,11]), "season"] = "Fall"
months_trip_counts.loc[months_trip_counts["pickup_month"].isin([12,1,2]), "season"] = "Winter"

In [21]:
months_trip_counts.head()

Unnamed: 0,pickup_month,PU_type_of_day,pickup_hour,pickup_date,trip_count,season
0,1,weekday,0,2021-01-01,21412,Winter
1,1,weekday,0,2021-01-04,5155,Winter
2,1,weekday,0,2021-01-05,4849,Winter
3,1,weekday,0,2021-01-06,5443,Winter
4,1,weekday,0,2021-01-07,5696,Winter


In [22]:
seasons_trip_counts = months_trip_counts.groupby(["season", "PU_type_of_day", "pickup_hour", "pickup_date"])["trip_count"].sum().reset_index(name="trip_count")

In [23]:
seasons_trip_counts.head()

Unnamed: 0,season,PU_type_of_day,pickup_hour,pickup_date,trip_count
0,Fall,weekday,0,2021-09-01,8587
1,Fall,weekday,0,2021-09-02,9222
2,Fall,weekday,0,2021-09-03,12863
3,Fall,weekday,0,2021-09-06,14752
4,Fall,weekday,0,2021-09-07,7774


In [24]:
seasons_colors = ["darkred", "green", "goldenrod", "mediumblue"]
seasons_labels = ["Fall", "Spring", "Summer", "Winter"]
seasons_plot_title = "Seasonal Trip Rate in New York City in 2021 by Hour"
seasons_filename = "seasonal_trip_counts"
plot_trip_count_on_same_plot(seasons_trip_counts, "season", seasons_colors, seasons_labels, seasons_plot_title, seasons_filename)

a: Fall
a: Spring
a: Summer
a: Winter


Trip Counts by Borough

In [25]:
boroughs_trip_counts = all_trips.groupby(["pickup_borough", "PU_type_of_day", "pickup_hour", "pickup_date"]).size().reset_index(name="trip_count")

In [26]:
boroughs_trip_counts.head()

Unnamed: 0,pickup_borough,PU_type_of_day,pickup_hour,pickup_date,trip_count
0,Bronx,weekday,0,2021-01-01,5001
1,Bronx,weekday,0,2021-01-04,1209
2,Bronx,weekday,0,2021-01-05,1122
3,Bronx,weekday,0,2021-01-06,1260
4,Bronx,weekday,0,2021-01-07,1325


In [27]:
borough_colors = ["maroon", "goldenrod", "yellowgreen", "mediumblue", "magenta", "darkorange"]
borough_labels = ["Bronx", "Brooklyn", "EWR", "Manhattan", "Queens", "Staten Island"]
borough_plot_title = "Trip Rate in Boroughs in New York City in 2021 by Hour"
borough_filename = "borough_trip_counts"
plot_trip_count_on_same_plot(boroughs_trip_counts, "pickup_borough", borough_colors, borough_labels, borough_plot_title, borough_filename)

a: Bronx
a: Brooklyn
a: EWR
a: Manhattan
a: Queens
a: Staten Island


Trip Counts in New York

In [28]:
# Find and plot trip counts in New York overall
weekday_trip_counts_ny, weekend_trip_counts_ny = get_trip_counts_by_hour(all_trips, "", "", overall_num_of_weekday_dates, overall_num_of_weekend_dates)

In [29]:
weekday_trip_counts_ny.head()

Unnamed: 0,pickup_hour,trip_count,normalized_trip_counts
0,0,2323172,8901.042146
1,1,1445472,5538.206897
2,2,982311,3763.643678
3,3,738776,2830.559387
4,4,811834,3110.475096


In [30]:
weekend_trip_counts_ny.head()

Unnamed: 0,pickup_hour,trip_count,normalized_trip_counts
24,0,1962923,18874.259615
25,1,1604033,15423.394231
26,2,1205687,11593.144231
27,3,891020,8567.5
28,4,663799,6382.682692


In [31]:
plot_trip_count_against_hour(weekday_trip_counts_ny, weekend_trip_counts_ny, "New York City")

Focusing on weekday peak hours 8AM and 6PM

In [32]:
# Based on NY trip count plot, peak hours on weekdays are hours 8 (8AM) and 18 (6PM)
# So, focus on weekday trips with pickups at either 8AM or 6PM
all_trips_peak_hours = all_trips.loc[(all_trips["PU_type_of_day"]=="weekday") & ((all_trips["pickup_hour"]==8) | (all_trips["pickup_hour"]==18))]

In [33]:
all_trips_peak_hours.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_hour,PU_type_of_day
119460,188,89,Brooklyn,Prospect-Lefferts Gardens,Brooklyn,Flatbush/Ditmas Park,2021-01-01,1,8,weekday
119461,89,26,Brooklyn,Flatbush/Ditmas Park,Brooklyn,Borough Park,2021-01-01,1,8,weekday
119462,219,197,Queens,Springfield Gardens South,Queens,Richmond Hill,2021-01-01,1,8,weekday
119463,197,180,Queens,Richmond Hill,Queens,Ozone Park,2021-01-01,1,8,weekday
119464,180,82,Queens,Ozone Park,Queens,Elmhurst,2021-01-01,1,8,weekday


In [34]:
# drop irrelevant columns
all_trips_peak_hours.drop(columns=["PU_type_of_day"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_trips_peak_hours.drop(columns=["PU_type_of_day"], inplace=True)


In [35]:
all_trips_peak_hours.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_hour
119460,188,89,Brooklyn,Prospect-Lefferts Gardens,Brooklyn,Flatbush/Ditmas Park,2021-01-01,1,8
119461,89,26,Brooklyn,Flatbush/Ditmas Park,Brooklyn,Borough Park,2021-01-01,1,8
119462,219,197,Queens,Springfield Gardens South,Queens,Richmond Hill,2021-01-01,1,8
119463,197,180,Queens,Richmond Hill,Queens,Ozone Park,2021-01-01,1,8
119464,180,82,Queens,Ozone Park,Queens,Elmhurst,2021-01-01,1,8


In [36]:
trips_morning_peak = all_trips_peak_hours.loc[all_trips_peak_hours["pickup_hour"]==8]
trips_evening_peak = all_trips_peak_hours.loc[all_trips_peak_hours["pickup_hour"]==18]

In [37]:
trips_morning_peak.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_hour
119460,188,89,Brooklyn,Prospect-Lefferts Gardens,Brooklyn,Flatbush/Ditmas Park,2021-01-01,1,8
119461,89,26,Brooklyn,Flatbush/Ditmas Park,Brooklyn,Borough Park,2021-01-01,1,8
119462,219,197,Queens,Springfield Gardens South,Queens,Richmond Hill,2021-01-01,1,8
119463,197,180,Queens,Richmond Hill,Queens,Ozone Park,2021-01-01,1,8
119464,180,82,Queens,Ozone Park,Queens,Elmhurst,2021-01-01,1,8


In [38]:
trips_evening_peak.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_hour
213149,125,148,Manhattan,Hudson Sq,Manhattan,Lower East Side,2021-01-01,1,18
213150,79,90,Manhattan,East Village,Manhattan,Flatiron,2021-01-01,1,18
213151,90,170,Manhattan,Flatiron,Manhattan,Murray Hill,2021-01-01,1,18
213152,151,171,Manhattan,Manhattan Valley,Queens,Murray Hill-Queens,2021-01-01,1,18
213153,252,192,Queens,Whitestone,Queens,Queensboro Hill,2021-01-01,1,18


Find frequency and reverse frequency of trips from A to B during morning and evening peak hours

In [39]:
def get_frequency(df):
    # df: dataframe for which we want to find frequency and reverse frequency for

    # From Maya's notebook, edited a bit
    # Calculates frequency of rides from Point A to Point B
    frequency_of_rides = df.groupby(["PULocationID", "DOLocationID"]).size().reset_index(name="frequency")
    
    # Calculates frequency of rides from Point B to Point A (Denoted by Reverse Frequency)
    reverse_freq_of_rides = frequency_of_rides.rename(columns={"PULocationID": "DOLocationID", "DOLocationID": "PULocationID", "frequency": "reverse_freq"})

    merged_counts = pd.merge(frequency_of_rides, reverse_freq_of_rides, how='left', on=["PULocationID", "DOLocationID"])
    merged_counts["reverse_freq"] = merged_counts["reverse_freq"].fillna(0).astype(int) # replace NaNs with 0 for pairs with no reverse rides

    def sort(row):
        sorted_ids = sorted([str(row["PULocationID"]), str(row["DOLocationID"])])
        return '_'.join(sorted_ids)

    merged_counts["SortedCombination"] = merged_counts.apply(sort, axis=1)

    #We drop duplicate rows based on the sorted combination column (avoid double counting in df)
    merged_counts = merged_counts.drop_duplicates("SortedCombination")

    #Remove the SC column if you don't need it anymore
    merged_counts = merged_counts.drop(columns=["SortedCombination"])

    # Find difference in frequency and reverse frequency
    merged_counts["frequency-minus-reverse_freq"] = merged_counts["frequency"] - merged_counts["reverse_freq"]

    return merged_counts

In [40]:
trips_morning_peak_freq = get_frequency(trips_morning_peak)
trips_evening_peak_freq = get_frequency(trips_evening_peak)

In [41]:
trips_morning_peak_freq.head()

Unnamed: 0,PULocationID,DOLocationID,frequency,reverse_freq,frequency-minus-reverse_freq
0,2,76,1,0,1
1,2,77,1,0,1
2,2,124,2,0,2
3,2,180,1,0,1
4,2,216,2,1,1


In [42]:
trips_evening_peak_freq.head()

Unnamed: 0,PULocationID,DOLocationID,frequency,reverse_freq,frequency-minus-reverse_freq
0,2,117,1,1,0
1,2,124,3,1,2
2,2,130,1,0,1
3,2,145,1,0,1
4,2,180,2,2,0


In [43]:
# save dataframes as csv files
# compression_opts = dict(method="gzip")
# trips_morning_peak_freq.to_csv("./uber_tripdata_2021_weekday_morning_peak_frequency.csv", index=False, compression=compression_opts)
# trips_evening_peak_freq.to_csv("./uber_tripdata_2021_weekday_evening_peak_frequency.csv", index=False, compression=compression_opts)

Find in-degrees, out-degrees, ratio of in- to out-degrees, ratio of out- to in-degrees of each zone during morning and evening peak hours

In [44]:
def get_degrees_df(df):
    # df: a dataframe for which we want to calculate zone out-degrees and in-degrees for
    in_degrees = df.groupby(["DOLocationID"]).size().reset_index(name="in_degree")
    out_degrees = df.groupby(["PULocationID"]).size().reset_index(name="out_degree")

    # merge to get in-degree and out-degree in same dataframe
    degrees = pd.merge(in_degrees, out_degrees, how="left", left_on="DOLocationID", right_on="PULocationID")

    # drop "pickup_zone" column and rename "dropoff_zone" column to "zone"
    degrees.drop(columns=["PULocationID"], inplace=True)
    degrees.rename(columns={"DOLocationID": "LocationID"}, inplace=True)

    # replace NaN with 0 (there was a NaN value for Newark Airport's out-degree)
    degrees = degrees.fillna(0)

    # make sure degrees are integers
    degrees["in_degree"] = degrees["in_degree"].astype(int)
    degrees["out_degree"] = degrees["out_degree"].astype(int)

    degrees["ratio_of_in_to_out_deg"] = degrees["in_degree"] / degrees["out_degree"]
    degrees["ratio_of_out_to_in_deg"] = degrees["out_degree"] / degrees["in_degree"]
    
    return degrees

In [45]:
trips_morning_peak_degrees = get_degrees_df(trips_morning_peak)
trips_evening_peak_degrees = get_degrees_df(trips_evening_peak)

In [46]:
trips_morning_peak_degrees.head()

Unnamed: 0,LocationID,in_degree,out_degree,ratio_of_in_to_out_deg,ratio_of_out_to_in_deg
0,1,25855,0,inf,0.0
1,2,17,8,2.125,0.470588
2,3,11670,12890,0.905353,1.104542
3,4,5216,15240,0.342257,2.921779
4,5,778,2020,0.385149,2.596401


In [47]:
# save dataframes as csv files
# compression_opts = dict(method="gzip")
# trips_morning_peak_degrees.to_csv("./uber_tripdata_2021_weekday_morning_peak_degrees.csv", index=False, compression=compression_opts)
# trips_evening_peak_degrees.to_csv("./uber_tripdata_2021_weekday_evening_peak_degrees.csv", index=False, compression=compression_opts)