# Data Preparation: UberDrivers Jan - June 2015


In [36]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import hvplot.pandas
import warnings
warnings.filterwarnings("ignore")

In [37]:
# Files to Load

uber_trip_2015 = "uber_resources/uber_trip_2015/uber-raw-data-janjune-15.csv"

raw_data = pd.read_csv(uber_trip_2015)

raw_data.head()

Unnamed: 0,Dispatching_base_num,Pickup_date,Affiliated_base_num,locationID
0,B02617,2015-05-17 09:47:00,B02617,141
1,B02617,2015-05-17 09:47:00,B02617,65
2,B02617,2015-05-17 09:47:00,B02617,100
3,B02617,2015-05-17 09:47:00,B02774,80
4,B02617,2015-05-17 09:47:00,B02617,90


# Data Cleaning

In [38]:
# Data Cleaning
# Splitting Pickup_date into their own columns Date, Time
raw_data[['Date', 'Time']] = raw_data['Pickup_date'].str.split(expand=True)

raw_data.drop(columns=['Pickup_date'], inplace=True)

raw_data["Time"]
del raw_data["Affiliated_base_num"]
del raw_data["locationID"]

raw_data.head()

Unnamed: 0,Dispatching_base_num,Date,Time
0,B02617,2015-05-17,09:47:00
1,B02617,2015-05-17,09:47:00
2,B02617,2015-05-17,09:47:00
3,B02617,2015-05-17,09:47:00
4,B02617,2015-05-17,09:47:00


In [39]:
# Handling data value types:
#print(raw_data.info())

#print("After data converstion")

data = {'Date': raw_data["Date"]}
datetime = pd.DataFrame(data)

# Convert date columns to datetime dtype
raw_data['Date'] = pd.to_datetime(datetime['Date'])

#print(raw_data.info())

In [40]:
# Finding Duplicates 
print(raw_data.count())

print("After removing duplicates:")
non_dupe_data = raw_data.drop_duplicates()
print(non_dupe_data.count())

Dispatching_base_num    14270479
Date                    14270479
Time                    14270479
dtype: int64
After removing duplicates:
Dispatching_base_num    4525316
Date                    4525316
Time                    4525316
dtype: int64


In [41]:
# Handling Missing Values
non_dupe_data = non_dupe_data.dropna()

non_na_data = non_dupe_data.dropna(axis=1)

print(non_na_data.count())

Dispatching_base_num    4525316
Date                    4525316
Time                    4525316
dtype: int64


In [42]:
# Renaming to finalized dataframe
uber_data = non_na_data

uber_data.rename(columns={'Dispatching_base_num': 'Base'}, inplace=True)

uber_data.head()

Unnamed: 0,Base,Date,Time
0,B02617,2015-05-17,09:47:00
10,B02617,2015-05-17,09:48:00
12,B02617,2015-05-17,09:49:00
21,B02617,2015-05-17,09:50:00
29,B02617,2015-05-17,09:51:00


# Trips Data

In [23]:
# All data per base
per_base_counts = uber_data["Base"].value_counts()
per_base_counts

B02764    5013875
B02682    3057302
B02617    1943107
B02598    1441905
B02765    1097063
B02512     248444
B02835      26421
B02836       1990
Name: Base, dtype: int64

In [24]:
# Finding the Number of trips per day by a base
rides_per_day = uber_data.groupby("Base")["Date"].value_counts()

rides_per_day_df = rides_per_day.reset_index(name="Number of trips")

rides_per_day_df.head()

Unnamed: 0,Base,Date,Number of trips
0,B02512,2015-02-13,2500
1,B02512,2015-02-20,2478
2,B02512,2015-02-19,2171
3,B02512,2015-02-21,2144
4,B02512,2015-02-12,2138


# Uber Summary

In [25]:
# All base names
bases = uber_data["Base"].unique()
bases

array(['B02617', 'B02598', 'B02682', 'B02764', 'B02512', 'B02765',
       'B02835', 'B02836'], dtype=object)

In [26]:
# Total bases
base_count = uber_data["Base"].nunique()
base_count

8

In [27]:
# total trips 
num_trips_per_base = rides_per_day_df.groupby("Base")["Number of trips"].sum()
num_trips_per_base

Base
B02512     248444
B02598    1441905
B02617    1943107
B02682    3057302
B02764    5013875
B02765    1097063
B02835      26421
B02836       1990
Name: Number of trips, dtype: int64

In [28]:
# maximum trips
max_trips_per_base = rides_per_day_df.groupby("Base")["Number of trips"].max()
max_trips_per_base

Base
B02512     2500
B02598    12559
B02617    17088
B02682    37329
B02764    46134
B02765    11235
B02835     2470
B02836      103
Name: Number of trips, dtype: int64

In [29]:
# minimum trips
min_trips_per_base = rides_per_day_df.groupby("Base")["Number of trips"].min()
min_trips_per_base

Base
B02512      377
B02598     2860
B02617     4322
B02682     4454
B02764    12271
B02765      937
B02835      300
B02836       40
Name: Number of trips, dtype: int64

In [30]:
# Date Range
start_date = rides_per_day_df["Date"].min()
end_date = rides_per_day_df["Date"].max()

In [31]:
# Total Days
total_days = len(rides_per_day_df["Date"].unique())

In [32]:
# Average Trips per Day
avg_trips_per_day = num_trips_per_base / total_days
avg_trips_per_day

Base
B02512     1372.618785
B02598     7966.325967
B02617    10735.397790
B02682    16891.171271
B02764    27700.966851
B02765     6061.121547
B02835      145.972376
B02836       10.994475
Name: Number of trips, dtype: float64

In [33]:
# Most Active Day
most_active_dates_idx = rides_per_day_df.groupby('Base')['Number of trips'].idxmax()
most_active_dates = rides_per_day_df.loc[most_active_dates_idx, 'Date']

# Set the index to "Base" and rename the index
most_active_dates.index = most_active_dates_idx.index
most_active_dates.index.name = 'Base'

# Least Active Day
least_active_dates_idx = rides_per_day_df.groupby('Base')['Number of trips'].idxmin()
least_active_dates = rides_per_day_df.loc[least_active_dates_idx, 'Date']

# Set the index to "Base"
least_active_dates.index = least_active_dates_idx.index
least_active_dates.index.name = 'Base'


print("Most Active Days:"),
print(most_active_dates)

print("\nLeast Active Days:")
print(least_active_dates)

Most Active Days:
Base
B02512   2015-02-13
B02598   2015-02-13
B02617   2015-02-14
B02682   2015-06-27
B02764   2015-02-14
B02765   2015-06-27
B02835   2015-06-27
B02836   2015-06-27
Name: Date, dtype: datetime64[ns]

Least Active Days:
Base
B02512   2015-04-05
B02598   2015-01-27
B02617   2015-01-27
B02682   2015-01-27
B02764   2015-01-27
B02765   2015-01-27
B02835   2015-06-08
B02836   2015-06-03
Name: Date, dtype: datetime64[ns]


In [34]:
uber_summary = pd.DataFrame({
    "Base": bases,
    "Total Trips": num_trips_per_base,
    "Maximum Trips": max_trips_per_base,
    "Minimum Trips": min_trips_per_base,
    "Start Date": start_date,
    "End Date": end_date,
    "Total Days": total_days,
    "Average Trips/Day": avg_trips_per_day,
    "Most Active Day": most_active_dates,
    "Least Active Day": least_active_dates   
})

#uber_summary = uber_summary.set_index("Base")

uber_summary

Unnamed: 0_level_0,Base,Total Trips,Maximum Trips,Minimum Trips,Start Date,End Date,Total Days,Average Trips/Day,Most Active Day,Least Active Day
Base,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B02512,B02617,248444,2500,377,2015-01-01,2015-06-30,181,1372.618785,2015-02-13,2015-04-05
B02598,B02598,1441905,12559,2860,2015-01-01,2015-06-30,181,7966.325967,2015-02-13,2015-01-27
B02617,B02682,1943107,17088,4322,2015-01-01,2015-06-30,181,10735.39779,2015-02-14,2015-01-27
B02682,B02764,3057302,37329,4454,2015-01-01,2015-06-30,181,16891.171271,2015-06-27,2015-01-27
B02764,B02512,5013875,46134,12271,2015-01-01,2015-06-30,181,27700.966851,2015-02-14,2015-01-27
B02765,B02765,1097063,11235,937,2015-01-01,2015-06-30,181,6061.121547,2015-06-27,2015-01-27
B02835,B02835,26421,2470,300,2015-01-01,2015-06-30,181,145.972376,2015-06-27,2015-06-08
B02836,B02836,1990,103,40,2015-01-01,2015-06-30,181,10.994475,2015-06-27,2015-06-03


# Output Results

In [43]:
#dataframe into csv 
uber_data.to_csv('Data/uber_data_janjune_2015.csv', index=False)

#dataframe into csv 
rides_per_day_df.to_csv('Data/rides_per_day_janjune_2015.csv', index=False)

uber_summary.to_csv('Data/uber_summary_janjune_2015.csv', index=False)