# Data Preparation: UberDrivers 2014

In [23]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import hvplot.pandas
import warnings
warnings.filterwarnings("ignore")

In [26]:
# Files to Load

uber_trip_2014 = [
    "uber_resources/uber_trip_2014/uber-raw-data-apr14.csv",
    "uber_resources/uber_trip_2014/uber-raw-data-aug14.csv",
    "uber_resources/uber_trip_2014/uber-raw-data-jul14.csv",
    "uber_resources/uber_trip_2014/uber-raw-data-jun14.csv",
    "uber_resources/uber_trip_2014/uber-raw-data-may14.csv",
    "uber_resources/uber_trip_2014/uber-raw-data-sep14.csv"
]

# Initialize an empty list to store DataFrames
data_frames = []

# Read each CSV file and append its DataFrame to the list
for file_path in uber_trip_2014:
    df = pd.read_csv(file_path)
    data_frames.append(df)

# Concatenate all DataFrames in the list into one DataFrame
raw_data = pd.concat(data_frames, ignore_index=True)
raw_data

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.7690,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512
...,...,...,...,...
4534322,9/30/2014 22:57:00,40.7668,-73.9845,B02764
4534323,9/30/2014 22:57:00,40.6911,-74.1773,B02764
4534324,9/30/2014 22:58:00,40.8519,-73.9319,B02764
4534325,9/30/2014 22:58:00,40.7081,-74.0066,B02764


# Data cleaning

In [27]:
# Data Cleaning
# Splitting Date/Time into their own columns
raw_data[['Date', 'Time']] = raw_data['Date/Time'].str.split(expand=True)

raw_data.drop(columns=['Date/Time'], inplace=True)

raw_data["Time"]
del raw_data["Lat"]
del raw_data["Lon"]

raw_data.head()

Unnamed: 0,Base,Date,Time
0,B02512,4/1/2014,0:11:00
1,B02512,4/1/2014,0:17:00
2,B02512,4/1/2014,0:21:00
3,B02512,4/1/2014,0:28:00
4,B02512,4/1/2014,0:33:00


In [28]:
# Handling data value types:
#print(raw_data.info())

#print("After data converstion")

data = {'Date': raw_data["Date"]}
datetime = pd.DataFrame(data)

# Convert date columns to datetime dtype
raw_data['Date'] = pd.to_datetime(datetime['Date'])

#print(raw_data.info())

In [29]:
# Finding Duplicates 
print(raw_data.count())

print("After removing duplicates:")
non_dupe_data = raw_data.drop_duplicates()
print(non_dupe_data.count())

Base    4534327
Date    4534327
Time    4534327
dtype: int64
After removing duplicates:
Base    937192
Date    937192
Time    937192
dtype: int64


In [30]:
# Handling Missing Values
non_dupe_data = non_dupe_data.dropna()

non_na_data = non_dupe_data.dropna(axis=1)

print(non_na_data.count())

Base    937192
Date    937192
Time    937192
dtype: int64


In [31]:
# Renaming to finalized dataframe
uber_data = non_na_data
uber_data.head()

Unnamed: 0,Base,Date,Time
0,B02512,2014-04-01,0:11:00
1,B02512,2014-04-01,0:17:00
2,B02512,2014-04-01,0:21:00
3,B02512,2014-04-01,0:28:00
4,B02512,2014-04-01,0:33:00


# Trips Data

In [9]:
# All data per base
per_base_counts = uber_data["Base"].value_counts()
per_base_counts

B02617    1417983
B02598    1379578
B02682    1198901
B02764     254931
B02512     200353
Name: Base, dtype: int64

In [10]:
# Finding the Number of trips per day by a base
rides_per_day = uber_data.groupby("Base")["Date"].value_counts()

rides_per_day_df = rides_per_day.reset_index(name="Number of trips")

rides_per_day_df.head()

Unnamed: 0,Base,Date,Number of trips
0,B02512,2014-04-30,2107
1,B02512,2014-04-04,1768
2,B02512,2014-09-05,1752
3,B02512,2014-05-16,1686
4,B02512,2014-07-15,1618


# Uber Summary

In [11]:
# All base names
bases = uber_data["Base"].unique()
bases

array(['B02512', 'B02598', 'B02617', 'B02682', 'B02764'], dtype=object)

In [12]:
# Total bases
base_count = uber_data["Base"].nunique()
base_count

5

In [13]:
# total trips 
num_trips_per_base = rides_per_day_df.groupby("Base")["Number of trips"].sum()
num_trips_per_base

Base
B02512     200353
B02598    1379578
B02617    1417983
B02682    1198901
B02764     254931
Name: Number of trips, dtype: int64

In [14]:
# maximum trips
max_trips_per_base = rides_per_day_df.groupby("Base")["Number of trips"].max()
max_trips_per_base

Base
B02512     2107
B02598    13383
B02617    16037
B02682    13151
B02764     9229
Name: Number of trips, dtype: int64

In [15]:
# minimum trips
min_trips_per_base = rides_per_day_df.groupby("Base")["Number of trips"].min()
min_trips_per_base

Base
B02512     391
B02598    3509
B02617    1934
B02682    2867
B02764     114
Name: Number of trips, dtype: int64

In [16]:
# Date Range
start_date = rides_per_day_df["Date"].min()
end_date = rides_per_day_df["Date"].max()

In [17]:
# Total Days
total_days = len(rides_per_day_df["Date"].unique())

In [18]:
# Average Trips per Day
avg_trips_per_day = num_trips_per_base / total_days
avg_trips_per_day

Base
B02512    1094.825137
B02598    7538.677596
B02617    7748.540984
B02682    6551.371585
B02764    1393.065574
Name: Number of trips, dtype: float64

In [19]:
# Most Active Day
most_active_dates_idx = rides_per_day_df.groupby('Base')['Number of trips'].idxmax()
most_active_dates = rides_per_day_df.loc[most_active_dates_idx, 'Date']

# Set the index to "Base" and rename the index
most_active_dates.index = most_active_dates_idx.index
most_active_dates.index.name = 'Base'

# Least Active Day
least_active_dates_idx = rides_per_day_df.groupby('Base')['Number of trips'].idxmin()
least_active_dates = rides_per_day_df.loc[least_active_dates_idx, 'Date']

# Set the index to "Base"
least_active_dates.index = least_active_dates_idx.index
least_active_dates.index.name = 'Base'


print("Most Active Days:"),
print(most_active_dates)

print("\nLeast Active Days:")
print(least_active_dates)

Most Active Days:
Base
B02512   2014-04-30
B02598   2014-04-30
B02617   2014-09-05
B02682   2014-04-30
B02764   2014-09-27
Name: Date, dtype: datetime64[ns]

Least Active Days:
Base
B02512   2014-07-05
B02598   2014-07-05
B02617   2014-04-20
B02682   2014-07-05
B02764   2014-07-05
Name: Date, dtype: datetime64[ns]


In [20]:
uber_summary = pd.DataFrame({
    "Base": bases,
    "Total Trips": num_trips_per_base,
    "Maximum Trips": max_trips_per_base,
    "Minimum Trips": min_trips_per_base,
    "Start Date": start_date,
    "End Date": end_date,
    "Total Days": total_days,
    "Average Trips/Day": avg_trips_per_day,
    "Most Active Day": most_active_dates,
    "Least Active Day": least_active_dates   
})

#uber_summary = uber_summary.set_index("Base")

uber_summary

Unnamed: 0_level_0,Base,Total Trips,Maximum Trips,Minimum Trips,Start Date,End Date,Total Days,Average Trips/Day,Most Active Day,Least Active Day
Base,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B02512,B02512,200353,2107,391,2014-04-01,2014-09-30,183,1094.825137,2014-04-30,2014-07-05
B02598,B02598,1379578,13383,3509,2014-04-01,2014-09-30,183,7538.677596,2014-04-30,2014-07-05
B02617,B02617,1417983,16037,1934,2014-04-01,2014-09-30,183,7748.540984,2014-09-05,2014-04-20
B02682,B02682,1198901,13151,2867,2014-04-01,2014-09-30,183,6551.371585,2014-04-30,2014-07-05
B02764,B02764,254931,9229,114,2014-04-01,2014-09-30,183,1393.065574,2014-09-27,2014-07-05


# Visualtizations

# Output result

In [32]:
#dataframe into csv 
uber_data.to_csv('Data/uber_data_aprsep_2014.csv', index=False)

rides_per_day_df.to_csv('Data/rides_per_day_aprsep_2014.csv', index=False)

uber_summary.to_csv('Data/uber_summary_aprsep_2014.csv', index=False)