# Data Preparation: Drivers

In [19]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import hvplot.pandas
import warnings
warnings.filterwarnings("ignore")

In [27]:
# Files to Load

non_uber_flv_trips = [
    "Resources/non_uber_flv_trips/other-American_B01362.csv",
    "Resources/non_uber_flv_trips/other-Carmel_B00256.csv",
    "Resources/non_uber_flv_trips/other-Dial7_B00887.csv",
    "Resources/non_uber_flv_trips/other-Diplo_B01196.csv",
    "Resources/non_uber_flv_trips/other-Federal_02216.csv",
    "Resources/non_uber_flv_trips/other-Firstclass_B01536.csv",
    "Resources/non_uber_flv_trips/other-Highclass_B01717.csv",
    "Resources/non_uber_flv_trips/other-Lyft_B02510.csv",
    "Resources/non_uber_flv_trips/other-Prestige_B01338.csv",
    "Resources/non_uber_flv_trips/other-Skyline_B00111.csv",
]

# Initialize an empty list to store DataFrames
data_frames = []
file_encoding = "utf-8"
file_encoding = "latin1"

# Read each CSV file and append its DataFrame to the list
for file_path in non_uber_flv_trips:
    df = pd.read_csv(file_path, encoding=file_encoding)
    data_frames.append(df)

# Concatenate all DataFrames in the list into one DataFrame
raw_data = pd.concat(data_frames, ignore_index=True)
raw_data

Unnamed: 0,DATE,TIME,PICK UP ADDRESS,Unnamed: 3,Unnamed: 4,Unnamed: 5,Date,Time,PU_Adress,Base_No,...,PU_Address,DO_Address,Routing Details,PU_Address.1,Status,time_of_trip,start_lat,start_lng,Street_Address,City_State
0,7/1/2014,12:00:00 AM,"874 E 139th St Mott Haven, BX",,,,,,,,...,,,,,,,,,,
1,7/1/2014,12:01:00 AM,"628 E 141st St Mott Haven, BX",,,,,,,,...,,,,,,,,,,
2,7/1/2014,12:01:00 AM,"601 E 156th St South Bronx, BX",,,,,,,,...,,,,,,,,,,
3,7/1/2014,12:01:00 AM,"708 E 138th St Mott Haven, BX",,,,,,,,...,,,,,,,,,,
4,7/1/2014,12:02:00 AM,"700 E 140th St Mott Haven, BX",,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1676776,,,,,,,9/29/2014,13:15,,,...,,,,,,,,,370 7th AVE,M
1676777,,,,,,,9/29/2014,15:40,,,...,,,,,,,,,1 STATE ST PZ,M
1676778,,,,,,,9/29/2014,20:07,,,...,,,,,,,,,730 5TH AVE,M
1676779,,,,,,,9/29/2014,22:48,,,...,,,,,,,,,30 ROCKEFELLER PZ,M


# Data Cleaning

In [28]:
raw_data.columns

Index(['DATE', 'TIME', 'PICK UP ADDRESS', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Date', 'Time', 'PU_Adress', 'Base_No', 'State', 'PuFrom',
       'Address', 'Street', 'PU_Address', 'DO_Address', 'Routing Details',
       'PU_Address.1', 'Status', 'time_of_trip', 'start_lat', 'start_lng',
       '    Street_Address ', '    City_State '],
      dtype='object')

In [29]:
raw_data["Date"] = raw_data["DATE"].combine_first(raw_data["Date"])

raw_data = raw_data["Date"]
raw_data

0           7/1/2014
1           7/1/2014
2           7/1/2014
3           7/1/2014
4           7/1/2014
             ...    
1676776    9/29/2014
1676777    9/29/2014
1676778    9/29/2014
1676779    9/29/2014
1676780    9/29/2014
Name: Date, Length: 1676781, dtype: object

In [34]:
# Handling Missing Values
non_na_data = raw_data.dropna()

#non_na_data = non_dupe_data.dropna(axis=1)

print(non_na_data.count())

1409080


In [36]:
other_data = pd.DataFrame(non_na_data)
other_data.head()

Unnamed: 0,Date
0,7/1/2014
1,7/1/2014
2,7/1/2014
3,7/1/2014
4,7/1/2014


# Trips Data

In [40]:
per_date_counts = other_data["Date"].value_counts()
per_date_counts

9/19/2014     15103
9/12/2014     14899
9/13/2014     14872
8/1/2014      14521
9/5/2014      14509
              ...  
07/22/2014        1
07/18/2014        1
07/13/2014        1
07/12/2014        1
09/28/2014        1
Name: Date, Length: 267, dtype: int64

In [54]:
per_date_counts = pd.Series(per_date_counts)

# Create a DataFrame from the Series
rides_per_day_df = per_date_counts.reset_index()
rides_per_day_df.columns = ['Date', 'Number of trips']


rides_per_day_df.head()

Unnamed: 0,Date,Number of trips
0,9/19/2014,15103
1,9/12/2014,14899
2,9/13/2014,14872
3,8/1/2014,14521
4,9/5/2014,14509


# Trips Summary

In [58]:
# total trips 
num_trips_per_date = rides_per_day_df["Number of trips"]
num_trips_per_date

0      15103
1      14899
2      14872
3      14521
4      14509
       ...  
262        1
263        1
264        1
265        1
266        1
Name: Number of trips, Length: 267, dtype: int64

In [55]:
# maximum trips
max_trips_per_date = rides_per_day_df.groupby("Date")["Number of trips"].max()
max_trips_per_date

Date
07/01/2014       11
07/02/2014        4
07/03/2014       12
07/04/2014        8
07/05/2014        3
              ...  
9/5/2014      14509
9/6/2014      13469
9/7/2014      12699
9/8/2014      13380
9/9/2014      12864
Name: Number of trips, Length: 267, dtype: int64

In [56]:
# minimum trips
min_trips_per_date = rides_per_day_df.groupby("Date")["Number of trips"].min()
min_trips_per_date

Date
07/01/2014       11
07/02/2014        4
07/03/2014       12
07/04/2014        8
07/05/2014        3
              ...  
9/5/2014      14509
9/6/2014      13469
9/7/2014      12699
9/8/2014      13380
9/9/2014      12864
Name: Number of trips, Length: 267, dtype: int64

In [48]:
# Date Range
start_date = rides_per_day_df["Date"].min()
end_date = rides_per_day_df["Date"].max()

In [49]:
# Total Days
total_days = len(rides_per_day_df["Date"].unique())

In [51]:
# Average Trips per Day
avg_trips_per_day = max_trips_per_date / total_days

In [52]:
# Most Active Day
most_active_dates_idx = rides_per_day_df.groupby('Date')['Number of trips'].idxmax()
most_active_dates = rides_per_day_df.loc[most_active_dates_idx, 'Date']

# Set the index to "Base" and rename the index
most_active_dates.index = most_active_dates_idx.index
most_active_dates.index.name = 'Date'

# Least Active Day
least_active_dates_idx = rides_per_day_df.groupby('Date')['Number of trips'].idxmin()
least_active_dates = rides_per_day_df.loc[least_active_dates_idx, 'Date']

# Set the index to "Base"
least_active_dates.index = least_active_dates_idx.index
least_active_dates.index.name = 'Date'


print("Most Active Days:"),
print(most_active_dates)

print("\nLeast Active Days:")
print(least_active_dates)

Most Active Days:
Date
07/01/2014    07/01/2014
07/02/2014    07/02/2014
07/03/2014    07/03/2014
07/04/2014    07/04/2014
07/05/2014    07/05/2014
                 ...    
9/5/2014        9/5/2014
9/6/2014        9/6/2014
9/7/2014        9/7/2014
9/8/2014        9/8/2014
9/9/2014        9/9/2014
Name: Date, Length: 267, dtype: object

Least Active Days:
Date
07/01/2014    07/01/2014
07/02/2014    07/02/2014
07/03/2014    07/03/2014
07/04/2014    07/04/2014
07/05/2014    07/05/2014
                 ...    
9/5/2014        9/5/2014
9/6/2014        9/6/2014
9/7/2014        9/7/2014
9/8/2014        9/8/2014
9/9/2014        9/9/2014
Name: Date, Length: 267, dtype: object


In [60]:
other_summary = pd.DataFrame({
    "Total Trips": num_trips_per_date,
    "Maximum Trips": max_trips_per_date,
    "Minimum Trips": min_trips_per_date,
    "Start Date": start_date,
    "End Date": end_date,
    "Total Days": total_days,
    "Average Trips/Day": avg_trips_per_day,
    "Most Active Day": most_active_dates,
    "Least Active Day": least_active_dates   
})

#other_summary = other_summary.set_index("Date")

other_summary

Unnamed: 0,Total Trips,Maximum Trips,Minimum Trips,Start Date,End Date,Total Days,Average Trips/Day,Most Active Day,Least Active Day
0,15103.0,,,07/01/2014,9/9/2014,267,,,
1,14899.0,,,07/01/2014,9/9/2014,267,,,
2,14872.0,,,07/01/2014,9/9/2014,267,,,
3,14521.0,,,07/01/2014,9/9/2014,267,,,
4,14509.0,,,07/01/2014,9/9/2014,267,,,
...,...,...,...,...,...,...,...,...,...
9/5/2014,,14509.0,14509.0,07/01/2014,9/9/2014,267,54.340824,9/5/2014,9/5/2014
9/6/2014,,13469.0,13469.0,07/01/2014,9/9/2014,267,50.445693,9/6/2014,9/6/2014
9/7/2014,,12699.0,12699.0,07/01/2014,9/9/2014,267,47.561798,9/7/2014,9/7/2014
9/8/2014,,13380.0,13380.0,07/01/2014,9/9/2014,267,50.112360,9/8/2014,9/8/2014
