In [1]:
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#ignore
#%config Completer.use_jedi = False

In [None]:
# import io
# from google.colab import files
# from google.colab import drive

# uploaded = files.upload()
# drive.mount('/content/drive')

# loading Dataset

In [None]:
# Read the January 2023 yellow-taxi trip records and print a column/dtype summary
TRIPS_PARQUET = "F:/ML_Project/Datasets/NYCtaxi/yellow_tripdata_2023-01.parquet"
trips = pd.read_parquet(TRIPS_PARQUET)
trips.info()

In [None]:
# Expose the row number as an 'index' column; later cells count it as a per-trip id.
# Guarded so that re-running this cell does not add a stray 'level_0' column
# (second run) or raise "cannot insert level_0, already exists" (third run).
if 'index' not in trips.columns:
    trips = trips.reset_index()

In [None]:
# Load the LocationID lookup table
zonedb = pd.read_csv('F:/ML_Project/Datasets/NYCtaxi/taxi+_zone_lookup.csv', header=0)
# set_index returns a NEW frame, so the result has to be assigned. Without the
# assignment the frame keeps its 0-based RangeIndex and any DataFrame.join(on=<id>)
# against it matches each LocationID to the wrong row.
zonedb = zonedb.set_index('LocationID')
zonedb.head()

# dataset description

In [None]:
# Preview the first five trip records
trips.head(n=5)

# column domain attributes

In [None]:
# Code -> label lookups for the coded columns of the trip records.
# NOTE(review): labels follow the TLC yellow-taxi data dictionary; confirm them
# against the dictionary version that matches this extract.

# VendorID: the provider that supplied the record
vendor_id = {
    1: "Creative Mobile Technologies, LLC",
    2: "VeriFone Inc",
}

# RatecodeID: the rate code in effect at the end of the trip
ratecodeid = {
    1: "Standard rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated fare",
    6: "Group ride",
}

store_and_fwd_flag = {
    "Y": "store and forward trip",
    "N": "not a store and forward trip",
}

# payment_type: the documented codes start at 1 (1 = Credit card, 2 = Cash, ...).
# The previous table started at 0 and so shifted every label by one.
# Code 0 also occurs in the data; its label is taken from the current
# dictionary -- TODO confirm it applies to 2023 records.
payment_type = {
    0: "Flex Fare trip",
    1: "Credit card",
    2: "Cash",
    3: "No charge",
    4: "Dispute",
    5: "Unknown",
    6: "Voided trip",
}

# ISO weekday number (Monday=1 ... Sunday=7) -> day name
weekd = {
    1: 'Monday',
    2: 'Tuesday',
    3: 'Wednesday',
    4: 'Thursday',
    5: 'Friday',
    6: 'Saturday',
    7: 'Sunday',
}

# Questionnaire
What do we want to find out?<br>
* Most popular mode of payment by volume?
* Busiest days of the week?
* Most popular pickup and dropoff locations?
* Revenue by rate code


# Most popular mode of payment by volume?

In [None]:
# Number of trips per payment code (counts the per-trip 'index' column)
db = trips.groupby('payment_type')[['index']].count()
# Replace the numeric codes in the index with their readable labels
db.index = [payment_type[code] for code in db.index]
db.sort_values('index', ascending=False)

In [None]:
# Visualize the payment-method shares as a pie chart.
n_slices = len(db)
# Pull the first and last wedges out of the pie as before, but size the list from
# the data: a hard-coded five-element list makes plt.pie raise whenever the number
# of payment categories present is not exactly five.
explode = [0.0] * n_slices
if n_slices:
    explode[0] = 0.5
    explode[-1] = 1
plt.pie(db['index'], explode=explode, autopct='%.0f%%', labels=db.index,
        colors=sns.color_palette('pastel', n_colors=n_slices),
        wedgeprops={'linewidth': 3, 'edgecolor': 'white'},
        textprops={'color': 'black'})
plt.title("Popular payment methods")
plt.show()

# Busiest days of the week?
Measured by number of trips.

In [None]:
# Independent copy holding only the trip id and the two timestamps
bdays = trips.loc[:, ['index', 'tpep_pickup_datetime', 'tpep_dropoff_datetime']].copy()
bdays.head()

In [None]:
# Data preprocessing: drop records whose pickup timestamp falls outside 2023.
# Trips per calendar month, grouped on a monthly Period. This avoids the
# resample('M') alias (deprecated in recent pandas) and lists only months that
# actually contain trips, instead of a zero row for every empty month in between.
pickup_month = bdays['tpep_pickup_datetime'].dt.to_period('M')
print("Whole dataset:\n", bdays.groupby(pickup_month)['index'].count())

# Keep 2023 only; .copy() so that adding columns later does not act on a slice
bdays = bdays[bdays['tpep_pickup_datetime'].dt.year == 2023].copy()
print("2023 data only: ", len(bdays), "instances")

### Weekday preprocessing
Adds a weekday column so that trips can be grouped by day of the week.

In [None]:
# Add the ISO weekday number (Monday=1 ... Sunday=7).
# .dt.dayofweek is 0-based (Monday=0), hence the +1. The vectorised accessor
# replaces a Python-level lambda that was called once per row.
bdays['weekday'] = bdays['tpep_pickup_datetime'].dt.dayofweek + 1

In [None]:
# Trips per weekday number, turned back into a two-column frame
db1 = bdays.groupby('weekday')['index'].count().reset_index()
# Replace the weekday number with its name
db1['weekday'] = [weekd[day] for day in db1['weekday']]
db1.sort_values('index', ascending=False)

In [None]:
# Colormap used to shade the weekday bars
colormap = plt.get_cmap('flare')
# Scale trip counts onto the colormap range (fewest trips -> one end, most -> the other)
normalize = plt.Normalize(db1['index'].min(), db1['index'].max())
colormap

In [None]:
# Horizontal bar chart of trips per weekday, shaded by trip count.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(y=db1['weekday'], width=db1['index'],
        color=colormap(normalize(db1['index'])))

# Colorbar showing how bar colour maps to trip count
cbar = fig.colorbar(plt.cm.ScalarMappable(cmap=colormap, norm=normalize), ax=ax, orientation='vertical')
cbar.set_label('Value')

# In a horizontal bar chart the bar length (trip count) runs along x and the
# categories (weekdays) along y; the two labels were previously swapped.
ax.set_xlabel('No of Trips')
ax.set_ylabel('Weekdays')
ax.set_title('Most busy Weekdays of 2023')

# Show the plot
plt.show()

# Most popular pickup and dropoff location?

In [None]:
# Keep 2023 pickups only, restricted to the id, location and pickup-time columns
picked_up_2023 = trips['tpep_pickup_datetime'].dt.year == 2023
plocs = trips.loc[picked_up_2023, ['index', 'PULocationID', 'DOLocationID', 'tpep_pickup_datetime']].copy()
plocs.shape

In [None]:
# How many distinct pickup / dropoff location IDs occur
n_pickup = plocs['PULocationID'].unique().size
n_dropoff = plocs['DOLocationID'].unique().size
print(f"UNIQUE Locations:\n pickup:{n_pickup}\n dropoff:{n_dropoff}")

In [None]:
# Trips per pickup location, most frequent first
plocs['PULocationID'].value_counts()

In [None]:
# Trips per dropoff location, most frequent first
plocs['DOLocationID'].value_counts()

In [None]:
# Add the ISO weekday number of the pickup (Monday=1 ... Sunday=7).
# .dt.dayofweek is 0-based (Monday=0), hence the +1. The vectorised accessor
# replaces a Python-level lambda that was called once per row.
plocs['weekdy'] = plocs['tpep_pickup_datetime'].dt.dayofweek + 1
plocs.info()

In [None]:
# Trip counts for every (weekday, pickup location, dropoff location) combination.
# Passing lists for values/aggfunc gives two-level columns: ('count', 'index').
route_counts = pd.pivot_table(
    data=plocs,
    index=['weekdy', 'PULocationID', 'DOLocationID'],
    values=['index'],
    aggfunc=['count'],
)

# For each weekday, the index key of the route with the most trips
max_counts = route_counts['count']['index'].groupby(['weekdy']).idxmax()
# Keep only those per-weekday top routes
db3 = route_counts.loc[max_counts]
print(db3)

In [None]:
# Most frequent (pickup, dropoff) pairs over the whole month
db2 = plocs[['PULocationID', 'DOLocationID']].value_counts()
db2.head()

In [None]:
# Attach zone names to the per-weekday top routes.
# DataFrame.join(on=...) matches against the OTHER frame's index, so the lookup
# has to be keyed by LocationID. Joining against a default 0-based index would
# attach the wrong zone to every ID.
zone_lookup = zonedb.set_index('LocationID') if 'LocationID' in zonedb.columns else zonedb

# Flatten the ('count', 'index') column into a plain 'Trips' column.
# db3 already holds only the per-weekday top routes at this point.
top_routes = db3['count']['index'].reset_index(name='Trips')

# Suffix the lookup columns themselves. lsuffix would rename the LEFT frame's
# columns, which labelled the pickup zone columns as " dropoff".
db3 = (
    top_routes
    .join(zone_lookup.add_suffix(' pickup'), on='PULocationID', how='inner')
    .join(zone_lookup.add_suffix(' dropoff'), on='DOLocationID', how='inner')
)
# Weekday number -> weekday name
db3['weekdy'] = db3['weekdy'].map(weekd)

In [None]:
# Per-weekday top routes, busiest first
db3.sort_values('Trips', ascending=False)

# Revenue by Ratecodes

In [None]:
# Preview the first five trip records again before the rate-code analysis
trips.head(n=5)

In [None]:
# Distinct rate codes present in the data
trips.RatecodeID.unique()

In [None]:
# Sum of total_amount per RatecodeID, largest first
revenue_by_ratecode = trips.pivot_table(index='RatecodeID', values='total_amount',
                                        aggfunc='sum', dropna=False)
db5 = revenue_by_ratecode.sort_values('total_amount', ascending=False)
db5

In [None]:
# Bar chart of revenue per rate code, shaded by revenue.
# Colormap and scaling for this chart
colormap2 = plt.get_cmap('flare')
normalize2 = plt.Normalize(vmin=db5['total_amount'].min(), vmax=db5['total_amount'].max())

fig, ax = plt.subplots(figsize=(8, 6))
# Uses colormap2 (defined in this cell) rather than a colormap left over from an earlier cell
bars = ax.bar(x=db5.index.astype('str'), height=db5['total_amount'],
              color=colormap2(normalize2(db5['total_amount'])))

# Bars with at least this much revenue get a rotated label inside the bar.
# The threshold is compared against the raw amount; it was previously compared
# against an already scaled-down value, so that branch could never run.
INSIDE_LABEL_MIN_REVENUE = 30_000_000

for bar, amount in zip(bars, db5['total_amount']):
    # "K" means thousands, so divide by 1000 (the old code divided by 10000)
    label = f"$ {amount / 1000:.2f} K"
    x_mid = bar.get_x() + bar.get_width() / 2
    if amount >= INSIDE_LABEL_MIN_REVENUE:
        # Tall bar: vertical label hanging down from the top edge, inside the bar
        ax.text(x_mid, bar.get_height(), label,
                ha='center', va='top', rotation='vertical')
    else:
        # Short bar: small horizontal label just above the bar
        ax.text(x_mid, bar.get_height(), label,
                ha='center', va='bottom', fontsize=8)

# Colorbar showing how bar colour maps to revenue
cbar = fig.colorbar(plt.cm.ScalarMappable(cmap=colormap2, norm=normalize2), ax=ax, orientation='vertical')
cbar.set_label('Revenue')

ax.set_xlabel('Ratecodes')
ax.set_ylabel('revenue')
plt.show()