In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import matplotlib.ticker as ticker
import plotly.express as px
from scipy import stats
from matplotlib.ticker import FuncFormatter
import warnings

# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")

# Load the trip-record CSV into a DataFrame.
df = pd.read_csv("/Users/amiraherlambang/Documents/CAPSTONE 2/NYC_TLC_Trip_Record.csv")

# Convert the pickup and drop-off timestamp columns to datetime values.
for timestamp_column in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

# Add the trip duration in minutes: (drop-off - pickup) in seconds, divided by 60.
elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
df['trip_duration'] = elapsed.dt.total_seconds() / 60

# Lookup tables translating the numeric vendor and payment codes into names.
vendor_names = {
    1: 'Creative Mobile Technologies, LLC',
    2: 'VeriFone Inc',
}
# Payment codes are consecutive integers starting at 1, so enumerate the names in order.
payment_names = dict(enumerate(
    ['Credit Card', 'Cash', 'No Charge', 'Dispute', 'Unknown', 'Voided Trip'],
    start=1,
))

# Add readable label columns next to the original code columns.
# Codes that are not present in a lookup table become NaN.
df['VendorName'] = df['VendorID'].map(vendor_names)
df['PaymentMethod'] = df['payment_type'].map(payment_names)

# 1. Total trips per vendor
# Count the rows for each vendor name and draw the counts as a bar chart.
total_trips_per_vendor = df['VendorName'].value_counts()
ax = total_trips_per_vendor.plot.bar(title='Total Trips per Vendor')
ax.set_xlabel('Vendor')
ax.set_ylabel('Number of Trips')
plt.xticks(rotation=45)
plt.show()

# 2. Total revenue per vendor
# Sum the total_amount column within each vendor group and chart the totals.
trips_by_vendor = df.groupby('VendorName')
total_revenue_per_vendor = trips_by_vendor['total_amount'].sum()
ax = total_revenue_per_vendor.plot.bar(title='Total Revenue per Vendor')
ax.set(xlabel='Vendor', ylabel='Total Revenue')
plt.xticks(rotation=45)
plt.show()

# 3. Average trip distance per vendor
# Mean of trip_distance for each vendor, shown as a bar chart.
average_trip_distance_per_vendor = (
    df.groupby('VendorName')['trip_distance']
    .mean()
)
ax = average_trip_distance_per_vendor.plot.bar(
    title='Average Trip Distance per Vendor',
)
ax.set_xlabel('Vendor')
ax.set_ylabel('Average Trip Distance (miles)')
plt.xticks(rotation=45)
plt.show()

# 4. Average trip duration per vendor
# Mean of the derived trip_duration column (minutes) for each vendor.
duration_by_vendor = df.groupby('VendorName')['trip_duration']
average_trip_duration_per_vendor = duration_by_vendor.mean()
ax = average_trip_duration_per_vendor.plot.bar(
    title='Average Trip Duration per Vendor',
)
ax.set(xlabel='Vendor', ylabel='Average Trip Duration (minutes)')
plt.xticks(rotation=45)
plt.show()

# 5. Frequency of payment method per vendor
# Count trips for every (vendor, payment method) pair, then pivot the payment
# methods into columns so each vendor becomes one stacked bar.
# fill_value=0 records a vendor/payment combination that never occurs as a
# count of zero; a plain unstack() would leave NaN there and turn the whole
# table into floats.
payment_freq_per_vendor = (
    df.groupby(['VendorName', 'PaymentMethod'])
    .size()
    .unstack(fill_value=0)
)
payment_freq_per_vendor.plot(kind='bar', stacked=True, title='Frequency of Payment Method per Vendor')
plt.xlabel('Vendor')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.legend(title='Payment Method')
plt.show()

# 6. Total revenue per payment method
# Sum the total_amount column within each payment-method group and chart it.
trips_by_payment = df.groupby('PaymentMethod')
total_revenue_per_payment = trips_by_payment['total_amount'].sum()
ax = total_revenue_per_payment.plot.bar(title='Total Revenue per Payment Method')
ax.set_xlabel('Payment Method')
ax.set_ylabel('Total Revenue')
plt.xticks(rotation=45)
plt.show()

# 7. Average trip distance by payment method
# Mean of trip_distance for each payment method, shown as a bar chart.
average_trip_distance_per_payment = (
    df.groupby('PaymentMethod')['trip_distance']
    .mean()
)
ax = average_trip_distance_per_payment.plot.bar(
    title='Average Trip Distance by Payment Method',
)
ax.set(xlabel='Payment Method', ylabel='Average Trip Distance (miles)')
plt.xticks(rotation=45)
plt.show()

# 8. Average trip duration by payment method
# Mean of the derived trip_duration column (minutes) for each payment method.
duration_by_payment = df.groupby('PaymentMethod')['trip_duration']
average_trip_duration_per_payment = duration_by_payment.mean()
ax = average_trip_duration_per_payment.plot.bar(
    title='Average Trip Duration by Payment Method',
)
ax.set_xlabel('Payment Method')
ax.set_ylabel('Average Trip Duration (minutes)')
plt.xticks(rotation=45)
plt.show()

# 9. Daily passenger count for January 2023
# Keep only the trips whose pickup timestamp falls in January 2023.
pickup_time = df['lpep_pickup_datetime']
january_data = df[(pickup_time.dt.year == 2023) & (pickup_time.dt.month == 1)]

# Sum passengers per calendar day. Grouping on midnight-normalized timestamps
# (instead of Python date objects) gives the result a DatetimeIndex, so it
# lines up directly with the date_range used for reindexing below.
daily_passenger_count = january_data.groupby(
    january_data['lpep_pickup_datetime'].dt.normalize()
)['passenger_count'].sum()

# Build the full list of days in January 2023.
date_range = pd.date_range(start='2023-01-01', end='2023-01-31')
# Align the daily totals to that list; days without any trips get 0.
daily_passenger_count = daily_passenger_count.reindex(date_range, fill_value=0)

# Plot the series. The title is set once via plt.title; passing another title
# to .plot() would simply be overwritten.
daily_passenger_count.plot()
plt.xlabel('Date')
plt.ylabel('Passenger Count')
plt.xticks(rotation=45)
plt.title('Daily Passenger Count in January 2023')
plt.grid()
plt.show()

# 10. Number of trips per day of the week
# Label each trip with the weekday name of its pickup timestamp.
df['pickup_day'] = df['lpep_pickup_datetime'].dt.day_name()

# Count trips per weekday and arrange the counts Monday through Sunday
# (value_counts alone would order them by frequency).
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
trips_per_day = df['pickup_day'].value_counts().reindex(weekday_order)

ax = trips_per_day.plot.bar(title='Number of Trips per Day of the Week')
ax.set(xlabel='Day of the Week', ylabel='Number of Trips')
plt.xticks(rotation=45)
plt.show()

# 11. Number of trips per hour
# Extract the pickup hour (0-23) from each pickup timestamp.
df['pickup_hour'] = df['lpep_pickup_datetime'].dt.hour

# Count trips per hour and align the counts to all 24 hours in order.
# An hour with no pickups is recorded as 0 rather than being left out, so the
# line drops to zero there instead of being drawn straight across the gap.
trips_per_hour = df['pickup_hour'].value_counts().reindex(range(24), fill_value=0)

trips_per_hour.plot(title='Number of Trips per Hour')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Trips')
plt.xticks(rotation=45)
plt.show()

# 12. Top 10 Pickup Points for Both Vendors
# value_counts orders locations by descending frequency, so the first ten
# entries are the ten most frequent pickup location IDs.
pickup_location_counts = df['PULocationID'].value_counts()
top_pickup_points = pickup_location_counts.iloc[:10]
ax = top_pickup_points.plot.bar(title='Top 10 Pickup Points')
ax.set_xlabel('Pickup Location ID')
ax.set_ylabel('Number of Trips')
plt.xticks(rotation=45)
plt.show()

# 13. Top 10 Dropoff Points for Both Vendors
# value_counts orders locations by descending frequency, so the first ten
# entries are the ten most frequent drop-off location IDs.
dropoff_location_counts = df['DOLocationID'].value_counts()
top_dropoff_points = dropoff_location_counts.iloc[:10]
ax = top_dropoff_points.plot.bar(title='Top 10 Dropoff Points')
ax.set(xlabel='Dropoff Location ID', ylabel='Number of Trips')
plt.xticks(rotation=45)
plt.show()