# NYC Taxi Data - Exploratory Data Analysis

Exploratory analysis of NYC taxi trip counts, aggregated into 30-minute intervals, covering July 2014 to January 2015.

In [None]:
import sys
import os
# Make the project-local packages imported further down (`utils`,
# `visualization`) importable. The entry is a relative path, so it is
# resolved against the working directory the kernel was started in.
sys.path.append(os.path.join('..', 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from utils.data_loader import load_time_series, validate_time_series, create_time_features
from visualization.plots import plot_time_series, plot_decomposition, plot_seasonal_patterns, plot_acf_pacf

# Global figure appearance: matplotlib's 'seaborn-v0_8' style sheet plus
# seaborn's "husl" colour palette for the plots drawn below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

# Suppress warnings
# NOTE(review): 'ignore' with no category/module filter silences every
# warning raised after this point (deprecations included) — consider
# narrowing the filter to the specific warnings that are known noise.
import warnings
warnings.filterwarnings('ignore')

## 1. Data Loading and Basic Information

In [None]:
# Read the raw CSV, parse its 'timestamp' column and promote it to the index
df = (
    pd.read_csv('../data/raw/nyc_taxi.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .set_index('timestamp')
)

# Report the size and time span of the loaded series
series_start, series_end = df.index.min(), df.index.max()
print(f"Data shape: {df.shape}")
print(f"Date range: {series_start} to {series_end}")
print(f"Total days: {(series_end - series_start).days}")
print(f"Frequency: {pd.infer_freq(df.index)}")

# Preview the first ten rows (rendered as the cell output)
df.head(10)

In [None]:
# Run the project's validation helper on the 'value' column, then list every
# entry of the returned mapping on its own aligned row
validation_results = validate_time_series(df, value_col='value')
print("Data Validation Results:", "=" * 30, sep="\n")
for check_name, outcome in validation_results.items():
    print(f"{check_name:20}: {outcome}")

In [None]:
# Summary statistics of the 'value' column, as produced by Series.describe()
value_summary = df['value'].describe()
print("Descriptive Statistics:", "=" * 25, sep="\n")
print(value_summary)

## 2. Time Series Visualization

In [None]:
# Overview: draw the whole series in one wide figure via the project helper
plot_time_series(
    df,
    value_col='value',
    title='NYC Taxi Trips - Complete Series',
    figsize=(15, 6),
)

In [None]:
# The same series as an interactive Plotly line chart.
# reset_index() turns the 'timestamp' index back into a column for px.line.
fig = px.line(
    df.reset_index(),
    x='timestamp',
    y='value',
    title='NYC Taxi Trips - Interactive View',
).update_layout(xaxis_title='Date', yaxis_title='Number of Taxi Trips')
fig.show()

In [None]:
# Close-up of the first 7 days (48 half-hour rows per day = 336 rows) so the
# within-day shape is visible
first_week = df.head(7 * 48)
plot_time_series(
    first_week,
    value_col='value',
    title='NYC Taxi Trips - First Week Detail',
    figsize=(15, 6),
)

## 3. Seasonal Patterns Analysis

In [None]:
# Derive calendar features with the project helper; the later cells group
# by its 'hour' and 'day_of_week' columns
df_features = create_time_features(df)
feature_names = df_features.columns.tolist()
print(f"Features created: {feature_names}")

In [None]:
# Seasonal views drawn by the project helper.
# NOTE(review): freq='all' presumably requests every supported granularity —
# confirm against visualization.plots.plot_seasonal_patterns.
plot_seasonal_patterns(
    df,
    value_col='value',
    freq='all',
    figsize=(16, 12),
)

In [None]:
# Mean / spread / extremes of 'value' for each level of the 'hour' feature
hourly_avg = df_features.groupby('hour')['value'].agg(['mean', 'std', 'min', 'max'])
hourly_mean = hourly_avg['mean']
hourly_std = hourly_avg['std']

# Average curve with a shaded one-standard-deviation band around it
plt.figure(figsize=(15, 8))
hourly_ax = plt.gca()
hourly_ax.plot(
    hourly_avg.index, hourly_mean, 'o-',
    linewidth=2, markersize=6, label='Average',
)
hourly_ax.fill_between(
    hourly_avg.index,
    hourly_mean - hourly_std,
    hourly_mean + hourly_std,
    alpha=0.3, label='±1 Std Dev',
)
hourly_ax.set_xlabel('Hour of Day')
hourly_ax.set_ylabel('Number of Taxi Trips')
hourly_ax.set_title('NYC Taxi Trips - Hourly Pattern with Variability')
hourly_ax.set_xticks(range(0, 24))  # one tick per hour
hourly_ax.legend()
hourly_ax.grid(True, alpha=0.3)
plt.show()

# The five hours with the highest average
print("Peak hours:")
print(hourly_mean.nlargest(5))

In [None]:
# Day of week analysis
# Mean and standard deviation of 'value' per 'day_of_week' level, drawn as
# bars with +/- 1 std error bars. The bars are labelled Monday..Sunday in row
# order.
# NOTE(review): this assumes 'day_of_week' is encoded 0=Monday..6=Sunday and
# that all seven days are present — confirm in create_time_features.
dow_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_avg = df_features.groupby('day_of_week')['value'].agg(['mean', 'std'])

plt.figure(figsize=(12, 6))
bars = plt.bar(dow_names, dow_avg['mean'], yerr=dow_avg['std'],
               capsize=5, alpha=0.7, color='skyblue', edgecolor='navy')
plt.xlabel('Day of Week')
plt.ylabel('Average Number of Taxi Trips')
plt.title('NYC Taxi Trips - Day of Week Pattern')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3, axis='y')

# Add value labels on bars.
# bar_label anchors each label beyond the end of the bar's error bar, with
# the padding given in points; a fixed data-unit offset above the bar top
# would place the text on top of the error-bar whisker instead.
plt.bar_label(bars, fmt='%.0f', padding=3)
# Extra vertical margin so the labels above the error bars have room.
plt.margins(y=0.1)

plt.tight_layout()
plt.show()

## 4. Time Series Decomposition

In [None]:
# Additive decomposition with a one-day seasonal period
# (48 half-hour observations per day), drawn by the project helper
plot_decomposition(
    df,
    value_col='value',
    period=48,
    model='additive',
    figsize=(15, 12),
)

In [None]:
# Weekly seasonality analysis
from statsmodels.tsa.seasonal import seasonal_decompose

# One weekly cycle = 7 days x 48 half-hour observations = 336 rows
weekly_period = 48 * 7
weekly_decomp = seasonal_decompose(df['value'], model='additive', period=weekly_period)

# 2x2 grid: trend | one seasonal cycle / residual series | residual histogram
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
(trend_ax, seasonal_ax), (resid_ax, hist_ax) = axes

weekly_decomp.trend.plot(ax=trend_ax, title='Weekly Trend')
# Only the first cycle of the seasonal component is drawn
weekly_decomp.seasonal.head(weekly_period).plot(
    ax=seasonal_ax, title='Weekly Seasonal Pattern (First Week)'
)
weekly_decomp.resid.plot(ax=resid_ax, title='Residuals')

# Residual distribution; NaN residuals are dropped before binning
residuals = weekly_decomp.resid.dropna()
hist_ax.hist(residuals, bins=50, alpha=0.7)
hist_ax.set(
    title='Distribution of Residuals',
    xlabel='Residual Value',
    ylabel='Frequency',
)

plt.tight_layout()
plt.show()

## 5. Autocorrelation Analysis

In [None]:
# Autocorrelation and partial autocorrelation up to lag 100, drawn by the
# project helper
plot_acf_pacf(
    df,
    value_col='value',
    lags=100,
    figsize=(15, 10),
)

## 6. Stationarity Analysis

In [None]:
from utils.preprocessing import check_stationarity, make_stationary

# Stationarity test on the raw (undifferenced) series; every entry of the
# returned mapping is printed on its own aligned row
print("Stationarity Test - Original Series:", "=" * 40, sep="\n")
stationarity_result = check_stationarity(df['value'])
for metric, outcome in stationarity_result.items():
    print(f"{metric:20}: {outcome}")

# Headline verdict, repeated separately for quick reading
original_is_stationary = stationarity_result['is_stationary']
print(f"\nIs stationary: {original_is_stationary}")

In [None]:
# Difference the series with the project helper, then repeat the test on
# the resulting 'value_diff' column
df_diff = make_stationary(df, value_col='value', method='diff')

print("Stationarity Test - After Differencing:", "=" * 42, sep="\n")
# NaN rows in 'value_diff' are dropped before testing
differenced_values = df_diff['value_diff'].dropna()
diff_stationarity = check_stationarity(differenced_values)
for metric, outcome in diff_stationarity.items():
    print(f"{metric:20}: {outcome}")

differenced_is_stationary = diff_stationarity['is_stationary']
print(f"\nIs stationary: {differenced_is_stationary}")

In [None]:
# Stacked comparison: raw series on top, differenced series underneath
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

# Top panel: original values
ax1.plot(df.index, df['value'])
ax1.set(title='Original Series', ylabel='Number of Trips')
ax1.grid(True, alpha=0.3)

# Bottom panel: the 'value_diff' column; only this panel carries the x label
ax2.plot(df_diff.index, df_diff['value_diff'])
ax2.set(
    title='Differenced Series (First Difference)',
    ylabel='Change in Trips',
    xlabel='Date',
)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Outlier Detection

In [None]:
from utils.data_loader import detect_outliers

# Flag outliers with the project helper (IQR method, threshold 1.5); the
# result is used below as a row mask over df
outliers = detect_outliers(df, value_col='value', method='iqr', threshold=1.5)

outlier_count = outliers.sum()
print(f"Number of outliers detected: {outlier_count}")
print(f"Percentage of outliers: {(outlier_count / len(df)) * 100:.2f}%")

# The ten flagged rows with the largest 'value'
outlier_data = df[outliers]
print("\nTop 10 outlier values:")
print(outlier_data.nlargest(10, 'value'))

In [None]:
# Series with the flagged rows highlighted
plt.figure(figsize=(15, 8))
outlier_ax = plt.gca()

# Split the frame into unflagged and flagged rows using the outlier mask
normal_points = df[~outliers]
outlier_points = df[outliers]

# Unflagged rows as a line, flagged rows as red markers drawn on top (zorder)
outlier_ax.plot(
    normal_points.index, normal_points['value'],
    color='blue', alpha=0.7, label='Normal',
)
outlier_ax.scatter(
    outlier_points.index, outlier_points['value'],
    color='red', s=30, alpha=0.8, label='Outliers', zorder=5,
)

outlier_ax.set_title('NYC Taxi Trips with Outliers Highlighted')
outlier_ax.set_xlabel('Date')
outlier_ax.set_ylabel('Number of Trips')
outlier_ax.legend()
outlier_ax.grid(True, alpha=0.3)
plt.show()

## 8. Key Insights Summary

In [None]:
# Consolidated text summary of the findings computed in the cells above.
# Requires df, hourly_avg, dow_avg, dow_names, outliers and
# stationarity_result to be defined already.
print("NYC TAXI DATA - KEY INSIGHTS")
print("=" * 50)

print("📊 Dataset Overview:")
print(f"   • Period: {df.index.min().strftime('%Y-%m-%d')} to {df.index.max().strftime('%Y-%m-%d')}")
print(f"   • Total records: {len(df):,}")
print("   • Frequency: 30-minute intervals")
print(f"   • Missing values: {df['value'].isnull().sum()}")

print("\n📈 Trip Volume:")
print(f"   • Average trips per 30min: {df['value'].mean():.0f}")
print(f"   • Peak trips (30min): {df['value'].max():,}")
print(f"   • Minimum trips (30min): {df['value'].min():,}")
print(f"   • Total trips in dataset: {df['value'].sum():,}")

print("\n🕐 Daily Patterns:")
hourly_mean = hourly_avg['mean']
peak_hour = hourly_mean.idxmax()
low_hour = hourly_mean.idxmin()
# idxmax/idxmin return index labels, so look the values up with .loc
peak_value = hourly_mean.loc[peak_hour]
low_value = hourly_mean.loc[low_hour]
print(f"   • Peak hour: {peak_hour}:00 ({peak_value:.0f} trips/30min avg)")
print(f"   • Lowest hour: {low_hour}:00 ({low_value:.0f} trips/30min avg)")
print(f"   • Rush hour ratio: {peak_value / low_value:.1f}x")

print("\n📅 Weekly Patterns:")
dow_mean = dow_avg['mean']
busiest_day = dow_mean.idxmax()
quietest_day = dow_mean.idxmin()
print(f"   • Busiest day: {dow_names[busiest_day]} ({dow_mean.loc[busiest_day]:.0f} avg trips/30min)")
print(f"   • Quietest day: {dow_names[quietest_day]} ({dow_mean.loc[quietest_day]:.0f} avg trips/30min)")
# Pick weekend rows by their day-of-week label (5 and 6 map to Saturday and
# Sunday in dow_names) rather than by row position, so the split stays tied
# to the same labels used for the busiest/quietest lookups above.
is_weekend = dow_mean.index >= 5
weekend_ratio = dow_mean[is_weekend].mean() / dow_mean[~is_weekend].mean()
print(f"   • Weekend vs Weekday ratio: {weekend_ratio:.2f}")

print("\n🔍 Data Quality:")
outlier_count = outliers.sum()
print(f"   • Outliers detected: {outlier_count} ({(outlier_count/len(df)*100):.1f}%)")
print(f"   • Series is stationary: {stationarity_result['is_stationary']}")
# NOTE(review): the two seasonality lines are fixed text, not computed from
# the decomposition results.
print("   • Strong daily seasonality: Yes (period=48)")
print("   • Strong weekly seasonality: Yes (period=336)")

print("\n💡 Modeling Recommendations:")
print("   • Use seasonal models (SARIMA, Exponential Smoothing)")
print("   • Include hourly and daily features for ML models")
print("   • Consider differencing for stationarity")
print("   • Handle outliers with robust methods")
print("   • Cross-validate with time-aware splits")

## Next Steps

1. **Data Preprocessing**: Clean outliers, handle any missing values
2. **Feature Engineering**: Create lag features, rolling statistics, time-based features
3. **Model Development**: Try different forecasting approaches (SARIMA, Prophet, ML models)
4. **Model Evaluation**: Use appropriate time series validation techniques
5. **Deployment**: Create a forecasting pipeline for real-time predictions

Continue to the next notebook: `nyc_taxi_forecasting.ipynb`