## Installation and Setup

Install Modin with Ray backend

In [1]:
# Install Modin with Ray backend
%pip install -q modin[ray] pyarrow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import modin.pandas as mpd
import numpy as np
import time
import os
import warnings

warnings.filterwarnings("ignore")

print(f"Pandas version: {pd.__version__}")
print(f"Modin importted suggessfully")

Pandas version: 2.3.3
Modin importted suggessfully


## Basic Operations with Modin

In [3]:
# Create Sample Data
np.random.seed(42)
n_rows = 1000000
n_cols = 20

data = np.random.randn(n_rows, n_cols)
columns = [f"col_{i}" for i in range(n_cols)]

# Create Pandas DataFrame
pdf = pd.DataFrame(data, columns=columns)
print(f"pandas Dataframe: {pdf.shape}")
print(f"Type: {type(pdf)}")

# Create Modin DataFrame
mdf = mpd.DataFrame(data, columns=columns)
print(f"Modin Dataframe: {mdf.shape}")
print(f"Type: {type(mdf)}")

pandas Dataframe: (1000000, 20)
Type: <class 'pandas.core.frame.DataFrame'>


2026-01-30 11:23:05,910	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2026-01-30 11:23:11,906	INFO worker.py:2007 -- Started a local Ray instance.
2026-01-30 11:23:16,420	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Modin Dataframe: (1000000, 20)
Type: <class 'modin.pandas.dataframe.DataFrame'>


## Verifying API Compatibility

In [4]:
# Verify that operations are identical
print("Comparing pandas and Modin results:\n")

# Basic statistics
print("Mean of col_0:")
print(f"Pandas: {pdf['col_0'].mean():.10f}")
print(f"Modin: {mdf['col_0'].mean():.10f}")

print("\nSum of col_1:")
print(f"Pandas: {pdf['col_1'].sum():.10f}")
print(f"Modin: {mdf['col_1'].sum():.10f}")

print("\nStandard Deviation of col_2:")
print(f"Pandas: {pdf['col_2'].std():.10f}")
print(f"Modin: {mdf['col_2'].std():.10f}")

print("\nResults are identical!")

# More Operations 
print("Additional operations:\n")

#Filtering
pdf_filtered = pdf[pdf['col_0'] > 0]
mdf_filtered = mdf[mdf['col_0'] > 0]
print(f"Filtered rows (col_0 > 0):")
print(f" pandas: {len(pdf_filtered):}")
print(f"Modin: {len(mdf_filtered):}")

# Sorting
pdf_sorted = pdf.sort_values('col_0')
mdf_sorted = mdf.sort_values('col_0')
print(f"\nFirst value after sorting:")
print(f" pandas: {pdf_sorted['col_0'].iloc[0]:.6f}")
print(f" modin: {mdf_sorted['col_0'].iloc[0]:.6f}")

# GroupBy
pdf['group'] = np.random.choice(['A', 'B', 'C'], len(pdf))
mdf['group'] = np.random.choice(['A', 'B', 'C'], len(mdf))
print("\nGroupBy mean (group A, col_0):")
print(f" pandas: {pdf.groupby('group')['col_0'].mean()['A']:.6f}")
print(f" modin: {mdf.groupby('group')['col_0'].mean()['A']:.6f}")

Comparing pandas and Modin results:

Mean of col_0:
Pandas: -0.0003920755
Modin: -0.0003920755

Sum of col_1:
Pandas: 623.7120710839
Modin: 623.7120710839

Standard Deviation of col_2:
Pandas: 0.9990888287
Modin: 0.9990888287

Results are identical!
Additional operations:

Filtered rows (col_0 > 0):
 pandas: 499599
Modin: 499599

First value after sorting:
 pandas: -4.705028
 modin: -4.705028

GroupBy mean (group A, col_0):
 pandas: -0.001595
 modin: -0.002744


## Performance Comparison with pandas

In [5]:
def benchmark(func, name, n_runs=3):
    """Run function multiple times."""
    times = []

    for _ in range(n_runs):
        start = time.time()
        result = func()
        times.append(time.time() - start)
    
    avg_time = sum(times) / len(times)
    return avg_time


def compare_performance(pandas_func, modin_func, operation_name, n_runs=3):
    """Compare pandas and Modin performance"""
    pandas_time = benchmark(pandas_func, "pandas", n_runs)
    modin_time = benchmark(modin_func, "modin", n_runs)
    speedup = pandas_time / modin_time if modin_time > 0 else float('inf')

    print(f"{operation_name}:")
    print(f" pandas: {pandas_time:.4f} seconds")
    print(f" Modin: {modin_time:.4f} seconds")
    print(f" Speedup: {speedup:.2f}x")
    print()

    return pandas_time, modin_time

# Create larger DataFrames for meaningful benchmarks
np.random.seed(42)
n_rows = 2000000
n_cols = 20

print(f"Creating DataFrames with {n_rows:} rows and {n_cols} columns...")
print("This may take a while...")

data = np.random.randn(n_rows, n_cols)
columns = [f"col_{i}" for i in range(n_cols)]
pdf_bench = pd.DataFrame(data, columns=columns)
mdf_bench = mpd.DataFrame(data, columns=columns)

# Add categorical column for groupby tests
categories = np.random.choice(['A', 'B', 'C', 'D', 'E'], n_rows)
pdf_bench['category'] = categories
mdf_bench['category'] = categories

print(f"DataFrames created: {n_rows: } rows x {n_cols + 1} columns\n")
print("="*50)
print("PERFORMANCE BENCHMARKS")
print("="*50 + "\n")

# Benchmark 1: Column Statistics
compare_performance(
    lambda: pdf_bench.mean(numeric_only=True),
    lambda: mdf_bench.mean(numeric_only=True),
    "Column-wise mean (all columns)"
)

# Benchmark 2: Row-wise Operations
compare_performance(
    lambda: pdf_bench.mean(axis=1, numeric_only=True),
    lambda: mdf_bench.mean(axis=1, numeric_only=True),
    "Row-wise sum"
)

# Benchmark 3: Boolean Filtering
compare_performance(
    lambda: pdf_bench[pdf_bench['col_0'] > 0],
    lambda: mdf_bench[mdf_bench['col_0'] > 0],
    "Boolean filtering (col_0 > 0)"
)

# Benchmark 4: GroupBy Aggregation
compare_performance(
    lambda: pdf_bench.groupby('category').mean(),
    lambda: mdf_bench.groupby('category').mean(),
    "GroupBy mean (category)"
)

# Benchmark 5: Sorting
compare_performance(
    lambda: pdf_bench.sort_values('col_0'),
    lambda: mdf_bench.sort_values('col_0'),
    "Sorting by col"
)

# Benchmark 6: Apply Function
compare_performance(
    lambda: pdf_bench['col_0'].apply(lambda x: x**2),
    lambda: mdf_bench['col_0'].apply(lambda x: x**2),
    "Apply function (square values)"
)

# Benchmark 7: Multiple Aggregations
compare_performance(
    lambda: pdf_bench.groupby('category').agg({'col_0': 'mean', 'col_1': 'sum', 'col_2': 'std'}),
    lambda: mdf_bench.groupby('category').agg({'col_0': 'mean', 'col_1': 'sum', 'col_2': 'std'}),
    "GroupBy with multiple aggregations"
)

Creating DataFrames with 2000000 rows and 20 columns...
This may take a while...
DataFrames created:  2000000 rows x 21 columns

PERFORMANCE BENCHMARKS

Column-wise mean (all columns):
 pandas: 0.1323 seconds
 Modin: 0.0258 seconds
 Speedup: 5.12x

Row-wise sum:
 pandas: 0.3542 seconds
 Modin: 0.0566 seconds
 Speedup: 6.26x

Boolean filtering (col_0 > 0):
 pandas: 0.3130 seconds
 Modin: 0.0487 seconds
 Speedup: 6.43x

GroupBy mean (category):
 pandas: 0.2653 seconds
 Modin: 0.0485 seconds
 Speedup: 5.47x

Sorting by col:
 pandas: 0.8981 seconds
 Modin: 0.6470 seconds
 Speedup: 1.39x

Apply function (square values):
 pandas: 0.9217 seconds
 Modin: 0.2341 seconds
 Speedup: 3.94x

GroupBy with multiple aggregations:
 pandas: 0.2982 seconds
 Modin: 0.5193 seconds
 Speedup: 0.57x



(0.29824113845825195, 0.519294023513794)

# Exercise 1: Basic Modin Operations
Practice using Modin with basic DataFrame operations.

### Task 1.1
Create a Modin DataFrame with 500,000 rows and 10 columns of random data. Then:
- Calculate the mean and standard deviation of each column
- Filter rows where the first column is between -1 and 1
- Add a new column that is the sum of the first three columns

In [6]:
# Create Modin DataFrame with 500,000 rows and 10 columns of random data
np.random.seed(42)
n_rows = 500000
n_cols = 10
data = np.random.randn(n_rows, n_cols)
columns = [f"col_{i}" for i in range(n_cols)]
mdf = mpd.DataFrame(data, columns=columns)

#Calculate Mean
mean_values = mdf.mean()
print("Mean of each column:")
print(mean_values)

# Calculate Standard Deviation
std_values = mdf.std()
print("\nStandard Deviation of each column:")
print(std_values)

# Filter rows where the first column is between -1 and 1
filtered_df = mdf[(mdf['col_0'] > -1) & (mdf['col_0'] < 1)]
print("\nFiltered DataFrame:")
print(filtered_df)

# New column with sum of first three columns
mdf['sum_first_three'] = mdf[mdf.columns[:3]].sum(axis=1)
print("\nDataFrame with new column:")
print(mdf)


Mean of each column:
col_0   -0.000412
col_1    0.002796
col_2   -0.004849
col_3    0.000956
col_4   -0.000384
col_5   -0.000680
col_6    0.000900
col_7   -0.000710
col_8    0.000484
col_9    0.000085
dtype: float64

Standard Deviation of each column:
col_0    1.000225
col_1    1.000111
col_2    1.000334
col_3    1.000228
col_4    0.999932
col_5    1.000973
col_6    1.000462
col_7    1.000395
col_8    0.998399
col_9    1.000080
dtype: float64

Filtered DataFrame:
           col_0     col_1     col_2     col_3     col_4     col_5     col_6  \
0       0.496714 -0.138264  0.647689  1.523030 -0.234153 -0.234137  1.579213   
1      -0.463418 -0.465730  0.241962 -1.913280 -1.724918 -0.562288 -1.012831   
3      -0.601707  1.852278 -0.013497 -1.057711  0.822545 -1.220844  0.208864   
4       0.738467  0.171368 -0.115648 -0.301104 -1.478522 -0.719844 -0.460639   
5       0.324084 -0.385082 -0.676922  0.611676  1.031000  0.931280 -0.839218   
...          ...       ...       ...       ...      

## Working with Large Datasets

### Download NYC Taxi Dataset

In [7]:
# NYC Taxi Dataset
import urllib.request

url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet" 
filename = "yellow_tripdata_2023-01.parquet"

# Download the dataset
if not os.path.exists(filename):
    print(f"Downloading NYC Taxi data from TLC website...")
    urllib.request.urlretrieve(url, filename)
    print(f"Data saved to {filename}")
else:
    print(f"File {filename} already exists.")

File yellow_tripdata_2023-01.parquet already exists.


### Reading Large File: Pandas vs Modin

In [8]:
#Benchmark: Reading file
print("="*50)
print("Benchmark: Reading file")
print("="*50 + "\n")

# Pandas read
start = time.time()
pdf_taxi = pd.read_parquet(filename)
pandas_read_time = time.time() - start
print(f" pandas read_time: {pandas_read_time:.2f} seconds")
print(f" Shape: {pdf_taxi.shape}")
print(f" Memory: {pdf_taxi.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Modin read
start = time.time()
mdf_taxi = mpd.read_parquet(filename)
modin_read_time = time.time() - start
print(f" Modin read_time: {modin_read_time:.2f} seconds")
print(f" Shape: {mdf_taxi.shape}")
print(f" Memory: {mdf_taxi.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print(f" \nSpeedup: {pandas_read_time / modin_read_time:.2f}x")

# View the data structure
print("Dataset columns:")
print(mdf_taxi.columns.tolist())
print(f"\nDataset info:")
print(f" Records: {len(mdf_taxi):,}")
print(f" Speedup: {pandas_read_time / modin_read_time:.2f}x\n")

# Preview the data
print("First few rows:")
mdf_taxi.head()

Benchmark: Reading file

 pandas read_time: 2.26 seconds
 Shape: (3066766, 19)
 Memory: 588.46 MB
 Modin read_time: 1.67 seconds
 Shape: (3066766, 19)
 Memory: 588.46 MB
 
Speedup: 1.35x
Dataset columns:
['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee']

Dataset info:
 Records: 3,066,766
 Speedup: 1.35x

First few rows:


[36m(pid=gcs_server)[0m [2026-01-30 11:23:41,513 E 38260 33632] (gcs_server.exe) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


### CSV Version for Additional Benchmarks

In [9]:
# Save as CSV for benchmarking
csv_filename = "taxi_data.csv"

if not os.path.exists(csv_filename):
    print(f"Creating CSV file for benchmarking...")
    pdf_taxi.to_csv(csv_filename, index=False)
    print(f"Created: {csv_filename}")

print(f"CSV file size: {os.path.getsize(csv_filename) / 1024**2:.2f} MB")

# Benchmark: Reading CSV files
print("="*50)
print("Benchmark: Reading CSV File")
print("="*50 + "\n")

# Pandas read
start = time.time()
pdf_csv = pd.read_csv(csv_filename)
pandas_csv_time = time.time() - start
print(f" pandas read_time: {pandas_csv_time:.2f} seconds")

# Modin read
start = time.time()
mdf_csv = mpd.read_csv(csv_filename)
modin_csv_time = time.time() - start
print(f" Modin read_time: {modin_csv_time:.2f} seconds")

print(f" \nSpeedup: {pandas_csv_time / modin_csv_time:.2f}x")


CSV file size: 309.97 MB
Benchmark: Reading CSV File



[33m(raylet)[0m [2026-01-30 11:23:44,113 E 40708 33388] (raylet.exe) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14


 pandas read_time: 9.56 seconds
 Modin read_time: 3.08 seconds
 
Speedup: 3.10x


### Loading Multiple Months of Data

In [10]:
months = ['2023-01', '2023-02']
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{}.parquet"

parquet_files = []
total_size = 0

for month in months:
    filename = f"yellow_tripdata_{month}.parquet"
    parquet_files.append(filename)

    if not os.path.exists(filename):
        url = base_url.format(month)
        print(f"Downloading {filename}...")
        urllib.request.urlretrieve(url, filename)
        size_mb = os.path.getsize(filename) / 1024**2
        print(f" Downloaded: {size_mb:.2f} MB")
        total_size += size_mb
    else:
        size_mb = os.path.getsize(filename) / 1024**2
        print(f" {filename} already exists ({size_mb:.2f} MB)")
        total_size += size_mb

print(f"\nTotal parquet files: {len(parquet_files)}")
print(f"Total size: {total_size:.2f} MB")

# Load and combine all months with Modin
print("Loading and combining all months with Modin...")
start = time.time()

dfs = [mpd.read_parquet(f) for f in parquet_files]
mdf_combined = mpd.concat(dfs, ignore_index=True)

load_time = time.time() - start
print(f"Load time: {load_time:.2f} seconds")
print(f"\nCombined dataset:")
print(f" Records: {len(mdf_combined):,}")
print(f" Columns: {len(mdf_combined.columns)}")

 yellow_tripdata_2023-01.parquet already exists (45.46 MB)
 yellow_tripdata_2023-02.parquet already exists (45.54 MB)

Total parquet files: 2
Total size: 91.00 MB
Loading and combining all months with Modin...
Load time: 7.06 seconds

Combined dataset:
 Records: 5,980,721
 Columns: 20


### Converting Between Modin and pandas

In [11]:
# Convert Modin DataFrame to pandas
print("Converting Modin to pandas...")
start = time.time()
pdf_from_modin = mdf_taxi._to_pandas()
convert_time = time.time() - start

print(f"Conversion time: {convert_time:.2f} seconds")
print(f"Result type: {type(pdf_from_modin)}")

# Convert pandas DataFrame to Modin
print("\nConverting pandas to Modin...")
start = time.time()
mdf_from_pandas = mpd.DataFrame(pdf_taxi)
convert_time = time.time() - start

print(f"Conversion time: {convert_time:.2f} seconds")
print(f"Result type: {type(mdf_from_pandas)}")

Converting Modin to pandas...


Conversion time: 1.48 seconds
Result type: <class 'pandas.core.frame.DataFrame'>

Converting pandas to Modin...
Conversion time: 5.18 seconds
Result type: <class 'modin.pandas.dataframe.DataFrame'>


## Execise 2

### Task 2.1
Using the combined taxi dataset (mdf_combined):
- How many total trips are in the dataset?
- What is the date range of the data (min and max pickup datetime)?
- What percentage of trips have 0 passengers recorded?

In [None]:
#Total Number of Trips
total_trips = mdf_combined.count()
print(f"Total number of trips: {total_trips} \n")

# Date Range
min_pickup = mdf_combined["tpep_pickup_datetime"].min()
max_pickup = mdf_combined["tpep_pickup_datetime"].max()
print(f"Pickup date range: {min_pickup} to {max_pickup}")

# Percentage Trips with 0 Passengers
percentage_zero_passengers = (
    (mdf_combined["passenger_count"] == 0).mean() * 100
)
print("Percentage of trips with 0 passengers:", percentage_zero_passengers)

Total number of trips: VendorID                 5980721
tpep_pickup_datetime     5980721
tpep_dropoff_datetime    5980721
passenger_count          5832161
trip_distance            5980721
RatecodeID               5832161
store_and_fwd_flag       5832161
PULocationID             5980721
DOLocationID             5980721
payment_type             5980721
fare_amount              5980721
extra                    5980721
mta_tax                  5980721
tip_amount               5980721
tolls_amount             5980721
improvement_surcharge    5980721
total_amount             5980721
congestion_surcharge     5832161
airport_fee              2995023
Airport_fee              2837138
dtype: int64 

Pickup date range: 2008-12-31 23:01:42 to 2023-03-07 13:01:28
Percentage of trips with 0 passengers: 1.6459721160709553


## Advanced Modin Operations

### Data Exploration and Cleaning

In [13]:
# Basic Statistics of Dataset
print("Dataset Statistics:")
print("="*50)

# Using single-month dataset 
print(f"\nTotal Trips: {len(mdf_taxi):,}")

# Numeric Columns Summary
print("\n Fare amount statistics:")
fare_stats = mdf_taxi['fare_amount'].describe()
print(fare_stats)

# Check for Missing Values
print("Missing value per column:")
print("="*50)
missing = mdf_taxi.isnull().sum()
missing_pct = (missing / len(mdf_taxi) * 100).round(2)

missing_df = mpd.DataFrame({
    'Missing Count': missing,
    "Missing %": missing_pct
})
print(missing_df[missing_df['Missing Count']>0])

# Data Cleaning - Remove Invalid Records
print("\nCleaning Data...")
print(f"Original records: {len(mdf_taxi):,}")

# Filter valid trips
mdf_clean = mdf_taxi[
    (mdf_taxi['fare_amount'] > 0) &
    (mdf_taxi['trip_distance'] > 0) &
    (mdf_taxi['passenger_count'] > 0)
]

print(f"After cleaning: {len(mdf_clean):,}")
print(f"Removed: {len(mdf_taxi) - len(mdf_clean):,} invalid records")

Dataset Statistics:

Total Trips: 3,066,766

 Fare amount statistics:
count    3.066766e+06
mean     1.836707e+01
std      1.780782e+01
min     -9.000000e+02
25%      8.600000e+00
50%      1.280000e+01
75%      2.050000e+01
max      1.160100e+03
Name: fare_amount, dtype: float64
Missing value per column:
                      Missing Count  Missing %
passenger_count               71743       2.34
RatecodeID                    71743       2.34
store_and_fwd_flag            71743       2.34
congestion_surcharge          71743       2.34
airport_fee                   71743       2.34

Cleaning Data...
Original records: 3,066,766
After cleaning: 2,884,228
Removed: 182,538 invalid records


### GroupBy and Aggregations at Scale

In [17]:
# GroupBy operations on large dataset
print("="*50)
print("GROUPBY OPERATIONS")
print("="*50)

# Average fare by payment type
print("\n1. Average fare by payment type:")
start = time.time()
fare_by_payment = mdf_clean.groupby('payment_type')['fare_amount'].mean()
print(f" Time: {time.time() - start:.2f}s")
print(fare_by_payment.sort_values(ascending=False))

# Multiple Aggregatopms by Vendor
print("\n2. Statistics by vendor:")
start = time.time()
vendor_stats = mdf_clean.groupby('VendorID').agg({
    'fare_amount' : ['mean', 'sum', 'count'],
    'trip_distance' : ['mean', 'max'],
    'tip_amount' : 'mean'
})
print(f" Time: {time.time() - start:.2f}s")
print(vendor_stats)

# Top pickup locations
print("\n3. Top 10 pickup locations by trip count:")
start = time.time()
top_locations = mdf_clean['PULocationID'].value_counts().head(10)
print(f" Time: {time.time() - start:.2f}s")
print(top_locations)

# Fare statistics by pickup location(top 10)
print("\n4. Average fare by top pickup locations:")
start = time.time()
top_location_ids = top_locations.index.tolist()
top_loc_fares = mdf_clean[mdf_clean['PULocationID'].isin(top_location_ids)].groupby('PULocationID')['fare_amount'].mean()
print(f" Time: {time.time() - start:.2f}s")
print(top_loc_fares.sort_values(ascending=False))

GROUPBY OPERATIONS

1. Average fare by payment type:
 Time: 1.14s
payment_type
4    18.849528
2    18.572971
1    18.543035
3    17.436227
Name: fare_amount, dtype: float64

2. Statistics by vendor:
 Time: 0.05s
         fare_amount                       trip_distance           tip_amount
                mean          sum    count          mean       max       mean
VendorID                                                                     
1          17.462771  12913945.93   739513      3.235839    204.10   3.152932
2          18.920187  40578409.09  2144715      3.554463  14098.55   3.485630

3. Top 10 pickup locations by trip count:
 Time: 0.79s
PULocationID
132    152122
237    141109
236    130887
161    129103
186    104813
162    100807
142     94910
230     94226
138     86750
170     83993
Name: count, dtype: int64

4. Average fare by top pickup locations:
 Time: 0.60s
PULocationID
132    60.738717
138    41.487665
230    17.378985
186    15.554415
161    15.258497
162    14.

### Data Transformation and Feature Engineering

In [18]:
# Add derived columns
print('Adding derived columns...')
start = time.time()

# Make a copy for transformations
mdf_features = mdf_clean.copy()

# Calculate tip percentage
mdf_features['tip_percentage'] = (mdf_features['tip_amount'] / mdf_features['fare_amount'] * 100).round(2)

# Calculate fare per mile
mdf_features['fare_per_mile'] = (mdf_features['fare_amount'] / mdf_features['trip_distance']).round(2)

# Extract datetime components
mdf_features['pickup_hour'] = mpd.to_datetime(mdf_features['tpep_pickup_datetime']).dt.hour
mdf_features['pickup_dayofweek'] = mpd.to_datetime(mdf_features['tpep_pickup_datetime']).dt.dayofweek
mdf_features['pickup_date'] = mpd.to_datetime(mdf_features['tpep_pickup_datetime']).dt.date

print(f"Time: {time.time() - start:.2f}s")
print("\nNew columns added: tip_percentage, fare_per_mile, pickup_hour, pickup_dayofweek, pickup_date")

# Preview
print("\nSample of new features:")
mdf_features[['fare_amount', 'tip_amount', 'tip_percentage', 'trip_distance', 'fare_per_mile', 'pickup_hour']].head()

# Analyse patterns by time
print("\nAverage fare by hour of day:")
hourly_stats = mdf_features.groupby('pickup_hour').agg({
    'fare_amount': 'mean',
    'trip_distance': 'mean',
    'tip_percentage': 'mean'
}).round(2)
print(hourly_stats)

# Day of week analysis
print("\nTrips by day of week:")
day_names = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
daily_trips = mdf_features['pickup_dayofweek'].value_counts().sort_index()
for day, count in daily_trips.items():
    print(f" {day_names[day]} : {count:,}")


Adding derived columns...
Time: 0.25s

New columns added: tip_percentage, fare_per_mile, pickup_hour, pickup_dayofweek, pickup_date

Sample of new features:

Average fare by hour of day:
             fare_amount  trip_distance  tip_percentage
pickup_hour                                            
0                  19.80           4.03           20.42
1                  17.82           3.49           20.95
2                  16.72           3.22           20.77
3                  17.74           3.51           20.26
4                  22.25           4.68           17.97
5                  26.46           6.42           16.57
6                  22.15           4.78           18.11
7                  18.92           3.68           37.65
8                  17.43           3.18           20.04
9                  17.61           3.09           19.91
10                 17.73           3.14           19.52
11                 17.41           3.04           19.48
12                 17.76     

### Applying Custom Functions

In [20]:
# Apply custom function to categorise trips
def categorize_trip(distance):
    if  distance < 1:
        return 'Very Short'
    elif distance < 3:
        return 'Short'
    elif distance < 10:
        return 'Medium'
    else:
        return 'Long'

print("Applying custom function to catergorise trips...")
start = time.time()
mdf_features['trip_category'] = mdf_features['trip_distance'].apply(categorize_trip)
print(f"Time: {time.time() - start:.2f}s")

print("\nTrip categories:")
print(mdf_features['trip_category'].value_counts())

# Using apply with lambda functions
print("\nApplying lambda function to calculate total cost with tip...")
start = time.time()
mdf_features['total_with_tip'] = mdf_features.apply(
    lambda row: row['fare_amount'] + row['tip_amount'] + row['tolls_amount'],
    axis=1
)
print(f"Time: {time.time() - start:.2f}s")

print("\nTotal cost statistics:")
print(mdf_features['total_with_tip'].describe())

Applying custom function to catergorise trips...
Time: 0.12s

Trip categories:
trip_category
Short         1462266
Very Short     596061
Medium         588867
Long           237034
Name: count, dtype: int64

Applying lambda function to calculate total cost with tip...
Time: 30.96s

Total cost statistics:
count    2.884228e+06
mean     2.247460e+01
std      2.100029e+01
min      1.000000e-02
25%      1.078000e+01
50%      1.532000e+01
75%      2.390000e+01
max      1.166650e+03
Name: total_with_tip, dtype: float64
