In [1]:
# Run a full garbage-collection pass before the large trip-data files are loaded.
# The value echoed below the cell is gc.collect()'s return value.
import gc
gc.collect()



238

In [2]:
!pip install pyarrow



In [3]:
import pandas as pd

In [4]:
# Download the yellow-taxi trip records for January and February 2023.
# NOTE(review): df_feb is not referenced by any later cell visible in this notebook
# (process_february_data re-reads the February file itself) - confirm it is still needed.
jan_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
feb_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"

df = pd.read_parquet(jan_url)
df_feb = pd.read_parquet(feb_url)

In [5]:
# Show the column labels of the January DataFrame
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [6]:
# Number of columns in the January DataFrame
len(df.columns)

19

In [7]:
# Print every column's dtype; the pickup/dropoff columns need to be datetimes
# for the duration arithmetic performed in the next cell.
print(df.dtypes)

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object


In [8]:
# The pickup/dropoff columns are already datetime64[ns] (see the dtypes printed
# above), so they can be subtracted directly without pd.to_datetime().

# Trip duration in minutes, stored as a new 'duration' column on df
df['duration'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# Spread of the trip durations
duration_std = df['duration'].std()
print(f"Standard deviation of trip duration: {duration_std:.2f}")

Standard deviation of trip duration: 42.59


In [9]:
# Size of the unfiltered dataset
total_records = df.shape[0]
print(total_records)

# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive)
df_filtered = df.loc[df['duration'].between(1, 60)]

# Report how many rows were dropped and what share of the data survived
remaining_records = df_filtered.shape[0]
filtered_record = total_records - remaining_records
print(filtered_record)
fraction_remaining = remaining_records / total_records
print(f"Fraction of records remaining: {fraction_remaining:.2%}")

3066766
57593
Fraction of records remaining: 98.12%


In [10]:
from sklearn.feature_extraction import DictVectorizer

# Pickup/dropoff zone IDs cast to strings, so the vectorizer sees them as
# categorical values instead of numbers
df_loc = df_filtered[['PULocationID', 'DOLocationID']].astype(str)

# One {column: value} dict per trip
dict_list = df_loc.to_dict(orient='records')

# Fit the vectorizer on the training rows and encode them in one step
dv = DictVectorizer()
X = dv.fit_transform(dict_list)

# Number of columns in the encoded feature matrix
print(f"Dimensionality of feature matrix: {X.shape[1]}")

Dimensionality of feature matrix: 515


In [11]:
# Run a garbage-collection pass before the model is fitted in the next cell.
# NOTE(review): gc.collect() only reclaims unreachable objects; df, df_loc and
# dict_list are still bound to names here, so they are not released by this call.
import gc
gc.collect()

0

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Target: trip duration in minutes for the filtered January trips
y = df_filtered['duration'].to_numpy()

# Fit an ordinary least-squares model on the encoded location features
lr = LinearRegression().fit(X, y)

# Score the model on the same rows it was trained on
y_pred = lr.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"RMSE on training data: {rmse:.2f}")

RMSE on training data: 7.65


In [13]:
# Run a garbage-collection pass before the February validation data is loaded
# by the cell that follows.
import gc
gc.collect()

0

In [15]:
# Run a garbage-collection pass before process_february_data() below reads the
# February file; gc is also used inside that function.
import gc
gc.collect()

# Import necessary libraries 
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to process February data in a memory-efficient way
def process_february_data(
    dv,
    model,
    feb_path="https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet",
    batch_size=100000,
):
    """Compute the RMSE of ``model`` on trip durations read from a parquet file.

    Only the pickup/dropoff timestamps and location IDs are read. Trips whose
    duration falls outside 1-60 minutes (inclusive) are dropped, and the rest
    are encoded and scored in batches so that only one batch of feature dicts
    is held in memory at a time.

    Parameters
    ----------
    dv : object
        Fitted vectorizer; ``dv.transform(list_of_dicts)`` is called with one
        ``{'PULocationID': str, 'DOLocationID': str}`` dict per trip.
    model : object
        Fitted regressor; ``model.predict(X)`` is called on each encoded batch.
    feb_path : str, optional
        Location passed to ``pd.read_parquet``. Defaults to the February 2023
        yellow-taxi file.
    batch_size : int, optional
        Maximum number of trips scored per batch (default 100000).

    Returns
    -------
    float
        Root mean squared error, in minutes, over all trips that pass the filter.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive, or if no trips remain after
        filtering (the RMSE is undefined in that case).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Read only necessary columns to save memory
    df_feb = pd.read_parquet(
        feb_path,
        columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID']
    )
    print(f"Loaded February data with {len(df_feb)} records")

    # Trip duration in minutes
    duration = (
        pd.to_datetime(df_feb['tpep_dropoff_datetime']) -
        pd.to_datetime(df_feb['tpep_pickup_datetime'])
    ).dt.total_seconds() / 60

    # Filter outliers and keep only what scoring needs. Building fresh objects
    # here avoids an in-place drop on a filtered slice of the frame.
    keep = (duration >= 1) & (duration <= 60)
    locations = df_feb.loc[keep, ['PULocationID', 'DOLocationID']]
    durations = duration[keep].to_numpy()
    n_records = len(durations)
    print(f"After filtering: {n_records} records")

    # Free the raw frame (including the timestamp columns) before batching
    del df_feb, duration, keep
    gc.collect()

    if n_records == 0:
        raise ValueError("No trips remain after filtering; RMSE is undefined")

    # Ceiling division: an exact multiple of batch_size must not add an empty batch
    n_batches = -(-n_records // batch_size)

    sum_squared_errors = 0.0
    for batch_no, start_idx in enumerate(range(0, n_records, batch_size), start=1):
        end_idx = start_idx + batch_size  # slicing clamps this to n_records
        print(f"Processing batch {batch_no}/{n_batches}")

        # Location IDs become strings only per batch, to limit peak memory
        batch = locations.iloc[start_idx:end_idx].astype(str)
        dict_list = batch.to_dict(orient='records')

        # Transform and predict
        X = dv.transform(dict_list)
        y = durations[start_idx:end_idx]
        y_pred = model.predict(X)

        # Update running sum of squared errors
        sum_squared_errors += float(np.sum((y - y_pred) ** 2))

        # Clear memory
        del batch, dict_list, X, y, y_pred
        gc.collect()

    # Every filtered trip was scored exactly once, so divide by n_records
    rmse = np.sqrt(sum_squared_errors / n_records)
    return rmse

# Score the January-trained model and its fitted vectorizer on the February trips
rmse_feb = process_february_data(dv=dv, model=lr)
print(f"RMSE on validation data: {rmse_feb:.2f}")

Loaded February data with 2913955 records
After filtering: 2855951 records
Processing batch 1/29
Processing batch 2/29
Processing batch 3/29
Processing batch 4/29
Processing batch 5/29
Processing batch 6/29
Processing batch 7/29
Processing batch 8/29
Processing batch 9/29
Processing batch 10/29
Processing batch 11/29
Processing batch 12/29
Processing batch 13/29
Processing batch 14/29
Processing batch 15/29
Processing batch 16/29
Processing batch 17/29
Processing batch 18/29
Processing batch 19/29
Processing batch 20/29
Processing batch 21/29
Processing batch 22/29
Processing batch 23/29
Processing batch 24/29
Processing batch 25/29
Processing batch 26/29
Processing batch 27/29
Processing batch 28/29
Processing batch 29/29
RMSE on validation data: 7.81
