In [10]:
from pathlib import Path

import numpy as np
import pandas as pd
from geopy.distance import great_circle

In [11]:
# Read the raw Uber trip export (relative path) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/raw/uber.csv')

# Show the heading and the first five rows, one after the other.
print("Original Data Overview:", df.head(), sep="\n")

Original Data Overview:
   Unnamed: 0                            key  fare_amount  \
0    24238194    2015-05-07 19:52:06.0000003          7.5   
1    27835199    2009-07-17 20:04:56.0000002          7.7   
2    44984355   2009-08-24 21:45:00.00000061         12.9   
3    25894730    2009-06-26 08:22:21.0000001          5.3   
4    17610152  2014-08-28 17:47:00.000000188         16.0   

           pickup_datetime  pickup_longitude  pickup_latitude  \
0  2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1  2009-07-17 20:04:56 UTC        -73.994355        40.728225   
2  2009-08-24 21:45:00 UTC        -74.005043        40.740770   
3  2009-06-26 08:22:21 UTC        -73.976124        40.790844   
4  2014-08-28 17:47:00 UTC        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  
0         -73.999512         40.723217                1  
1         -73.994710         40.750325                1  
2         -73.962565         40.772647      

In [12]:
# Columns to discard before cleaning.
unnamed_cols = ['Unnamed: 0', 'key']

# errors='ignore' skips any listed column that is not present, so only the
# columns that actually exist are removed (in place).
df.drop(columns=unnamed_cols, errors='ignore', inplace=True)

display(df.head())

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [13]:
def clean_uber_data(data):
    """
    Clean the Uber trip data and validate coordinates.

    The input frame is never modified: every step produces a new frame, so
    the caller's DataFrame keeps its original rows and dtypes.

    Steps, in order:
        1. Parse ``pickup_datetime`` (when present); values that cannot be
           parsed become ``NaT``.
        2. Drop fully duplicated rows.
        3. Drop rows with a missing ``fare_amount`` and cast the column to
           float (when present).
        4. Replace missing ``passenger_count`` values with 0 (when present).
        5. Keep only rows whose pickup and dropoff latitudes lie in
           [-90, 90] and longitudes in [-180, 180]. A missing coordinate
           fails the range test, so such rows are dropped too.

    Parameters:
        data (DataFrame): The Uber trip data to clean. Must contain
            ``pickup_latitude``, ``pickup_longitude``, ``dropoff_latitude``
            and ``dropoff_longitude``.

    Returns:
        DataFrame: The cleaned Uber trip data.

    Raises:
        KeyError: If one of the four coordinate columns is missing.
    """
    cleaned = data

    # Parse timestamps first so duplicate detection compares parsed values.
    if 'pickup_datetime' in cleaned.columns:
        cleaned = cleaned.assign(
            pickup_datetime=pd.to_datetime(cleaned['pickup_datetime'], errors='coerce')
        )

    cleaned = cleaned.drop_duplicates()

    if 'fare_amount' in cleaned.columns:
        cleaned = cleaned.dropna(subset=['fare_amount'])
        cleaned = cleaned.assign(fare_amount=cleaned['fare_amount'].astype(float))

    if 'passenger_count' in cleaned.columns:
        cleaned = cleaned.assign(passenger_count=cleaned['passenger_count'].fillna(0))

    # Series.between is inclusive on both ends and evaluates to False for NaN.
    in_bounds = (
        cleaned['pickup_latitude'].between(-90, 90)
        & cleaned['pickup_longitude'].between(-180, 180)
        & cleaned['dropoff_latitude'].between(-90, 90)
        & cleaned['dropoff_longitude'].between(-180, 180)
    )
    return cleaned[in_bounds]

In [14]:

# Run the cleaning step on the loaded trips and keep the returned frame.
cleaned_uber_data = clean_uber_data(df)

# Show the heading and the first five cleaned rows.
print("Cleaned Data Overview:", cleaned_uber_data.head(), sep="\n")

Cleaned Data Overview:
   fare_amount           pickup_datetime  pickup_longitude  pickup_latitude  \
0          7.5 2015-05-07 19:52:06+00:00        -73.999817        40.738354   
1          7.7 2009-07-17 20:04:56+00:00        -73.994355        40.728225   
2         12.9 2009-08-24 21:45:00+00:00        -74.005043        40.740770   
3          5.3 2009-06-26 08:22:21+00:00        -73.976124        40.790844   
4         16.0 2014-08-28 17:47:00+00:00        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  
0         -73.999512         40.723217                1  
1         -73.994710         40.750325                1  
2         -73.962565         40.772647                1  
3         -73.965316         40.803349                3  
4         -73.973082         40.761247                5  


In [15]:
# Month number -> season label (meteorological seasons, northern hemisphere).
_SEASON_BY_MONTH = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}

# Mean Earth radius in km; intended to match the radius geopy's great_circle
# uses so distances agree with the previous row-wise implementation.
_EARTH_RADIUS_KM = 6371.009
_KM_TO_MILES = 0.621371


def _great_circle_miles(lat1_deg, lon1_deg, lat2_deg, lon2_deg):
    """Return the great-circle distance in miles between coordinate arrays.

    All four arguments are array-likes of degrees with equal length. The
    central angle uses the atan2 form of the spherical distance formula,
    which stays accurate for both very short and near-antipodal trips.
    """
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (lat1_deg, lon1_deg, lat2_deg, lon2_deg)
    )
    delta_lon = lon2 - lon1
    sin_lat1, cos_lat1 = np.sin(lat1), np.cos(lat1)
    sin_lat2, cos_lat2 = np.sin(lat2), np.cos(lat2)
    cos_delta, sin_delta = np.cos(delta_lon), np.sin(delta_lon)

    opposite = np.hypot(
        cos_lat2 * sin_delta,
        cos_lat1 * sin_lat2 - sin_lat1 * cos_lat2 * cos_delta,
    )
    adjacent = sin_lat1 * sin_lat2 + cos_lat1 * cos_lat2 * cos_delta
    central_angle = np.arctan2(opposite, adjacent)
    return central_angle * _EARTH_RADIUS_KM * _KM_TO_MILES


def transform_uber_data(data):
    """
    Transform the Uber trip data by creating new features.

    Works on a copy; the input frame is left untouched.

    Features added when ``pickup_datetime`` is present:
        pickup_hour: Hour of day (0-23).
        pickup_day_of_week: Monday=0 ... Sunday=6.
        season: 'Winter' (Dec-Feb), 'Spring' (Mar-May), 'Summer' (Jun-Aug)
            or 'Fall' (Sep-Nov). Rows with a missing/unparseable timestamp
            get a missing season rather than being labelled 'Fall'.

    Feature added when all four pickup/dropoff coordinate columns are present:
        trip_distance_miles: Great-circle distance between pickup and
            dropoff, computed for the whole column at once. A missing
            coordinate yields NaN for that row.

    Parameters:
        data (DataFrame): The Uber trip data to transform.

    Returns:
        DataFrame: The transformed Uber trip data.
    """
    data = data.copy()

    if 'pickup_datetime' in data.columns:
        # Coerce locally so string timestamps (e.g. a frame reloaded from
        # CSV) work too; the stored column itself is not altered.
        pickup_ts = pd.to_datetime(data['pickup_datetime'], errors='coerce')

        # ONLY HOUR INDICATOR
        data['pickup_hour'] = pickup_ts.dt.hour

        # DAY OF WEEK INDICATOR (mon= 0, sun= 6)
        data['pickup_day_of_week'] = pickup_ts.dt.dayofweek

        # SEASON INDICATOR (NaT months map to a missing value)
        data['season'] = pickup_ts.dt.month.map(_SEASON_BY_MONTH)

    # MUST HAVE PICKUP AND DROP OFF COORDS TO CALC
    coordinate_columns = (
        'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude', 'dropoff_longitude',
    )
    if all(column in data.columns for column in coordinate_columns):
        data['trip_distance_miles'] = _great_circle_miles(
            data['pickup_latitude'],
            data['pickup_longitude'],
            data['dropoff_latitude'],
            data['dropoff_longitude'],
        )

    return data

In [16]:
# Derive the engineered features from the cleaned trips.
transformed_uber_data = transform_uber_data(cleaned_uber_data)

# Show the heading and the first five transformed rows.
print("Transformed Data Overview:", transformed_uber_data.head(), sep="\n")

Transformed Data Overview:
   fare_amount           pickup_datetime  pickup_longitude  pickup_latitude  \
0          7.5 2015-05-07 19:52:06+00:00        -73.999817        40.738354   
1          7.7 2009-07-17 20:04:56+00:00        -73.994355        40.728225   
2         12.9 2009-08-24 21:45:00+00:00        -74.005043        40.740770   
3          5.3 2009-06-26 08:22:21+00:00        -73.976124        40.790844   
4         16.0 2014-08-28 17:47:00+00:00        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  pickup_hour  \
0         -73.999512         40.723217                1           19   
1         -73.994710         40.750325                1           20   
2         -73.962565         40.772647                1           21   
3         -73.965316         40.803349                3            8   
4         -73.973082         40.761247                5           17   

   pickup_day_of_week  season  trip_distance_miles  
0           

In [17]:
# Destination for the engineered dataset (relative path).
transformed_data_path = '../data/processed/uber_data_transformed.csv'

# The output directory may not exist yet (e.g. on a fresh checkout); create
# it so the write below does not fail with a missing-directory error.
Path(transformed_data_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the file.
transformed_uber_data.to_csv(transformed_data_path, index=False)

print(f"Transformed Uber data saved to {transformed_data_path}.")

Transformed Uber data saved to ../data/processed/uber_data_transformed.csv.


In [18]:
# Reload the file that was just written to confirm the save round-trips.
verified_data = pd.read_csv(transformed_data_path)

# Fail loudly if rows or columns were lost or reordered on the way to disk,
# instead of relying on an eyeball check of the preview.
assert verified_data.shape == transformed_uber_data.shape, (
    f"Reloaded shape {verified_data.shape} does not match "
    f"in-memory shape {transformed_uber_data.shape}"
)
assert list(verified_data.columns) == list(transformed_uber_data.columns), (
    "Reloaded columns do not match the transformed data's columns"
)

print("Verified Transformed Data Overview:")
print(verified_data.head())
# IF THE PREVIEW PRINTS BELOW WITHOUT AN ERROR, BE HAPPY
#     ∧＿∧
# 　 (｡･ω･｡)つ━☆・*。
#  ⊂/　  /　   ・゜
# 　しーＪ　　　  °。+ * 。　
# 　　　　　         .・゜
# 　　　　　          ゜｡ﾟﾟ･｡･ﾟﾟ。
# 　　　　           　ﾟ。    ｡ﾟ
#                     　ﾟ･｡･ﾟ       


Verified Transformed Data Overview:
   fare_amount            pickup_datetime  pickup_longitude  pickup_latitude  \
0          7.5  2015-05-07 19:52:06+00:00        -73.999817        40.738354   
1          7.7  2009-07-17 20:04:56+00:00        -73.994355        40.728225   
2         12.9  2009-08-24 21:45:00+00:00        -74.005043        40.740770   
3          5.3  2009-06-26 08:22:21+00:00        -73.976124        40.790844   
4         16.0  2014-08-28 17:47:00+00:00        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  pickup_hour  \
0         -73.999512         40.723217                1           19   
1         -73.994710         40.750325                1           20   
2         -73.962565         40.772647                1           21   
3         -73.965316         40.803349                3            8   
4         -73.973082         40.761247                5           17   

   pickup_day_of_week  season  trip_distance_miles