In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from geopy.distance import great_circle

In [None]:
# Load the raw Uber trips export; the relative path is resolved against the
# kernel's working directory.
df = pd.read_csv(filepath_or_buffer='../data/raw/uber.csv')

# Quick look at the first rows to confirm the file parsed.
print("Original Data Overview:", df.head(), sep="\n")

In [None]:
# Disabled step, kept for reference: drops the 'Unnamed: 0' and 'key' columns when present.
# unnamed_cols = ['Unnamed: 0', 'key']
# df.drop(columns=[col for col in unnamed_cols if col in df.columns], inplace=True)
# NOTE(review): while this stays disabled, those columns (if the CSV has them)
# are carried through cleaning and into the saved file — confirm that is intended.

display(df.head(n=5))

In [None]:
def clean_uber_data(data):
    """
    Clean the Uber trip data and validate coordinates.

    The input frame is never modified; all work happens on a copy, so the
    cell calling this function can be re-run safely.

    Steps, in order (each one is skipped when its column is absent):
        1. Parse 'pickup_datetime'; unparseable values become NaT.
        2. Drop fully duplicated rows.
        3. Drop rows with a missing 'fare_amount' and cast the column to float.
        4. Replace missing 'passenger_count' values with 0.
        5. Keep only rows whose latitudes lie in [-90, 90] and whose
           longitudes lie in [-180, 180]. Rows with a missing coordinate
           fail the range test and are dropped as well.

    Parameters:
        data (DataFrame): The Uber trip data to clean.

    Returns:
        DataFrame: The cleaned Uber trip data, as an independent frame that
        keeps the index labels of the surviving input rows.
    """
    # Copy first: the original implementation edited the caller's frame in place.
    cleaned = data.copy()

    if 'pickup_datetime' in cleaned.columns:
        cleaned['pickup_datetime'] = pd.to_datetime(cleaned['pickup_datetime'], errors='coerce')

    # De-duplicate after parsing so rows are compared on their parsed timestamps.
    cleaned = cleaned.drop_duplicates()

    if 'fare_amount' in cleaned.columns:
        cleaned = cleaned.dropna(subset=['fare_amount'])
        cleaned = cleaned.assign(fare_amount=cleaned['fare_amount'].astype(float))

    if 'passenger_count' in cleaned.columns:
        cleaned = cleaned.assign(passenger_count=cleaned['passenger_count'].fillna(0))

    # Largest absolute value allowed for each coordinate column, in degrees.
    coordinate_limits = {
        'pickup_latitude': 90,
        'pickup_longitude': 180,
        'dropoff_latitude': 90,
        'dropoff_longitude': 180,
    }
    in_range = pd.Series(True, index=cleaned.index, dtype=bool)
    for column, limit in coordinate_limits.items():
        if column in cleaned.columns:
            # between() is inclusive on both ends and evaluates to False for NaN.
            in_range &= cleaned[column].between(-limit, limit)

    # .copy() hands back a standalone frame, so adding columns to it later
    # does not raise SettingWithCopyWarning.
    return cleaned[in_range].copy()

In [None]:

# Run the cleaning step and keep its result under a separate name for the next stage.
cleaned_uber_data = clean_uber_data(data=df)

print("Cleaned Data Overview:", cleaned_uber_data.head(), sep="\n")

In [None]:
def _great_circle_miles(lat1, lon1, lat2, lon2):
    """Vectorised great-circle distance in miles between coordinate arrays given in degrees."""
    earth_radius_km = 6371.009  # mean Earth radius, the value geopy's great_circle uses
    km_to_miles = 0.621371

    lat1, lon1, lat2, lon2 = (
        np.asarray(values, dtype=float) for values in (lat1, lon1, lat2, lon2)
    )

    # errstate: NaN/inf inputs are expected here and are turned into NaN results below.
    with np.errstate(invalid='ignore'):
        # A row is usable only if both latitudes are real latitudes and both
        # longitudes are finite; NaN fails every comparison.
        valid = (
            (np.abs(lat1) <= 90) & (np.abs(lat2) <= 90)
            & np.isfinite(lon1) & np.isfinite(lon2)
        )

        phi1, phi2 = np.radians(lat1), np.radians(lat2)
        delta_lambda = np.radians(lon2 - lon1)

        sin1, cos1 = np.sin(phi1), np.cos(phi1)
        sin2, cos2 = np.sin(phi2), np.cos(phi2)
        sin_dl, cos_dl = np.sin(delta_lambda), np.cos(delta_lambda)

        # atan2 form of the spherical central angle; stays accurate for very
        # short trips, where the plain arccos form loses precision.
        central_angle = np.arctan2(
            np.hypot(cos2 * sin_dl, cos1 * sin2 - sin1 * cos2 * cos_dl),
            sin1 * sin2 + cos1 * cos2 * cos_dl,
        )

    return np.where(valid, central_angle * earth_radius_km * km_to_miles, np.nan)


def transform_uber_data(data):
    """
    Transform the Uber trip data by creating new features.

    The input frame is not modified. Columns added to the returned copy:
        pickup_hour          hour of day (0-23), when 'pickup_datetime' exists
        pickup_day_of_week   Monday=0 ... Sunday=6, when 'pickup_datetime' exists
        season               'Winter' (Dec-Feb), 'Spring' (Mar-May),
                             'Summer' (Jun-Aug) or 'Fall' (Sep-Nov); missing
                             when the timestamp is missing
        trip_distance_miles  great-circle distance between pickup and dropoff,
                             when all four coordinate columns exist

    'pickup_datetime' may already be datetime-typed or hold parseable strings;
    values that cannot be parsed are treated as missing. Rows with a missing
    coordinate or a latitude outside [-90, 90] get NaN as their distance
    instead of aborting the whole transform.

    Parameters:
        data (DataFrame): The Uber trip data to transform.

    Returns:
        DataFrame: The transformed Uber trip data.
    """
    transformed = data.copy()

    if 'pickup_datetime' in transformed.columns:
        # No-op for datetime columns; makes string timestamps usable with .dt.
        pickup_times = pd.to_datetime(transformed['pickup_datetime'], errors='coerce')
        pickup_month = pickup_times.dt.month

        transformed['pickup_hour'] = pickup_times.dt.hour
        transformed['pickup_day_of_week'] = pickup_times.dt.dayofweek

        season_labels = np.select(
            [
                pickup_month.isin([12, 1, 2]).to_numpy(),
                pickup_month.isin([3, 4, 5]).to_numpy(),
                pickup_month.isin([6, 7, 8]).to_numpy(),
            ],
            ['Winter', 'Spring', 'Summer'],
            default='Fall',
        )
        # A missing timestamp matches none of the month tests and would fall
        # through to 'Fall'; blank those rows out instead.
        transformed['season'] = pd.Series(season_labels, index=transformed.index).where(
            pickup_times.notna().to_numpy()
        )

    # Distance needs both ends of the trip; argument order is lat1, lon1, lat2, lon2.
    coordinate_columns = [
        'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude', 'dropoff_longitude',
    ]
    if all(column in transformed.columns for column in coordinate_columns):
        transformed['trip_distance_miles'] = _great_circle_miles(
            *(
                transformed[column].to_numpy(dtype=float, na_value=np.nan)
                for column in coordinate_columns
            )
        )

    return transformed

In [None]:
# Derive the engineered feature columns from the cleaned trips.
transformed_uber_data = transform_uber_data(data=cleaned_uber_data)

print("Transformed Data Overview:", transformed_uber_data.head(), sep="\n")

In [None]:
transformed_data_path = '../data/processed/uber_data_transformed.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a fresh checkout may not have data/processed yet).
Path(transformed_data_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index carries no information and would otherwise come
# back as an extra unnamed column when the file is read again.
transformed_uber_data.to_csv(transformed_data_path, index=False)

print(f"Transformed Uber data saved to {transformed_data_path}.")

In [None]:
# Read the export back from disk as a sanity check on the saved file.
verified_data = pd.read_csv(filepath_or_buffer=transformed_data_path)

print("Verified Transformed Data Overview:", verified_data.head(), sep="\n")
# If the preview above printed, the saved CSV loaded back successfully — be happy!
#     ∧＿∧
# 　 (｡･ω･｡)つ━☆・*。
#  ⊂/　  /　   ・゜
# 　しーＪ　　　  °。+ * 。　
# 　　　　　         .・゜
# 　　　　　          ゜｡ﾟﾟ･｡･ﾟﾟ。
# 　　　　           　ﾟ。    ｡ﾟ
#                     　ﾟ･｡･ﾟ       
