In [36]:
import pandas as pd 

In [37]:
def save_interim(df, name, format="parquet"):
    path = f"/Users/emiliodulay/Documents/1. UCLA/MATH 156/data/{name}.{format}"
    if format == "parquet":
        df.to_parquet(path, engine="fastparquet")
    elif format == "pickle":
        df.to_pickle(path)
    print(f"Saved interim data to {path}")


# Load raw data

In [38]:
train_df = pd.read_csv('/Users/emiliodulay/Documents/1. UCLA/MATH 156/train.csv')
test_df = pd.read_csv('/Users/emiliodulay/Documents/1. UCLA/MATH 156/test.csv')

In [39]:
train_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [40]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  625134 non-null  object 
 1   vendor_id           625134 non-null  int64  
 2   pickup_datetime     625134 non-null  object 
 3   passenger_count     625134 non-null  int64  
 4   pickup_longitude    625134 non-null  float64
 5   pickup_latitude     625134 non-null  float64
 6   dropoff_longitude   625134 non-null  float64
 7   dropoff_latitude    625134 non-null  float64
 8   store_and_fwd_flag  625134 non-null  object 
dtypes: float64(4), int64(2), object(3)
memory usage: 42.9+ MB


# Light Cleaning 

### Clean for Parquet format

In [41]:
df_items = {
    'train': train_df,
    'test': test_df
}

for name, df in df_items.items(): 
    # convert cols to dt
    for col in ['pickup_datetime', 'dropoff_datetime']: 
        if col in df.columns: 
            df[col] = pd.to_datetime(df[col])

    # convert cols to str 
    for col in ['id', 'store_and_fwd_flag']:
        if col in df.columns: 
            df[col] = df[col].astype(str)

    # light cleaning
    for col in ['trip_duration', 'passenger_count']:
        if col in df.columns: 
            df[df[col] > 0]

    # save as interim
    save_interim(df, f"{name}_interim")

# update dict 
train_df = df_items['train']
test_df = df_items['test']

Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/train_interim.parquet
Saved interim data to /Users/emiliodulay/Documents/1. UCLA/MATH 156/data/test_interim.parquet


In [42]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   id                  1458644 non-null  object        
 1   vendor_id           1458644 non-null  int64         
 2   pickup_datetime     1458644 non-null  datetime64[ns]
 3   dropoff_datetime    1458644 non-null  datetime64[ns]
 4   passenger_count     1458644 non-null  int64         
 5   pickup_longitude    1458644 non-null  float64       
 6   pickup_latitude     1458644 non-null  float64       
 7   dropoff_longitude   1458644 non-null  float64       
 8   dropoff_latitude    1458644 non-null  float64       
 9   store_and_fwd_flag  1458644 non-null  object        
 10  trip_duration       1458644 non-null  int64         
dtypes: datetime64[ns](2), float64(4), int64(3), object(2)
memory usage: 122.4+ MB


In [43]:
train_df.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag            object
trip_duration                  int64
dtype: object