pip install pyarrow fastparquet

In [1]:
import pyarrow
import fastparquet
import pandas as pd
# Display configuration: render floats in fixed-point form with two decimals
# (never scientific notation) and allow up to 1000 characters per output line.
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.width = 1000

In [2]:
def perform_EDA(df : pd.DataFrame, filename : str):
    """
    Print a basic exploratory summary of a dataframe to the console.

    Reports, in order: the non-null count per column, the number of
    duplicated rows, the ``DataFrame.info()`` summary, the names of the
    columns that contain nulls, the number of rows with at least one null,
    and the ``int64`` and ``float64`` columns.

    :param df: The Pandas dataframe to explore
    :param filename: The name of the data file; used only as a label in the output
    :return: None
    """
    print(f"{filename}\nNumber of records:")
    print(df.count(),'\n')

    # duplicated() marks every repeat of an earlier row, so its sum equals
    # len(df) - len(df.drop_duplicates()) without building a de-duplicated copy.
    duplicate_count = int(df.duplicated().sum())
    print(f"Number of duplicate records: {duplicate_count}\n" )

    print("Info")
    # info() writes its own report to stdout and returns None; wrapping it in
    # print() would append a stray "None" line to the output.
    df.info()
    print()

    print("Columns with null values")
    print(df.columns[df.isnull().any()].tolist(),'\n')
    rows_with_null_values = df.isnull().any(axis=1).sum()
    print(f"Number of Rows with null values: {rows_with_null_values}\n" )

    integer_column_list = df.select_dtypes(include='int64').columns
    print(f"Integer data type columns: {integer_column_list}\n")
    float_column_list = df.select_dtypes(include='float64').columns
    print(f"Float data type columns: {float_column_list}\n")
    print(f"Finished {filename}")

In [3]:
# Load the trip records from the Parquet file named below, using the
# pyarrow engine; later cells reuse both the file name and the dataframe.
parquet_file_name = "landing_2017_2017_April.parquet"
trips_df = pd.read_parquet(path=parquet_file_name, engine="pyarrow")

In [4]:
# Summary statistics (count, mean, min, quartiles, max, std) for the trips data.
trips_df.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
count,10047135.0,10047135,10047135,10047135.0,10047135.0,10047135.0,10047135.0,10047135.0,10047135.0,10047135.0,10047135.0,10047135.0,10047135.0,10047135.0,10047135.0,10047135.0
mean,1.54,2017-04-16 03:47:47.704476416,2017-04-16 04:04:18.095126272,1.63,2.96,1.05,162.63,160.45,1.34,13.08,0.33,0.5,1.82,0.32,0.3,16.35
min,1.0,2017-04-01 00:00:00,2017-04-01 00:01:36,0.0,0.0,1.0,1.0,1.0,1.0,-390.0,-40.73,-0.5,-391.0,-14.0,-0.3,-398.3
25%,1.0,2017-04-08 10:20:44,2017-04-08 10:34:32,1.0,1.0,1.0,114.0,107.0,1.0,6.5,0.0,0.5,0.0,0.0,0.3,8.5
50%,2.0,2017-04-16 01:46:50,2017-04-16 02:01:36,1.0,1.66,1.0,162.0,162.0,1.0,9.5,0.0,0.5,1.35,0.0,0.3,11.8
75%,2.0,2017-04-23 15:49:56,2017-04-23 16:08:01.500000,2.0,3.1,1.0,233.0,233.0,2.0,14.5,0.5,0.5,2.4,0.0,0.3,17.8
max,2.0,2017-04-30 23:59:59,2017-05-01 23:46:04,9.0,308.02,99.0,265.0,265.0,5.0,538481.03,20.2,41.53,450.0,921.0,1.0,538482.68
std,0.5,,,1.26,3.76,1.06,66.86,70.84,0.5,181.47,0.45,0.04,2.61,1.87,0.01,181.66


In [5]:
# Preview the first five rows of the loaded trip records.
trips_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2017-04-01 00:51:24,2017-04-01 00:51:49,1,0.0,1,N,145,145,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,,
1,1,2017-04-01 00:41:17,2017-04-01 00:55:36,1,3.4,1,N,249,87,1,13.0,0.5,0.5,2.85,0.0,0.3,17.15,,
2,1,2017-04-01 00:23:31,2017-04-01 00:35:17,1,2.5,1,N,163,263,1,10.5,0.5,0.5,2.35,0.0,0.3,14.15,,
3,1,2017-04-01 00:05:31,2017-04-01 00:35:30,1,4.5,1,N,163,7,1,20.5,0.5,0.5,4.35,0.0,0.3,26.15,,
4,1,2017-04-01 00:38:13,2017-04-01 00:54:48,2,4.9,1,N,7,262,1,16.5,0.5,0.5,3.55,0.0,0.3,21.35,,


In [6]:
# Print the EDA summary for the loaded trips dataframe, labelled with its file name.
perform_EDA(trips_df,parquet_file_name)

landing_2017_2017_April.parquet
Number of records:
VendorID                 10047135
tpep_pickup_datetime     10047135
tpep_dropoff_datetime    10047135
passenger_count          10047135
trip_distance            10047135
RatecodeID               10047135
store_and_fwd_flag       10047135
PULocationID             10047135
DOLocationID             10047135
payment_type             10047135
fare_amount              10047135
extra                    10047135
mta_tax                  10047135
tip_amount               10047135
tolls_amount             10047135
improvement_surcharge    10047135
total_amount             10047135
congestion_surcharge            0
airport_fee                     0
dtype: int64 

Number of duplicate records: 2

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10047135 entries, 0 to 10047134
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tp

In [7]:
# For every column, report whether it contains any nulls and how many.
# The null count is computed once per column and the flag is derived from it.
for title in trips_df.columns:
    missing = int(trips_df[title].isnull().sum())
    print(f'{title} : {missing > 0} : {missing}')

VendorID : False : 0
tpep_pickup_datetime : False : 0
tpep_dropoff_datetime : False : 0
passenger_count : False : 0
trip_distance : False : 0
RatecodeID : False : 0
store_and_fwd_flag : False : 0
PULocationID : False : 0
DOLocationID : False : 0
payment_type : False : 0
fare_amount : False : 0
extra : False : 0
mta_tax : False : 0
tip_amount : False : 0
tolls_amount : False : 0
improvement_surcharge : False : 0
total_amount : False : 0
congestion_surcharge : True : 10047135
airport_fee : True : 10047135


In [9]:
# Count how many trip records each vendor (VendorID value) has.
trips_df['VendorID'].value_counts()

VendorID
2    5458431
1    4588704
Name: count, dtype: int64

#2009 vs 2023 column names (removed location columns)
2009['vendor_name', 'Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime', 'Passenger_Count', 'Trip_Distance', 'Rate_Code', 'store_and_forward', 'Payment_Type', 'Fare_Amt', 'surcharge', 'mta_tax', 'Tip_Amt', 'Tolls_Amt', 'Total_Amt']
2023['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee']