# Notebook Summary 

This is a Databricks sample notebook demonstrating how to use the ETIQ library to run data analyses on a Spark dataset. Please note that this is just an example on a large-scale dataset which has not been cleaned.

### Quickstart

  1. Install and import the etiq library with the Spark extension

  2. Login to the dashboard - this way you can send the results to your dashboard instance (Etiq AWS instance if you use the SaaS version). To deploy on your own cloud instance, get in touch (info@etiq.ai)

  3. Create or open a project 
  
### Data Issues


  4. Load the New York Yellow Taxi Trips data
  
  5. Scan for data issues. In this case we limit our scan to ordering issues, i.e. where the pickup time is recorded as occurring after the drop-off time.

In [None]:
# Install the Spark extension for etiq. This will install the etiq base
# package as a dependency.
# NOTE: `%pip` is a notebook magic command, so this cell only runs inside a
# notebook environment, not as plain Python.
%pip install etiq-spark

In [None]:
# Import the spark extensions for etiq
import etiq.spark
import datetime

In [None]:
# Authenticate against the etiq dashboard so that results can be sent to it.
# "<your-key>" is a placeholder: substitute your own key before running.
from etiq import login as etiq_login

etiq_login(
    "https://dashboard.etiq.ai/",
    "<your-key>",
)

In [None]:
# Open the project used by this notebook. The project name embeds today's
# date (ISO format, local time).
today = datetime.date.today()
project = etiq.projects.open(name=f"NYC Yellow Taxi Trips ({today})")


## Load the NY Yellow Taxi Trips Data

### Load the spark data

In [None]:
from pyspark.sql.functions import lit, year

# Load the NY yellow taxi trips data into a Spark dataframe.
# `spark` is the session object provided by the notebook environment.
yellow_taxi_trips = spark.read.load("dbfs:/databricks-datasets/nyctaxi/tables/nyctaxi_yellow")
# Print a preview of the loaded rows.
yellow_taxi_trips.show()

# Get data for trips picked up in 2012 and in 2014.
# The pickup-year expression is built once and reused for both filters.
pickup_year = year('pickup_datetime')
yellow_taxi_trips_2012 = yellow_taxi_trips.filter(pickup_year == lit(2012))
yellow_taxi_trips_2014 = yellow_taxi_trips.filter(pickup_year == lit(2014))

### Create ETIQ datasets

In [None]:
def _build_taxi_dataset(features, name):
    """Build an etiq Spark dataset over a yellow taxi trips dataframe.

    All datasets in this notebook share the same label, categorical columns
    and date columns; only the underlying dataframe and display name differ.

    Args:
        features: Spark dataframe holding the taxi trips to wrap.
        name: Display name given to the resulting etiq dataset.

    Returns:
        Whatever ``etiq.spark.SimpleSparkDatasetBuilder.datasets`` returns
        for the given dataframe.
    """
    # The column lists are created inside the function so that each dataset
    # receives its own list objects rather than sharing them.
    return etiq.spark.SimpleSparkDatasetBuilder.datasets(
        validation_features=features,
        label='tip_amount',
        cat_col=['payment_type', 'rate_code_id', 'store_and_fwd_flag', 'vendor_id'],
        date_col=['dropoff_datetime', 'pickup_datetime'],
        name=name,
    )


# Create an etiq dataset from all the data
yellow_taxi_trips_dataset = _build_taxi_dataset(yellow_taxi_trips, 'NY Yellow Taxi Trips')
# Create an etiq dataset for the data from 2012
yellow_taxi_trips_dataset_2012 = _build_taxi_dataset(yellow_taxi_trips_2012, 'NY Yellow Taxi Trips (2012)')
# Create an etiq dataset for the data from 2014
yellow_taxi_trips_dataset_2014 = _build_taxi_dataset(yellow_taxi_trips_2014, 'NY Yellow Taxi Trips (2014)')

### Create Snapshots

In [None]:
# Snapshot over the full dataset; no model is attached.
snapshot = project.snapshots.create(
    name="Data Issues",
    dataset=yellow_taxi_trips_dataset,
    model=None,
)
# Snapshot pairing the 2012 data (dataset) with the 2014 data (comparison
# dataset) so the two years can be compared; again no model is attached.
snapshot_2012_2014 = project.snapshots.create(
    name="Data Issues 2012/2014",
    dataset=yellow_taxi_trips_dataset_2012,
    comparison_dataset=yellow_taxi_trips_dataset_2014,
    model=None,
)

## Scan Snapshots for Issues

### Scan for Data Issues

In [None]:
# Run the data issues scan on the full-data snapshot.
# The scan is limited to ordering issues, i.e. rows where pickup_datetime is
# recorded as occurring after dropoff_datetime.
# NOTE(review): the empty filter_ids / duplicate_features_subset lists
# presumably switch off the other checks - confirm against the etiq docs.
segments, issues, issue_summary = snapshot.scan_data_issues(
    orderings=[('pickup_datetime', 'dropoff_datetime')],
    filter_ids=[],
    duplicate_features_subset=[],
)

### Scan for feature drift

In [None]:
# Measure drift between 2012 and 2014 on payment type, fare amount and trip
# distance, using the PSI, Jensen-Shannon and Kolmogorov-Smirnov measures.
# NOTE(review): each threshold entry is presumably a [lower, upper] pair,
# with neither bound ignored here - confirm against the etiq docs.
drift_thresholds = {
    "psi": [0.0, 0.15],
    "kolmogorov_smirnov": [0.05, 1.0],
    "jensen_shannon": [0.0, 0.05],
}
drift_segments, drift_issues, drift_issue_summary = snapshot_2012_2014.scan_drift_metrics(
    features=["payment_type", "fare_amount", "trip_distance"],
    drift_measures=["psi", "jensen_shannon", "kolmogorov_smirnov"],
    thresholds=drift_thresholds,
    ignore_lower_threshold=False,
    ignore_upper_threshold=False,
)