In [None]:
-- Make AirlinesData the current database for the statements that follow.
USE [AirlinesData];
GO

In [None]:
-- Expose the flight-delay CSV file as an external table through the
-- [SqlStoragePool] data source and the [FileFormat_csv] file format.
-- The catalog check makes the cell re-runnable: the table is created only
-- when [csv].[flightdelays] is not already registered.
IF NOT EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'flightdelays'
      AND schema_id = SCHEMA_ID(N'csv')
)
    CREATE EXTERNAL TABLE [csv].[flightdelays] (
        [Year]              int,
        [Month]             int,
        [DayofMonth]        int,
        [DepTime]           float,
        [CRSDepTime]        int,
        [ArrTime]           float,
        [CRSArrTime]        int,
        [UniqueCarrier]     varchar(2),
        [FlightNum]         int,
        [TailNum]           varchar(10),
        [ActualElapsedTime] float,
        [CRSElapsedTime]    float,
        [Origin]            varchar(5),
        [Dest]              varchar(5),
        [Cancelled]         int,
        [CancellationCode]  varchar(5)
    ) WITH (
        LOCATION = N'/FlightDelays/csv/flightdelays.csv',
        DATA_SOURCE = [SqlStoragePool],
        FILE_FORMAT = [FileFormat_csv]
    );

In [None]:
-- Cleanup helper: removes the [csv].[flightdelays] external table definition.
-- WARNING: later cells select from [csv].[flightdelays]; run this only when
-- you intend to re-create the table, otherwise those cells will fail.
-- Guarded so the cell does not raise an error when the table is already gone.
IF EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'flightdelays'
      AND schema_id = SCHEMA_ID(N'csv')
)
    DROP EXTERNAL TABLE [csv].[flightdelays];

In [None]:
-- Preview up to 100 rows of the CSV-backed external table.
-- No ORDER BY is given, so which rows come back is not guaranteed.
SELECT TOP (100)
    fd.*
FROM [csv].[flightdelays] AS fd;

Copy this table into Parquet files

In [None]:
-- Create the [parquet] schema only when it does not exist yet, so the cell
-- can be re-run. CREATE SCHEMA must be the only statement in its batch,
-- hence the dynamic EXEC behind the guard.
IF NOT EXISTS (SELECT 1 FROM sys.schemas WHERE name = N'parquet')
    EXEC (N'CREATE SCHEMA [parquet]');
GO

In [None]:
-- Register the Parquet external file format used by the parquet-backed table.
-- Guarded so re-running the cell does not fail when the format already exists.
IF NOT EXISTS (
    SELECT 1
    FROM sys.external_file_formats
    WHERE name = N'FileFormat_parquet'
)
    CREATE EXTERNAL FILE FORMAT [FileFormat_parquet]
    WITH (FORMAT_TYPE = PARQUET);

In [None]:
-- External table over the /FlightDelays/parquet location, read through the
-- [SqlStoragePool] data source with the [FileFormat_parquet] file format.
-- Column list mirrors [csv].[flightdelays].
-- The catalog check makes the cell re-runnable: the table is created only
-- when [parquet].[flightdelays] is not already registered.
IF NOT EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'flightdelays'
      AND schema_id = SCHEMA_ID(N'parquet')
)
    CREATE EXTERNAL TABLE [parquet].[flightdelays] (
        [Year]              int,
        [Month]             int,
        [DayofMonth]        int,
        [DepTime]           float,
        [CRSDepTime]        int,
        [ArrTime]           float,
        [CRSArrTime]        int,
        [UniqueCarrier]     varchar(2),
        [FlightNum]         int,
        [TailNum]           varchar(10),
        [ActualElapsedTime] float,
        [CRSElapsedTime]    float,
        [Origin]            varchar(5),
        [Dest]              varchar(5),
        [Cancelled]         int,
        [CancellationCode]  varchar(5)
    ) WITH (
        LOCATION = N'/FlightDelays/parquet',
        DATA_SOURCE = [SqlStoragePool],
        FILE_FORMAT = [FileFormat_parquet]
    );

In [None]:
-- Preview up to 100 rows of the Parquet-backed external table.
-- No ORDER BY is given, so which rows come back is not guaranteed.
SELECT TOP (100)
    fd.*
FROM [parquet].[flightdelays] AS fd;

Copy this data into the Data Pool

In [None]:
USE [AirlinesData];
GO
-- Register the data pool as an external data source, but only when no data
-- source named SqlDataPool is present in sys.external_data_sources.
IF NOT EXISTS (
    SELECT 1
    FROM sys.external_data_sources AS eds
    WHERE eds.name = 'SqlDataPool'
)
    CREATE EXTERNAL DATA SOURCE SqlDataPool
    WITH (
        LOCATION = 'sqldatapool://controller-svc/default'
    );

In [None]:
-- Create the [datapool] schema only when it does not exist yet, so the cell
-- can be re-run. CREATE SCHEMA must be the only statement in its batch,
-- hence the dynamic EXEC behind the guard.
IF NOT EXISTS (SELECT 1 FROM sys.schemas WHERE name = N'datapool')
    EXEC (N'CREATE SCHEMA [datapool]');

In [None]:
-- External table stored via the [SqlDataPool] data source, with rows
-- distributed ROUND_ROBIN. Column list mirrors [csv].[flightdelays] so the
-- CSV data can be copied across column-for-column.
-- The catalog check makes the cell re-runnable: the table is created only
-- when [datapool].[FlightDelays] is not already registered.
IF NOT EXISTS (
    SELECT 1
    FROM sys.external_tables
    WHERE name = N'FlightDelays'
      AND schema_id = SCHEMA_ID(N'datapool')
)
    CREATE EXTERNAL TABLE [datapool].[FlightDelays] (
        [Year]              int,
        [Month]             int,
        [DayofMonth]        int,
        [DepTime]           float,
        [CRSDepTime]        int,
        [ArrTime]           float,
        [CRSArrTime]        int,
        [UniqueCarrier]     varchar(2),
        [FlightNum]         int,
        [TailNum]           varchar(10),
        [ActualElapsedTime] float,
        [CRSElapsedTime]    float,
        [Origin]            varchar(5),
        [Dest]              varchar(5),
        [Cancelled]         int,
        [CancellationCode]  varchar(5)
    ) WITH (
        DATA_SOURCE = [SqlDataPool],
        DISTRIBUTION = ROUND_ROBIN
    );

In [None]:
-- Copy every row of the CSV-backed table into the data pool table.
-- The source is referenced with the exact casing it was created with
-- ([csv].[flightdelays]) so the statement also works under a case-sensitive
-- collation. Columns are listed explicitly, in the target table's column
-- order, so the load does not silently depend on SELECT * ordering.
-- NOTE: this is a plain append; re-running the cell inserts the rows again.
INSERT INTO [datapool].[FlightDelays]
SELECT
    src.[Year],
    src.[Month],
    src.[DayofMonth],
    src.[DepTime],
    src.[CRSDepTime],
    src.[ArrTime],
    src.[CRSArrTime],
    src.[UniqueCarrier],
    src.[FlightNum],
    src.[TailNum],
    src.[ActualElapsedTime],
    src.[CRSElapsedTime],
    src.[Origin],
    src.[Dest],
    src.[Cancelled],
    src.[CancellationCode]
FROM [csv].[flightdelays] AS src;

In [None]:
-- Row count of the data pool table, returned in a single column named n.
SELECT
    COUNT(*) AS n
FROM [datapool].[FlightDelays] AS dp;

In [None]:
-- Send the count query to the SqlDataPool data source for execution there,
-- rather than querying the external table from this instance.
EXECUTE (
    'USE [AirlinesData]; SELECT count(*) FROM datapool.FlightDelays'
) AT DATA_SOURCE SqlDataPool;

Quick Performance Comparison

In [None]:
-- Performance comparison, step 1: count rows through the CSV-backed table.
-- Uses the table name exactly as created ([csv].[flightdelays]) so the query
-- also resolves under a case-sensitive collation.
SELECT COUNT(*)
FROM [csv].[flightdelays];

In [None]:
-- Performance comparison, step 2: count rows through the Parquet-backed table.
-- Uses the table name exactly as created ([parquet].[flightdelays]) so the
-- query also resolves under a case-sensitive collation.
SELECT COUNT(*)
FROM [parquet].[flightdelays];

In [None]:
-- Performance comparison, step 3: count rows through the data pool table.
SELECT
    COUNT(*)
FROM [datapool].[FlightDelays] AS dp;

Join between external tables

In [None]:
-- Join the Parquet-backed flights to the CSV-backed airline lookup and show
-- up to 20 flights for the carrier whose IATA code is 'F9'.
-- The join condition stays in ON; the carrier filter is a row filter and
-- lives in WHERE (same result for an INNER JOIN, clearer intent).
-- The flights table is referenced with the casing it was created with so the
-- query also resolves under a case-sensitive collation.
-- No ORDER BY is given, so which 20 rows come back is not guaranteed.
SELECT TOP (20)
    pfd.[Year],
    pfd.[Month],
    pfd.[DayofMonth],
    pfd.[UniqueCarrier],
    a.[Name] AS AirlineName,
    pfd.[Origin],
    pfd.[Dest]
FROM [parquet].[flightdelays] AS pfd
INNER JOIN [csv].[airlines] AS a
    ON pfd.UniqueCarrier = a.IATA
WHERE a.IATA = 'F9';