### Settings

In [1]:
import pandas as pd
from pyspark.sql import SparkSession

from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder.appName("tutorial").getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001C17AF52320>


#### Reading JSON

In [3]:
df_json = spark.read.format('json') \
                .option('inferSchema', True) \
                .option('header', True) \
                .load('data-Drivers.json')

In [4]:
df_json.show(5)

+----+----------+--------+----------+--------------------+-----------+------+--------------------+
|code|       dob|driverId| driverRef|                name|nationality|number|                 url|
+----+----------+--------+----------+--------------------+-----------+------+--------------------+
| HAM|1985-01-07|       1|  hamilton|   {Lewis, Hamilton}|    British|    44|http://en.wikiped...|
| HEI|1977-05-10|       2|  heidfeld|    {Nick, Heidfeld}|     German|    \N|http://en.wikiped...|
| ROS|1985-06-27|       3|   rosberg|     {Nico, Rosberg}|     German|     6|http://en.wikiped...|
| ALO|1981-07-29|       4|    alonso|  {Fernando, Alonso}|    Spanish|    14|http://en.wikiped...|
| KOV|1981-10-19|       5|kovalainen|{Heikki, Kovalainen}|    Finnish|    \N|http://en.wikiped...|
+----+----------+--------+----------+--------------------+-----------+------+--------------------+
only showing top 5 rows



#### Reading CSV

In [5]:
df = (spark.read.csv("data-BigMart.csv", header=True, inferSchema=True))

In [6]:
df.show(5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

In [7]:
df.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



##### DDL Schema

In [8]:
ddl_schema = '''
                Item_Identifier STRING,
                Item_Weight STRING,
                Item_Fat_Content STRING,
                Item_Visibility DOUBLE,
                Item_Type STRING,
                Item_MRP DOUBLE,
                Outlet_Identifier STRING,
                Outlet_Establishment_Year INT,
                Outlet_Size STRING,
                Outlet_Location_Type STRING,
                Outlet_Type STRING,
                Item_Outlet_Sales DOUBLE
'''

In [9]:
df2 = spark.read.format('csv') \
                .schema(ddl_schema) \
                .option('header', True) \
                .load('data-BigMart.csv')

In [10]:
df2.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



##### StructType() Schema

In [11]:
struct_schema = StructType([
    StructField('Item_Identifier', StringType(), True),
    StructField('Item_Weight', StringType(), True),
    StructField('Item_Fat_Content', StringType(), True),
    StructField('Item_Visibility', StringType(), True),
    StructField('Item_Type', StringType(), True),
    StructField('Item_MRP', StringType(), True),
    StructField('Outlet_Identifier', StringType(), True),
    StructField('Outlet_Establishment_Year', StringType(), True),
    StructField('Outlet_Size', StringType(), True),
    StructField('Outlet_Location_Type', StringType(), True),
    StructField('Outlet_Type', StringType(), True),
    StructField('Item_Outlet_Sales', StringType(), True)
])

In [12]:
df3 = spark.read.format('csv') \
                .schema(struct_schema) \
                .option('header', True) \
                .load('data-BigMart.csv')

In [13]:
df3.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: string (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: string (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: string (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: string (nullable = true)



### Data Transformation - Level 1

#### Select

In [14]:
df.select('Item_Identifier', 'Item_Weight', 'Item_Fat_Content').show(5)

+---------------+-----------+----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|
+---------------+-----------+----------------+
|          FDA15|        9.3|         Low Fat|
|          DRC01|       5.92|         Regular|
|          FDN15|       17.5|         Low Fat|
|          FDX07|       19.2|         Regular|
|          NCD19|       8.93|         Low Fat|
+---------------+-----------+----------------+
only showing top 5 rows



In [15]:
df.select(col('Item_Identifier'), col('Item_Weight'), col('Item_Fat_Content')).show(5)

+---------------+-----------+----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|
+---------------+-----------+----------------+
|          FDA15|        9.3|         Low Fat|
|          DRC01|       5.92|         Regular|
|          FDN15|       17.5|         Low Fat|
|          FDX07|       19.2|         Regular|
|          NCD19|       8.93|         Low Fat|
+---------------+-----------+----------------+
only showing top 5 rows



#### Alias

In [16]:
df.select(col('Item_Identifier').alias('Item_ID')).show(5)

+-------+
|Item_ID|
+-------+
|  FDA15|
|  DRC01|
|  FDN15|
|  FDX07|
|  NCD19|
+-------+
only showing top 5 rows



#### Filter / Where

##### 1. Filter the data with fat content = Regular

In [17]:
df.where(col('Item_Fat_Content') == 'Regular').show(5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
|          FDX07|       19.2|         Regular|            0.0|Fruits and Vegeta...| 182.095|           OUT010|                     1998|       NULL|              Tier 3|    Gro

In [18]:
df.filter(col('Item_Fat_Content') == 'Regular').toPandas().sample(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
2266,FDJ58,15.6,Regular,0.176244,Snack Foods,173.6764,OUT010,1998,,Tier 3,Grocery Store,515.3292
1103,FDL12,,Regular,0.212963,Baking Goods,59.522,OUT019,1985,Small,Tier 1,Grocery Store,299.61
1125,FDE11,17.7,Regular,0.0,Starchy Foods,183.5924,OUT035,2004,Small,Tier 2,Supermarket Type1,7033.5112
1685,FDC59,16.7,Regular,0.0,Starchy Foods,63.6168,OUT049,1999,Medium,Tier 1,Supermarket Type1,1342.2528
203,FDE05,,Regular,0.0,Frozen Foods,145.2102,OUT019,1985,Small,Tier 1,Grocery Store,437.4306


##### 2. Slice the data with item type = Soft Drinks and weight < 10

In [19]:
df.filter( (col('Item_Type') == 'Soft Drinks') & (col('Item_Weight') < 10) ).show(5)

+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          DRC01|       5.92|         Regular|    0.019278216|Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
|          DRZ11|       8.85|         Regular|    0.113123893|Soft Drinks|122.5388|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|        1609.9044|
|          DRF4

##### 3. Fetch the data with Tier in (1 or 2) and Outlet Size is Null

In [20]:
df.filter( (col('Outlet_Size').isNull()) & (col('Outlet_Location_Type').isin('Tier 1', 'Tier 2')) ).show(5)

+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|         Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDH17|       16.2|         Regular|    0.016687114|      Frozen Foods| 96.9726|           OUT045|                     2002|       NULL|              Tier 2|Supermarket Type1|        1076.5986|
|          FDU28|       19.2|         Regular|     0.09444959|      Frozen Foods|187.8214|           OUT017|                     2007|       NULL|              Tier 2|Supermarket Type1

#### withColumnRenamed / withColumn

In [21]:
df.withColumnRenamed('Item_Weight', 'Item_Wt').show(2)

+---------------+-------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Wt|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|    9.3|         Low Fat|    0.016047301|      Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|   5.92|         Regular|    0.019278216|Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
+---------------+-------+----------

In [22]:
df.withColumnsRenamed({'Item_Identifier': 'Item_Id', 'Item_Weight': 'Item_Wt'}).show(2)

+-------+-------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Id|Item_Wt|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+-------+-------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|  FDA15|    9.3|         Low Fat|    0.016047301|      Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|  DRC01|   5.92|         Regular|    0.019278216|Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
+-------+-------+----------------+---------------+-----------+--------+----

In [23]:
df.withColumn('flag', lit("new")).show(2)

+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|flag|
+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----+
|          FDA15|        9.3|         Low Fat|    0.016047301|      Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138| new|
|          DRC01|       5.92|         Regular|    0.019278216|Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4

In [24]:
df.withColumn('multiply', col('Item_Weight') * col('Item_MRP')).show(2)

+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+------------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|          multiply|
+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+------------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|      Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2323.2255600000003|
|          DRC01|       5.92|         Regular|    0.019278216|Soft Drinks| 48.2692|           OUT018|                     2009|     Medi

In [25]:
(
    df.withColumn('Item_Fat_Content', upper(col('Item_Fat_Content')))
    .withColumn('Item_Fat_Content', regexp_replace(col('Item_Fat_Content'), 'REGULAR', 'REG'))
    .withColumn('Item_Fat_Content', regexp_replace(col('Item_Fat_Content'), 'LOW FAT', 'LF'))
    #.select('Item_Fat_Content').distinct() \
    .show(5)
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|              LF|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|             REG|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

#### Type Casting

In [26]:
df.withColumn('Item_Weight', col('Item_Weight').cast(StringType())).printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



#### Sort

In [27]:
df.sort(col('Item_Weight').desc()).show(10)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDC02|      21.35|         Low Fat|    0.069102831|              Canned|259.9278|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|        6768.5228|
|          FDC02|      21.35|         Low Fat|    0.115194717|              Canned|258.3278|           OUT010|                     1998|       NULL|              Tier 3|    Gro

In [28]:
df.sort(col('Item_Visibility').desc()).show(10)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|  Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-------------+-----------------+
|          FDU13|       NULL|         low fat|    0.328390948|              Canned|146.0418|           OUT019|                     1985|      Small|              Tier 1|Grocery Store|         588.5672|
|          NCZ18|       NULL|         Low Fat|    0.325780807|           Household|252.7698|           OUT019|                     1985|      Small|              Tier 1|Grocery Store|         

In [29]:
df.sort(['Item_Weight', 'Item_Visibility'], ascending=[0, 1]).show(10)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDC02|      21.35|         Low Fat|    0.068765205|              Canned|260.4278|           OUT013|                     1987|       High|              Tier 3|Supermarket Type1|        3644.5892|
|          FDC02|      21.35|         Low Fat|    0.068809463|              Canned|258.5278|           OUT035|                     2004|      Small|              Tier 2|Superma

#### Drop

In [30]:
df.drop('Item_Visibility').show(5)

+---------------+-----------+----------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
|          FDN15|       17.5|         Low Fat|    

In [31]:
df.drop('Item_Visibility', 'Item_Type').show(5)

+---------------+-----------+----------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         443.4228|
|          FDN15|       17.5|         Low Fat| 141.618|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|     

In [32]:
(df
 .dropDuplicates(subset=['Item_Type'])
 #.count()
 .show()
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDP36|     10.395|         Regular|            0.0|        Baking Goods| 51.4008|           OUT018|                     2009|     Medium|              Tier 3|Supermarket Type2|         556.6088|
|          FDO23|      17.85|         Low Fat|            0.0|              Breads| 93.1436|           OUT045|                     2002|       NULL|              Tier 2|Superma

### Data Transformation - Level 2

#### Union / unionByName

In [33]:
data1 = [
    ('1', 'Shohei', 'LAD'),
    ('2', 'Judge', 'NYY'),
    ('3', 'Adley', 'BAL')
]
schema1 = 'id STRING, name STRING, team STRING'

df1 = spark.createDataFrame(data1, schema1)

data2 = [
    ('4', 'Doncic', 'LAL'),
    ('5', 'Curry', 'GSW'),
    ('6', 'Bryant', 'LAL')
]
schema2 = 'id STRING, name STRING, team STRING'

df2 = spark.createDataFrame(data2, schema2)

In [34]:
df1.union(df2).show()

+---+------+----+
| id|  name|team|
+---+------+----+
|  1|Shohei| LAD|
|  2| Judge| NYY|
|  3| Adley| BAL|
|  4|Doncic| LAL|
|  5| Curry| GSW|
|  6|Bryant| LAL|
+---+------+----+



In [35]:
data1 = [
    ('Shohei', 'LAD', '1'),
    ('Judge', 'NYY', '2'),
    ('Adley', 'BAL', '3')
]
schema1 = 'name STRING, team STRING, id STRING'

df1 = spark.createDataFrame(data1, schema1)

df1.show()

+------+----+---+
|  name|team| id|
+------+----+---+
|Shohei| LAD|  1|
| Judge| NYY|  2|
| Adley| BAL|  3|
+------+----+---+



In [36]:
data2 = [
    ('LAL', '4', 'Doncic'),
    ('GSW', '5', 'Curry'),
    ('LAL', '6', 'Bryant')
]
schema2 = 'team STRING, id STRING, name STRING'

df2 = spark.createDataFrame(data2, schema2)

df2.show()

+----+---+------+
|team| id|  name|
+----+---+------+
| LAL|  4|Doncic|
| GSW|  5| Curry|
| LAL|  6|Bryant|
+----+---+------+



In [37]:
df1.union(df2).show()

+------+----+------+
|  name|team|    id|
+------+----+------+
|Shohei| LAD|     1|
| Judge| NYY|     2|
| Adley| BAL|     3|
|   LAL|   4|Doncic|
|   GSW|   5| Curry|
|   LAL|   6|Bryant|
+------+----+------+



In [38]:
df1.unionByName(df2).show()

+------+----+---+
|  name|team| id|
+------+----+---+
|Shohei| LAD|  1|
| Judge| NYY|  2|
| Adley| BAL|  3|
|Doncic| LAL|  4|
| Curry| GSW|  5|
|Bryant| LAL|  6|
+------+----+---+



#### String Functions

In [39]:
df.select(initcap('Item_Type').alias('item_type')).distinct().show(truncate=False)

+---------------------+
|item_type            |
+---------------------+
|Starchy Foods        |
|Baking Goods         |
|Breads               |
|Meat                 |
|Hard Drinks          |
|Soft Drinks          |
|Household            |
|Breakfast            |
|Dairy                |
|Snack Foods          |
|Others               |
|Fruits And Vegetables|
|Seafood              |
|Health And Hygiene   |
|Canned               |
|Frozen Foods         |
+---------------------+



#### Date Functions

In [40]:
(df
 .withColumn('curr_date', current_date())
 .withColumn('week_after', date_add('curr_date', 7))
 .withColumn('week_before', date_sub('curr_date', 7))
 .withColumn('fortnight_before', date_format(date_add('curr_date', -14), 'dd.MM.yy'))
 .withColumn('ddiff', date_diff('week_after', 'curr_date'))
 .show(5)
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+-----------+----------------+-----+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales| curr_date|week_after|week_before|fortnight_before|ddiff|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+----------+----------+-----------+----------------+-----+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|2025-0

#### Handling Nulls

In [41]:
(
    df.select([
        sum(col(c).isNull().cast("int"))
        .alias(c) 
        for c in df.columns
    ])
    .show()
)

+---------------+-----------+----------------+---------------+---------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|              0|       1463|               0|              0|        0|       0|                0|                        0|       2410|                   0|          0|                0|
+---------------+-----------+----------------+---------------+---------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+



In [42]:
(df
 .dropna(how='any')
 .count()
 #.show()
)

4650

In [43]:
(df
 .dropna(subset=['Item_Weight'])
 .count()
 #.show()
)

7060

In [44]:
(df
 .fillna('Unknown', subset=['Outlet_Size'])
 .na.fill(0, subset=['Item_Weight'])
 .show(5)
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

#### Split and Indexing / Explode / Array Contains

In [45]:
df.withColumn('Outlet_Type', split('Outlet_Type', ' ')[1]).show(5)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|      Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|      Type2|         443.4228|


In [46]:
(df
 .withColumn('Outlet_Type', split('Outlet_Type', ' '))
 .withColumn('Outlet_Type', explode('Outlet_Type'))
 .show(6)
)

+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|      Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket|         3735.138|
|          FDA15|        9.3|         Low Fat|    0.016047301|      Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|      Type1|         3735.138|
|          DRC01|       5.92|         Regular

In [47]:
(df
 .withColumn('Outlet_Type', split('Outlet_Type', ' '))
 .withColumn('Type1_Flag', array_contains('Outlet_Type', 'Type1'))
 .show(5)
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+----------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|         Outlet_Type|Item_Outlet_Sales|Type1_Flag|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+--------------------+-----------------+----------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|[Supermarket, Type1]|         3735.138|      true|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|          

#### Group By

In [48]:
(df.groupBy('Item_Type')
 .agg(round(sum('Item_MRP'), 2).alias('Total_MRP'))
 .sort(col('Total_MRP').desc())
 .show(5, truncate=False)
)

+---------------------+---------+
|Item_Type            |Total_MRP|
+---------------------+---------+
|Fruits and Vegetables|178124.08|
|Snack Foods          |175433.92|
|Household            |135976.53|
|Frozen Foods         |118558.88|
|Dairy                |101276.46|
+---------------------+---------+
only showing top 5 rows



In [49]:
(df.fillna('Unknown', subset=['Outlet_Size'])
 .groupBy('Item_Type', 'Outlet_Size')
 .agg(round(sum('Item_MRP'), 2).alias('Total_MRP'))
 .sort(col('Total_MRP').desc())
 .show(10, truncate=False)
)

+---------------------+-----------+---------+
|Item_Type            |Outlet_Size|Total_MRP|
+---------------------+-----------+---------+
|Snack Foods          |Medium     |60701.91 |
|Fruits and Vegetables|Medium     |59047.22 |
|Fruits and Vegetables|Unknown    |49758.73 |
|Fruits and Vegetables|Small      |48646.79 |
|Snack Foods          |Small      |48357.89 |
|Snack Foods          |Unknown    |48143.24 |
|Household            |Medium     |42688.57 |
|Household            |Small      |39569.12 |
|Household            |Unknown    |38567.79 |
|Frozen Foods         |Medium     |38512.62 |
+---------------------+-----------+---------+
only showing top 10 rows



In [50]:
(df.groupBy('Item_Type')
 .agg(
     round(sum('Item_MRP'), 2).alias('Total_MRP'),
     round(avg('Item_MRP'), 2).alias('Average_MRP'),
    )
 .sort(col('Total_MRP').desc())
 .show(5, truncate=False)
)

+---------------------+---------+-----------+
|Item_Type            |Total_MRP|Average_MRP|
+---------------------+---------+-----------+
|Fruits and Vegetables|178124.08|144.58     |
|Snack Foods          |175433.92|146.19     |
|Household            |135976.53|149.42     |
|Frozen Foods         |118558.88|138.5      |
|Dairy                |101276.46|148.5      |
+---------------------+---------+-----------+
only showing top 5 rows



### Data Transformation - Level 3

#### Collect List

In [51]:
data1 = [
    ('Shohei Ohtani', 'MLB'),
    ('Aaron Judge', 'MLB'),
    ('Juan Soto', 'MLB'),
    ('Lamar Jackson', 'NFL'),
    ('Josh Allen', 'NFL'),
    ('Kobe Bryant', 'NBA'),
    ('Luka Doncic', 'NBA')
]
schema1 = 'player STRING, league STRING'

df1 = spark.createDataFrame(data1, schema1)

df1.show()

+-------------+------+
|       player|league|
+-------------+------+
|Shohei Ohtani|   MLB|
|  Aaron Judge|   MLB|
|    Juan Soto|   MLB|
|Lamar Jackson|   NFL|
|   Josh Allen|   NFL|
|  Kobe Bryant|   NBA|
|  Luka Doncic|   NBA|
+-------------+------+



In [52]:
(df1
 .groupBy('league')
 .agg(collect_list('player').alias('players'))
 .show(truncate=False)
)

+------+---------------------------------------+
|league|players                                |
+------+---------------------------------------+
|MLB   |[Shohei Ohtani, Aaron Judge, Juan Soto]|
|NFL   |[Lamar Jackson, Josh Allen]            |
|NBA   |[Kobe Bryant, Luka Doncic]             |
+------+---------------------------------------+



#### Pivot

In [53]:
(df.fillna('Unknown', subset=['Outlet_Size'])
 .groupBy('Item_Type')
 .pivot('Outlet_Size')
 .agg(round(avg('Item_MRP'), 2))
 .show()
)

+--------------------+------+------+------+-------+
|           Item_Type|  High|Medium| Small|Unknown|
+--------------------+------+------+------+-------+
|       Starchy Foods|158.16|148.42|150.27| 140.48|
|        Baking Goods| 129.2|126.18|125.21| 126.67|
|              Breads|133.76|140.86|145.52| 139.05|
|Fruits and Vegeta...|145.57|142.97|148.31| 142.58|
|                Meat|137.24|136.42| 145.7| 139.29|
|         Hard Drinks|141.93|142.84|129.76| 134.39|
|         Soft Drinks|131.76|128.27|132.86| 133.42|
|           Household| 147.1|147.71|153.97| 147.77|
|           Breakfast|147.49|134.54|130.57| 158.68|
|               Dairy|153.51|148.51|145.94| 149.05|
|         Snack Foods|145.85|148.78|144.35| 145.01|
|              Others|132.58|127.84|137.89| 132.59|
|             Seafood|134.86|140.86|144.28| 142.22|
|              Canned|135.44|138.12| 142.3| 140.65|
|        Frozen Foods|136.83|140.56|137.84| 137.49|
|  Health and Hygiene|135.11| 128.7|131.83| 130.56|
+-----------

#### When Otherwise

In [54]:
(
    df
    .withColumn(
        'Veg_Flag',
        when(col('Item_Type') == 'Meat', False)
        .otherwise(True)
    )
    .withColumn(
        'Veg_Exp_Flag',
        when((col('Veg_Flag') == True) & (col('Item_MRP') > 100), 'Veg Inexpensive')
        .when((col('Veg_Flag') == True) & (col('Item_MRP') < 100), 'Veg Expensive')
        .otherwise('Not Available')
    )
    .show(8)
)

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+--------+---------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|Veg_Flag|   Veg_Exp_Flag|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+--------+---------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|    true|Veg Inexpensive|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft 

#### Joins

In [55]:
player_data = [
    ('1', 'Shohei Ohtani', 'l03'),
    ('2', 'Aaron Judge', 'l03'),
    ('3', 'Juan Soto', 'l03'),
    ('4', 'Lamar Jackson', 'l01'),
    ('5', 'Josh Allen', 'l01'),
    ('6', 'Kobe Bryant', 'l02'),
    ('7', 'Luka Doncic', 'l02'),
    ('8', 'Connor McDavid', 'l04'),
    ('9', 'Lamine Yamal', 'f00'),
    ('10', 'Kylian Mbappe', 'f00')
]
player_schema = 'id STRING, player STRING, league_id STRING'

player_df = spark.createDataFrame(player_data, player_schema)

league_data = [
    ('l01', 'NFL'),
    ('l02', 'NBA'),
    ('l03', 'MLB'),
    ('l04', 'NHL'),
    ('l05', 'PL')
]
league_schema = 'id STRING, league STRING'

league_df = spark.createDataFrame(league_data, league_schema)

In [56]:
player_df.show()

+---+--------------+---------+
| id|        player|league_id|
+---+--------------+---------+
|  1| Shohei Ohtani|      l03|
|  2|   Aaron Judge|      l03|
|  3|     Juan Soto|      l03|
|  4| Lamar Jackson|      l01|
|  5|    Josh Allen|      l01|
|  6|   Kobe Bryant|      l02|
|  7|   Luka Doncic|      l02|
|  8|Connor McDavid|      l04|
|  9|  Lamine Yamal|      f00|
| 10| Kylian Mbappe|      f00|
+---+--------------+---------+



In [57]:
league_df.show()

+---+------+
| id|league|
+---+------+
|l01|   NFL|
|l02|   NBA|
|l03|   MLB|
|l04|   NHL|
|l05|    PL|
+---+------+



In [58]:
(player_df
 .join(league_df, player_df['league_id']==league_df['id'])
 .select('player', 'league')
 .show()
)

+--------------+------+
|        player|league|
+--------------+------+
| Lamar Jackson|   NFL|
|    Josh Allen|   NFL|
|   Kobe Bryant|   NBA|
|   Luka Doncic|   NBA|
| Shohei Ohtani|   MLB|
|   Aaron Judge|   MLB|
|     Juan Soto|   MLB|
|Connor McDavid|   NHL|
+--------------+------+



In [59]:
(player_df
 .join(league_df, player_df['league_id']==league_df['id'], 'left')
 .select('player', 'league_id', 'league')
 .show()
)

+--------------+---------+------+
|        player|league_id|league|
+--------------+---------+------+
| Shohei Ohtani|      l03|   MLB|
|   Aaron Judge|      l03|   MLB|
|     Juan Soto|      l03|   MLB|
| Lamar Jackson|      l01|   NFL|
|    Josh Allen|      l01|   NFL|
|   Kobe Bryant|      l02|   NBA|
|   Luka Doncic|      l02|   NBA|
|Connor McDavid|      l04|   NHL|
|  Lamine Yamal|      f00|  NULL|
| Kylian Mbappe|      f00|  NULL|
+--------------+---------+------+



In [60]:
(player_df
 .join(league_df, player_df['league_id']==league_df['id'], 'right')
 .select('player', 'league_id', 'league')
 .show()
)

+--------------+---------+------+
|        player|league_id|league|
+--------------+---------+------+
|    Josh Allen|      l01|   NFL|
| Lamar Jackson|      l01|   NFL|
|   Luka Doncic|      l02|   NBA|
|   Kobe Bryant|      l02|   NBA|
|     Juan Soto|      l03|   MLB|
|   Aaron Judge|      l03|   MLB|
| Shohei Ohtani|      l03|   MLB|
|Connor McDavid|      l04|   NHL|
|          NULL|     NULL|    PL|
+--------------+---------+------+



In [61]:
(player_df
 .join(league_df, player_df['league_id']==league_df['id'], 'anti')
 .show()
)

+---+-------------+---------+
| id|       player|league_id|
+---+-------------+---------+
|  9| Lamine Yamal|      f00|
| 10|Kylian Mbappe|      f00|
+---+-------------+---------+



### Window Functions

In [62]:
from pyspark.sql.window import Window

In [63]:
(df
 .withColumn(
     'Row_Col', 
     row_number().over(Window.orderBy('Item_Identifier'))
    )
 .show(5)
)

+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|  Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|Row_Col|
+---------------+-----------+----------------+---------------+-----------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+-------+
|          DRA12|       11.6|         Low Fat|    0.041177505|Soft Drinks|140.3154|           OUT017|                     2007|       NULL|              Tier 2|Supermarket Type1|        2552.6772|      1|
|          DRA12|       11.6|         Low Fat|    0.040911824|Soft Drinks|142.3154|           OUT013|                     1987|       High|              Tier 3|Supermarket Type1|  

In [64]:
# specify window
w = Window.orderBy(desc('Total_Sales_Qty'))

(df
 .groupBy('Outlet_Identifier')
 .agg(round(count('Outlet_Identifier'), 2).alias('Total_Sales_Qty'))
 .withColumn('Rank', rank().over(w))
 .withColumn('Dense_Rank', dense_rank().over(w))
 .show(10)
)

+-----------------+---------------+----+----------+
|Outlet_Identifier|Total_Sales_Qty|Rank|Dense_Rank|
+-----------------+---------------+----+----------+
|           OUT027|            935|   1|         1|
|           OUT013|            932|   2|         2|
|           OUT046|            930|   3|         3|
|           OUT035|            930|   3|         3|
|           OUT049|            930|   3|         3|
|           OUT045|            929|   6|         4|
|           OUT018|            928|   7|         5|
|           OUT017|            926|   8|         6|
|           OUT010|            555|   9|         7|
|           OUT019|            528|  10|         8|
+-----------------+---------------+----+----------+



In [65]:
(df
 .groupBy('Item_Type')
 .agg(round(sum('Item_Outlet_Sales'), 2).alias('Total_Sales'))
 .withColumn(
     'Cum_Sales', 
     sum('Total_Sales').over(Window
                             .orderBy(desc('Total_Sales'))
                             .rowsBetween(Window.unboundedPreceding, Window.currentRow)
                            )
                        .cast(DecimalType(18, 2))
    )
 .show(truncate=False)
)

+---------------------+-----------+-----------+
|Item_Type            |Total_Sales|Cum_Sales  |
+---------------------+-----------+-----------+
|Fruits and Vegetables|2820059.82 |2820059.82 |
|Snack Foods          |2732786.09 |5552845.91 |
|Household            |2055493.71 |7608339.62 |
|Frozen Foods         |1825734.79 |9434074.41 |
|Dairy                |1522594.05 |10956668.46|
|Canned               |1444151.49 |12400819.95|
|Baking Goods         |1265525.34 |13666345.29|
|Health and Hygiene   |1045200.14 |14711545.43|
|Meat                 |917565.61  |15629111.04|
|Soft Drinks          |892897.72  |16522008.76|
|Breads               |553237.19  |17075245.95|
|Hard Drinks          |457793.43  |17533039.38|
|Starchy Foods        |351401.25  |17884440.63|
|Others               |325517.61  |18209958.24|
|Breakfast            |232298.95  |18442257.19|
|Seafood              |148868.22  |18591125.41|
+---------------------+-----------+-----------+



### User Defined Functions (UDF)

In [66]:
def my_func(x):
    return x * x

In [67]:
my_udf = udf(my_func)

In [68]:
(df
 .withColumn('FooBar', my_udf('Item_MRP'))
 .select('Item_MRP', 'FooBar')
 .show(5)
)

+--------+------------------+
|Item_MRP|            FooBar|
+--------+------------------+
|249.8092|62404.636404640005|
| 48.2692|2329.9156686399997|
| 141.618|      20055.657924|
| 182.095|      33158.589025|
| 53.8614|2901.0504099600003|
+--------+------------------+
only showing top 5 rows



### Spark SQL

In [69]:
%load_ext sparksql_magic

# Create Temp View
df.createTempView('my_view')

In [70]:
%%sparksql

SELECT * FROM my_view LIMIT 10

0,1,2,3,4,5,6,7,8,9,10,11
Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
FDO10,13.65,Regular,0.012741089,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
FDP10,,Low Fat,0.127469857,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
FDH17,16.2,Regular,0.016687114,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986


In [71]:
df_sql = spark.sql("SELECT * FROM my_view WHERE Outlet_Size = 'High'")

In [72]:
df_sql.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          NCD19|       8.93|         Low Fat|            0.0|           Household| 53.8614|           OUT013|                     1987|       High|              Tier 3|Supermarket Type1|         994.7052|
|          FDO10|      13.65|         Regular|    0.012741089|         Snack Foods| 57.6588|           OUT013|                     1987|       High|              Tier 3|Superma