# SCHEMA COMPARISON

In [1]:
import findspark
# Initialise findspark before pyspark is imported in the next cell.
# NOTE(review): presumably this makes the local Spark installation importable
# from this kernel -- confirm against the findspark documentation.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Single session object shared by every cell below; getOrCreate() is called on
# the default builder with no extra configuration.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 13:35:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## DataFrames

In [3]:
# Catalog v1 sample rows, ordered as (id, item, category, price, available).
# NOTE(review): row 3 carries the integer 180 in the `category` slot while its
# price is None; the saved output shows it as the string "180". Confirm whether
# the value was meant to be the price.
data_catalog_v1 = [
    (1, "Harry poter", "Books", 120, True),
    (2, "Microwave", "Kitchen", None, True),
    (3, "Jacket", 180, None, None),
    (4, None, "Furniture", 250, False),
    (5, "Xbox360", "Toys", None, False),
]

# DDL-style schema string passed to spark.createDataFrame for the v1 frame.
schema_catalog_v1 = "id INTEGER, item STRING, category STRING,price INTEGER, available BOOLEAN"

In [4]:
# Build the v1 DataFrame from the sample rows and the DDL schema, then preview it.
df_catalog_v1 = spark.createDataFrame(
    data=data_catalog_v1,
    schema=schema_catalog_v1,
)
df_catalog_v1.show()

                                                                                

+---+-----------+---------+-----+---------+
| id|       item| category|price|available|
+---+-----------+---------+-----+---------+
|  1|Harry poter|    Books|  120|     true|
|  2|  Microwave|  Kitchen| NULL|     true|
|  3|     Jacket|      180| NULL|     NULL|
|  4|       NULL|Furniture|  250|    false|
|  5|    Xbox360|     Toys| NULL|    false|
+---+-----------+---------+-----+---------+



In [5]:
# Catalog v2 sample rows: the same tuples as catalog v1; only the schema below
# gives the columns different names (name, catalog, value, exists).
# NOTE(review): row 3 again has the integer 180 in the third (string) slot --
# confirm this is intentional.
data_catalog_v2 = [
    (1, "Harry poter", "Books", 120, True),
    (2, "Microwave", "Kitchen", None, True),
    (3, "Jacket", 180, None, None),
    (4, None, "Furniture", 250, False),
    (5, "Xbox360", "Toys", None, False),
]

# DDL-style schema string passed to spark.createDataFrame for the v2 frame.
schema_catalog_v2 = "id INTEGER, name STRING, catalog STRING, value INTEGER, exists BOOLEAN"

In [6]:
# Build the v2 DataFrame from the sample rows and the DDL schema, then preview it.
df_catalog_v2 = spark.createDataFrame(
    data=data_catalog_v2,
    schema=schema_catalog_v2,
)
df_catalog_v2.show()

+---+-----------+---------+-----+------+
| id|       name|  catalog|value|exists|
+---+-----------+---------+-----+------+
|  1|Harry poter|    Books|  120|  true|
|  2|  Microwave|  Kitchen| NULL|  true|
|  3|     Jacket|      180| NULL|  NULL|
|  4|       NULL|Furniture|  250| false|
|  5|    Xbox360|     Toys| NULL| false|
+---+-----------+---------+-----+------+



In [7]:
# Catalog v3 sample rows, ordered as (id, active, catalog, value, extra).
# `extra` is supplied as a tuple of strings (or None) for the ARRAY<STRING> column.
# NOTE(review): row 3 has the integer 180 in the `catalog` (string) slot --
# confirm this is intentional.
data_catalog_v3 = [
    (1, True, "Books", 120.5, ("kids", "+8y")),
    (2, False, "Kitchen", None, ("home", "electric")),
    (3, False, 180, None, None),
    (4, None, "Furniture", 250.94, ("wood", None)),
    (5, True, "Toys", 15.20, ("blue", "childs", "disney")),
]

# DDL-style schema string passed to spark.createDataFrame for the v3 frame.
schema_catalog_v3 = "id INTEGER, active BOOLEAN, catalog STRING, value FLOAT, extra ARRAY<STRING>"

In [8]:
# Build the v3 DataFrame, then preview it with truncate=False so the array
# column is printed in full.
df_catalog_v3 = spark.createDataFrame(
    data=data_catalog_v3,
    schema=schema_catalog_v3,
)
df_catalog_v3.show(truncate=False)

+---+------+---------+------+----------------------+
|id |active|catalog  |value |extra                 |
+---+------+---------+------+----------------------+
|1  |true  |Books    |120.5 |[kids, +8y]           |
|2  |false |Kitchen  |NULL  |[home, electric]      |
|3  |false |180      |NULL  |NULL                  |
|4  |NULL  |Furniture|250.94|[wood, NULL]          |
|5  |true  |Toys     |15.2  |[blue, childs, disney]|
+---+------+---------+------+----------------------+



## CHECK SCHEMA

In [9]:
# Tree-style view of the v1 schema: one line per column with its type and nullability.
df_catalog_v1.printSchema()

root
 |-- id: integer (nullable = true)
 |-- item: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- available: boolean (nullable = true)



In [10]:
# The same information as a StructType object (a list of StructField entries).
df_catalog_v2.schema

StructType([StructField('id', IntegerType(), True), StructField('name', StringType(), True), StructField('catalog', StringType(), True), StructField('value', IntegerType(), True), StructField('exists', BooleanType(), True)])

In [11]:
# JSON string form of the v3 schema, including the nested array element type.
df_catalog_v3.schema.json()

'{"fields":[{"metadata":{},"name":"id","nullable":true,"type":"integer"},{"metadata":{},"name":"active","nullable":true,"type":"boolean"},{"metadata":{},"name":"catalog","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"float"},{"metadata":{},"name":"extra","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"}'

## COMPARE SCHEMAS

In [12]:
# Baseline check: a schema compared with itself (saved result: True).
df_catalog_v1.schema == df_catalog_v1.schema

True

In [13]:
# v1 vs v2: the declared column types match position by position, but every
# column except `id` has a different name (saved result: False).
df_catalog_v1.schema == df_catalog_v2.schema

False

In [14]:
# v2 vs v3: the declared schemas differ in both column names and column types
# (saved result: False).
df_catalog_v2.schema == df_catalog_v3.schema

False

## CHECK COLUMN DIFFERENCES

In [15]:
# Column names present in v1 but absent from v2.
set(df_catalog_v1.columns).difference(df_catalog_v2.columns)

{'available', 'category', 'item', 'price'}

In [16]:
# Column names present in v2 but absent from v1.
set(df_catalog_v2.columns).difference(df_catalog_v1.columns)

{'catalog', 'exists', 'name', 'value'}

In [17]:
# Column names present in v1 but absent from v3.
set(df_catalog_v1.columns).difference(df_catalog_v3.columns)

{'available', 'category', 'item', 'price'}

In [18]:
# Column names present in v3 but absent from v1.
set(df_catalog_v3.columns).difference(df_catalog_v1.columns)

{'active', 'catalog', 'extra', 'value'}

In [19]:
# Column names present in v2 but absent from v3. This cell previously repeated
# the v1 - v2 difference from an earlier cell; v2 - v3 completes the pairwise
# comparison and mirrors the v3 - v2 cell that follows.
set(df_catalog_v2.columns) - set(df_catalog_v3.columns)

{'exists', 'name'}

In [20]:
# Column names present in v3 but absent from v2.
set(df_catalog_v3.columns).difference(df_catalog_v2.columns)

{'active', 'extra'}

## Collect All Columns

In [21]:
# Union of the v1 and v2 column names, de-duplicated through a set.
# sorted() makes the printed order deterministic; iterating a bare set of
# strings has no guaranteed order and can change between kernel runs.
print(sorted(set(df_catalog_v1.columns + df_catalog_v2.columns)))

['available', 'catalog', 'category', 'exists', 'id', 'item', 'name', 'price', 'value']


In [22]:
# Union of the v2 and v3 column names, de-duplicated through a set.
# sorted() makes the printed order deterministic; iterating a bare set of
# strings has no guaranteed order and can change between kernel runs.
print(sorted(set(df_catalog_v2.columns + df_catalog_v3.columns)))

['active', 'catalog', 'exists', 'extra', 'id', 'name', 'value']


In [23]:
# Union of the v1 and v3 column names, de-duplicated through a set.
# sorted() makes the printed order deterministic; iterating a bare set of
# strings has no guaranteed order and can change between kernel runs.
print(sorted(set(df_catalog_v1.columns + df_catalog_v3.columns)))

['active', 'available', 'catalog', 'category', 'extra', 'id', 'item', 'price', 'value']
