<a href="https://colab.research.google.com/github/DenysNunes/data-examples/blob/main/spark/basic/dataframes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Init Spark**

In [1]:
!pip install -q pyspark==3.1.1

from pyspark.sql import SparkSession


spark = SparkSession \
    .builder \
    .master('local[*]') \
    .appName("New Session Example") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .enableHiveSupport() \
    .getOrCreate()

## **Single dataframe using Row object**

In [2]:
from pyspark.sql.types import Row

# One Row per person; the keyword names become the DataFrame column names.
raw_rows = [
    Row(id=person_id, name=person_name)
    for person_id, person_name in [(1, 'Jonh'), (2, 'Maria'), (3, 'Ben')]
]

# No explicit schema is passed, so Spark infers it from the Row objects.
df = spark.createDataFrame(raw_rows)
df.show()

+---+-----+
| id| name|
+---+-----+
|  1| Jonh|
|  2|Maria|
|  3|  Ben|
+---+-----+



## **Using complex data**

In [3]:
# Same people as before, each now carrying a list-valued `animals` field
# to demonstrate an array column.
raw_rows_2 = [
    Row(id=owner_id, name=owner_name, animals=pets)
    for owner_id, owner_name, pets in (
        (1, 'Jonh', ['cat', 'dog']),
        (2, 'Maria', ['monkey']),
        (3, 'Ben', ['cat', 'rat']),
    )
]

# The schema is again inferred from the Row objects.
df_2 = spark.createDataFrame(raw_rows_2)
df_2.show()

+---+-----+----------+
| id| name|   animals|
+---+-----+----------+
|  1| Jonh|[cat, dog]|
|  2|Maria|  [monkey]|
|  3|  Ben|[cat, rat]|
+---+-----+----------+



## **Dataframe with JSON schema**

In [4]:
from pyspark.sql.types import StructType

# Plain positional records: column names come from the schema below.
raw_rows_3 = [
    [1, 'Jonh', ['cat', 'dog']],
    [2, 'Maria', ['monkey']],
    [3, 'Ben', ['cat', 'rat']],
]

# Schema expressed as a JSON-style dict: a struct with three nullable
# fields, the last one an array of (nullable) strings.
schema = {
    "type": "struct",
    "fields": [
        {"name": "id", "type": "long", "nullable": True, "metadata": {}},
        {"name": "name", "type": "string", "nullable": True, "metadata": {}},
        {
            "name": "animals",
            "type": {"type": "array", "elementType": "string", "containsNull": True},
            "nullable": True,
            "metadata": {},
        },
    ],
}

# Turn the dict into a StructType and apply it when building the DataFrame.
struct_schema = StructType.fromJson(schema)

df_3 = spark.createDataFrame(raw_rows_3, struct_schema)
df_3.show()

+---+-----+----------+
| id| name|   animals|
+---+-----+----------+
|  1| Jonh|[cat, dog]|
|  2|Maria|  [monkey]|
|  3|  Ben|[cat, rat]|
+---+-----+----------+



## **Dataframe with typed schema**

In [5]:
from pyspark.sql.types import (
    StructField,
    StructType,
    StringType,
    LongType,
    ArrayType,
)

# Plain positional records: column names come from the schema below.
raw_rows_4 = [
    [1, 'Jonh', ['cat', 'dog']],
    [2, 'Maria', ['monkey']],
    [3, 'Ben', ['cat', 'rat']],
]

# Schema built directly from PySpark type objects: three nullable columns,
# the last one an array of strings whose elements may be null.
struct_schema_2 = StructType([
    StructField('id', LongType(), nullable=True),
    StructField('name', StringType(), nullable=True),
    StructField('animals', ArrayType(StringType(), containsNull=True), nullable=True),
])

df_4 = spark.createDataFrame(raw_rows_4, struct_schema_2)
df_4.show()

+---+-----+----------+
| id| name|   animals|
+---+-----+----------+
|  1| Jonh|[cat, dog]|
|  2|Maria|  [monkey]|
|  3|  Ben|[cat, rat]|
+---+-----+----------+



## **Running SQL over dataframes**

In [6]:
# Sample people, each with a list-valued `animals` field.
raw_rows_5 = [
        Row(id=1, name='Jonh', animals=['cat', 'dog']),
        Row(id=2, name='Maria', animals=['monkey']),
        Row(id=3, name='Ben', animals=['cat', 'rat'])
]

df_5 = spark.createDataFrame(raw_rows_5)

# Expose the DataFrame to Spark SQL under a session-scoped view name.
# `createOrReplaceTempView` replaces `registerTempTable`, which is deprecated
# since Spark 2.0; it also makes this cell safe to re-run.
df_5.createOrReplaceTempView("tb_teste_table")

# Keep only the rows whose `animals` array contains 'cat'.
df_6 = spark.sql("""
    SELECT * FROM tb_teste_table WHERE array_contains(animals, 'cat')
""")

df_6.show()

+---+----+----------+
| id|name|   animals|
+---+----+----------+
|  1|Jonh|[cat, dog]|
|  3| Ben|[cat, rat]|
+---+----+----------+

