In [21]:
# Stop a SparkSession left over from an earlier run, if there is one.
# Presumably this is here so the builder settings in the following cell apply
# to a fresh session rather than an already-running one.
# The guard keeps this cell from raising NameError on a fresh kernel, where
# `spark` has not been created yet.
if "spark" in globals():
    spark.stop()

In [25]:
from pyspark.sql import SparkSession 
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StringType,IntegerType, StructField, StructType

# Build (or reuse) the notebook's SparkSession with Hive support enabled.
# Executor and driver sizing are passed as string-valued Spark config entries.
# No backslash continuations are needed: the surrounding parentheses already
# let the method chain span multiple lines.
spark = (
    SparkSession.builder
    .appName('Axis-pyspark-refresher')
    .config('spark.executor.instances', '2')
    .config('spark.executor.cores', '4')
    .config('spark.executor.memory', '4g')
    .config('spark.driver.memory', '2g')
    .enableHiveSupport()
    .getOrCreate()
)

# The log level used to be passed through the misspelled config key
# 'spark.sql.seesion.logLevel', which is not a recognized Spark setting and
# therefore had no effect. Set it explicitly on the SparkContext instead.
spark.sparkContext.setLogLevel('WARN')


In [29]:
# Explicit schema for the employee rows.
# StructField arguments are (name, dataType, nullable); only dept_id may be NULL.
emp_schema = StructType([
    StructField('emp_id', IntegerType(), False),
    StructField('name', StringType(), False),
    StructField('age', IntegerType(), False),
    StructField('dept_id', IntegerType(), True),
])

# Employee rows as (emp_id, name, age, dept_id) tuples.
# emp_id is assigned sequentially starting at 1; the last employee has no
# department, so their dept_id is None.
emp_data = [
    (emp_id, name, age, dept_id)
    for emp_id, (name, age, dept_id) in enumerate(
        [
            ("John Doe", 28, 101),
            ("Jane Smith", 34, 102),
            ("Michael Johnson", 45, 103),
            ("Emily Davis", 25, 101),
            ("Sarah Wilson", 29, None),
        ],
        start=1,
    )
]

# Explicit schema for the department rows; neither column may be NULL.
dept_schema = StructType([
    StructField(name='dept_id', dataType=IntegerType(), nullable=False),
    StructField(name='dept_name', dataType=StringType(), nullable=False),
])

# Department rows as (dept_id, dept_name) tuples, built from an id -> name
# mapping (dict insertion order is preserved).
dept_data = list(
    {
        101: "Sales",
        102: "Marketing",
        103: "Finance",
        104: "HR",
    }.items()
)

In [30]:
# Materialize the in-memory rows as DataFrames using the explicit schemas
# (createDataFrame takes the data first, then the schema).
emp_df = spark.createDataFrame(emp_data, emp_schema)
dept_df = spark.createDataFrame(dept_data, dept_schema)

In [44]:
# Inner join of employees to departments on matching dept_id values.
# Because the condition is a column expression, the result keeps both dept_id
# columns (one per side). Rows without a match on either side are dropped:
# the employee with a NULL dept_id and the department with no employees.
# Joining with on='dept_id' instead would keep a single dept_id column.
(
    emp_df
    .join(dept_df, on=emp_df['dept_id'] == dept_df['dept_id'], how='inner')
    .show()
)

+------+---------------+---+-------+-------+---------+
|emp_id|           name|age|dept_id|dept_id|dept_name|
+------+---------------+---+-------+-------+---------+
|     1|       John Doe| 28|    101|    101|    Sales|
|     4|    Emily Davis| 25|    101|    101|    Sales|
|     2|     Jane Smith| 34|    102|    102|Marketing|
|     3|Michael Johnson| 45|    103|    103|  Finance|
+------+---------------+---+-------+-------+---------+



In [50]:
from pyspark.sql.functions import lit, col

In [53]:
# Stack emp_df on top of a copy of itself whose dept_id is replaced by NULL.
# union() lines columns up by position, so the select lists the columns in
# emp_df's order.
(
    emp_df
    .union(
        emp_df.select("emp_id", "name", "age", lit(None).alias("dept_id"))
    )
    .show()
)

+------+---------------+---+-------+
|emp_id|           name|age|dept_id|
+------+---------------+---+-------+
|     1|       John Doe| 28|    101|
|     2|     Jane Smith| 34|    102|
|     3|Michael Johnson| 45|    103|
|     4|    Emily Davis| 25|    101|
|     5|   Sarah Wilson| 29|   NULL|
|     1|       John Doe| 28|   NULL|
|     2|     Jane Smith| 34|   NULL|
|     3|Michael Johnson| 45|   NULL|
|     4|    Emily Davis| 25|   NULL|
|     5|   Sarah Wilson| 29|   NULL|
+------+---------------+---+-------+



In [63]:
# Redistribute the employee rows across 200 partitions.
emp_df_repartitioned = emp_df.repartition(numPartitions=200)

In [59]:
# Print summary statistics (count, mean, stddev, min, max) for every column
# of the repartitioned DataFrame.
(
    emp_df_repartitioned
    .describe()
    .show()
)

24/11/02 20:49:51 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------------------+------------+-----------------+------------------+
|summary|            emp_id|        name|              age|           dept_id|
+-------+------------------+------------+-----------------+------------------+
|  count|                 5|           5|                5|                 4|
|   mean|               3.0|        NULL|             32.2|            101.75|
| stddev|1.5811388300841898|        NULL|7.854934754662192|0.9574271077563339|
|    min|                 1| Emily Davis|               25|               101|
|    max|                 5|Sarah Wilson|               45|               103|
+-------+------------------+------------+-----------------+------------------+



In [64]:
# Report how many partitions back the repartitioned DataFrame; the value is
# shown as the cell's output.
(
    emp_df_repartitioned
    .rdd
    .getNumPartitions()
)

200