In [4]:
from pyspark.sql import SparkSession,Row

In [2]:
spark = SparkSession.builder.appName("test").master("local[*]").getOrCreate()

In [5]:
header =Row("ID", "Name", "gender", "Salary", "Department")

In [7]:
employee_df = spark.sparkContext.parallelize([
    header(1, "Deva", "Male", 5000, "Sales"), 
    header(2, "Jugnu", "Female", 6000, "HR"), 
    header(3, "Kavita", "Female", 7500, "IT"), 
    header(4, "Vikram", "Male", 6500, "Marketing"), 
    header(5, "Shabana", "Female", 5500, "Finance"), 
    header(6, "Shantilal", "Male", 8000, "Sales"), 
    header(7, "Vinod","Male", 7200, "HR"), 
    header(8, "Vimla", "Female", 6600, "IT"), 
    header(9, "Jasmin", "Female", 5400, "Marketing"), 
    header(10, "Lovely", "Female", 6300, "Finance"), 
    header(11, "Mohan", "Male", 5700, "Sales"), 
    header(12, "Purvish", "Male", 7000, "HR"), 
    header(13, "Jinat", "Female", 7100, "IT"), 
    header(14, "Eva", "Female", 6800,"Marketing"), 
    header(15, "Jitendra", "Male", 5000, "Finance"), 
    header(15, "Rajkumar", "Male", 4500, "Finance"), 
    header(15, "Satish", "Male", 4500, "Finance"), 
    header(15, "Himmat", "Male", 3500, "Finance")], 2).toDF()

In [8]:
#Check the data in DataFrame 
employee_df.show()

+---+---------+------+------+----------+
| ID|     Name|gender|Salary|Department|
+---+---------+------+------+----------+
|  1|     Deva|  Male|  5000|     Sales|
|  2|    Jugnu|Female|  6000|        HR|
|  3|   Kavita|Female|  7500|        IT|
|  4|   Vikram|  Male|  6500| Marketing|
|  5|  Shabana|Female|  5500|   Finance|
|  6|Shantilal|  Male|  8000|     Sales|
|  7|    Vinod|  Male|  7200|        HR|
|  8|    Vimla|Female|  6600|        IT|
|  9|   Jasmin|Female|  5400| Marketing|
| 10|   Lovely|Female|  6300|   Finance|
| 11|    Mohan|  Male|  5700|     Sales|
| 12|  Purvish|  Male|  7000|        HR|
| 13|    Jinat|Female|  7100|        IT|
| 14|      Eva|Female|  6800| Marketing|
| 15| Jitendra|  Male|  5000|   Finance|
| 15| Rajkumar|  Male|  4500|   Finance|
| 15|   Satish|  Male|  4500|   Finance|
| 15|   Himmat|  Male|  3500|   Finance|
+---+---------+------+------+----------+



In [9]:
from pyspark.sql.functions import expr

In [10]:
#Now create an expression. These expressions are Column types. 
male_expr = expr("gender='Male'") 
female_expr= expr("gender='Female'") 
sal_expr= expr("Salary >=6600")

In [11]:
#Now apply these filters to the data 
employee_df.filter(male_expr).show() 
employee_df.filter(female_expr).show() 
employee_df.filter(sal_expr).show()

+---+---------+------+------+----------+
| ID|     Name|gender|Salary|Department|
+---+---------+------+------+----------+
|  1|     Deva|  Male|  5000|     Sales|
|  4|   Vikram|  Male|  6500| Marketing|
|  6|Shantilal|  Male|  8000|     Sales|
|  7|    Vinod|  Male|  7200|        HR|
| 11|    Mohan|  Male|  5700|     Sales|
| 12|  Purvish|  Male|  7000|        HR|
| 15| Jitendra|  Male|  5000|   Finance|
| 15| Rajkumar|  Male|  4500|   Finance|
| 15|   Satish|  Male|  4500|   Finance|
| 15|   Himmat|  Male|  3500|   Finance|
+---+---------+------+------+----------+

+---+-------+------+------+----------+
| ID|   Name|gender|Salary|Department|
+---+-------+------+------+----------+
|  2|  Jugnu|Female|  6000|        HR|
|  3| Kavita|Female|  7500|        IT|
|  5|Shabana|Female|  5500|   Finance|
|  8|  Vimla|Female|  6600|        IT|
|  9| Jasmin|Female|  5400| Marketing|
| 10| Lovely|Female|  6300|   Finance|
| 13|  Jinat|Female|  7100|        IT|
| 14|    Eva|Female|  6800| Marketi

In [12]:
from pyspark.sql.functions import array

In [17]:
#Now lets create array by combining multiple columns in DataFrame and drop the same column from output 
array_df = employee_df.filter(sal_expr)\
.withColumn("Array" , array("Name","gender", "Department"))\
.drop("Name","gende", "Department")
array_df.printSchema()
array_df.show()

root
 |-- ID: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Array: array (nullable = false)
 |    |-- element: string (containsNull = true)

+---+------+------+--------------------+
| ID|gender|Salary|               Array|
+---+------+------+--------------------+
|  3|Female|  7500|[Kavita, Female, IT]|
|  6|  Male|  8000|[Shantilal, Male,...|
|  7|  Male|  7200|   [Vinod, Male, HR]|
|  8|Female|  6600| [Vimla, Female, IT]|
| 12|  Male|  7000| [Purvish, Male, HR]|
| 13|Female|  7100| [Jinat, Female, IT]|
| 14|Female|  6800|[Eva, Female, Mar...|
+---+------+------+--------------------+



In [14]:
from pyspark.sql.functions import struct

In [16]:
#Now lets create array by combining multiple columns in DataFrame and drop the same column from output  
struct_df = employee_df.filter(sal_expr)\
.withColumn("Struct", struct("Name","gender", "Department"))\
.drop("Name","gende", "Department")

struct_df.printSchema()

struct_df.show()

root
 |-- ID: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Struct: struct (nullable = false)
 |    |-- Name: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- Department: string (nullable = true)

+---+------+------+--------------------+
| ID|gender|Salary|              Struct|
+---+------+------+--------------------+
|  3|Female|  7500|[Kavita, Female, IT]|
|  6|  Male|  8000|[Shantilal, Male,...|
|  7|  Male|  7200|   [Vinod, Male, HR]|
|  8|Female|  6600| [Vimla, Female, IT]|
| 12|  Male|  7000| [Purvish, Male, HR]|
| 13|Female|  7100| [Jinat, Female, IT]|
| 14|Female|  6800|[Eva, Female, Mar...|
+---+------+------+--------------------+



In [18]:
#Even both support mixed datatypes as well 
employee_df.filter(sal_expr)\
.withColumn("Array" , array("Name","gender", "Department"))\
.drop("gender", "Department", "Salary").show() 

employee_df.filter(sal_expr)\
.withColumn("Struct", struct("Name","gender", "Department"))\
.drop("gender", "Department", "Salary").show()

+---+---------+--------------------+
| ID|     Name|               Array|
+---+---------+--------------------+
|  3|   Kavita|[Kavita, Female, IT]|
|  6|Shantilal|[Shantilal, Male,...|
|  7|    Vinod|   [Vinod, Male, HR]|
|  8|    Vimla| [Vimla, Female, IT]|
| 12|  Purvish| [Purvish, Male, HR]|
| 13|    Jinat| [Jinat, Female, IT]|
| 14|      Eva|[Eva, Female, Mar...|
+---+---------+--------------------+

+---+---------+--------------------+
| ID|     Name|              Struct|
+---+---------+--------------------+
|  3|   Kavita|[Kavita, Female, IT]|
|  6|Shantilal|[Shantilal, Male,...|
|  7|    Vinod|   [Vinod, Male, HR]|
|  8|    Vimla| [Vimla, Female, IT]|
| 12|  Purvish| [Purvish, Male, HR]|
| 13|    Jinat| [Jinat, Female, IT]|
| 14|      Eva|[Eva, Female, Mar...|
+---+---------+--------------------+



In [19]:
#monotonically_increasing_id() 
#Lets create data with 4 partitions 
employee_df_2 = spark.sparkContext.parallelize([
    header(1, "Deva", "Male", 5000, "Sales"), 
    header(2, "Jugnu", "Female", 6000, "HR"), 
    header(3, "Kavita", "Female", 7500, "IT"), 
    header(4, "Vikram", "Male", 6500, "Marketing"),
    header(5, "Shabana", "Female", 5500, "Finance"), 
    header(6, "Shantilal", "Male", 8000, "Sales"), 
    header(7, "Vinod", "Male", 7200, "HR"), 
    header(8, "Vimla", "Female", 6600, "IT"),
    header(9, "Jasmin", "Female", 5400, "Marketing"),
    header(10, "Lovely", "Female", 6300, "Finance"), 
    header(11, "Mohan", "Male", 5700, "Sales"), 
    header(12, "Purvish", "Male", 7000, "HR"), 
    header(13, "Jinat", "Female", 7100, "IT"), 
    header(14, "Eva", "Female", 6800,"Marketing"), 
    header(15, "Jitendra", "Male", 5000, "Finance") , 
    header(15, "Rajkumar", "Male", 4500, "Finance") , 
    header(15, "Satish", "Male", 4500, "Finance") , 
    header(15, "Himmat", "Male", 3500, "Finance")], 4).toDF()

In [21]:
# Now generate monotonically_increasing_id() for each row. 
#It is a 64 bit integers 
#Generated ID must be unique and increasing only. 
#It can not be consecutive 
#The current implementation puts the partition ID in the upper 31 bits, 
# and the record number within each partition in the lower 33 bits.  
#The assumption is that the data frame has less than 1 billion partitions,
#and each partition has less than 8 billion records. 

from pyspark.sql.functions import monotonically_increasing_id

In [23]:
employee_df_2.withColumn("unique_id", monotonically_increasing_id()).show()

+---+---------+------+------+----------+-----------+
| ID|     Name|gender|Salary|Department|  unique_id|
+---+---------+------+------+----------+-----------+
|  1|     Deva|  Male|  5000|     Sales|          0|
|  2|    Jugnu|Female|  6000|        HR|          1|
|  3|   Kavita|Female|  7500|        IT|          2|
|  4|   Vikram|  Male|  6500| Marketing|          3|
|  5|  Shabana|Female|  5500|   Finance| 8589934592|
|  6|Shantilal|  Male|  8000|     Sales| 8589934593|
|  7|    Vinod|  Male|  7200|        HR| 8589934594|
|  8|    Vimla|Female|  6600|        IT| 8589934595|
|  9|   Jasmin|Female|  5400| Marketing|17179869184|
| 10|   Lovely|Female|  6300|   Finance|17179869185|
| 11|    Mohan|  Male|  5700|     Sales|17179869186|
| 12|  Purvish|  Male|  7000|        HR|17179869187|
| 13|    Jinat|Female|  7100|        IT|25769803776|
| 14|      Eva|Female|  6800| Marketing|25769803777|
| 15| Jitendra|  Male|  5000|   Finance|25769803778|
| 15| Rajkumar|  Male|  4500|   Finance|257698