In [2]:
from pyspark.sql import SparkSession


spark = (
    SparkSession
    .builder
    .appName("spark introduction")
    .master("local[*]")
    .getOrCreate()
)


In [3]:
spark

In [4]:
emp_schema = "emp_id string, emp_name string, department string,role string ,salary string"


# Employee data (list of rows)
emp_data = [
    [101, "Alice", "HR", "Manager", 60000],
    [102, "Bob", "IT", "Developer", 75000],
    [103, "Charlie", "Finance", "Analyst", 65000],
    [104, "Diana", "IT", "Tester", 55000],
    [105, "Evan", "Sales", "Executive", 50000]
]

In [5]:
# check number of partition :
emp = spark.createDataFrame(data=emp_data,schema=emp_schema)

In [6]:
emp.show()

+------+--------+----------+---------+------+
|emp_id|emp_name|department|     role|salary|
+------+--------+----------+---------+------+
|   101|   Alice|        HR|  Manager| 60000|
|   102|     Bob|        IT|Developer| 75000|
|   103| Charlie|   Finance|  Analyst| 65000|
|   104|   Diana|        IT|   Tester| 55000|
|   105|    Evan|     Sales|Executive| 50000|
+------+--------+----------+---------+------+



In [7]:
emp.printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- role: string (nullable = true)
 |-- salary: string (nullable = true)



In [8]:
emp.schema

StructType([StructField('emp_id', StringType(), True), StructField('emp_name', StringType(), True), StructField('department', StringType(), True), StructField('role', StringType(), True), StructField('salary', StringType(), True)])

In [9]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

In [15]:
schema_string = "name string, age int"

schema_spark = StructType([StructField('emp_id', StringType(), True),StructField('age', IntegerType(), True)])

In [11]:
#columns and experession 
from pyspark.sql.functions import col,expr

col("name")
expr("age")

Column<'age'>

In [22]:
emp_filter = emp.select(col('emp_name'),expr("salary"),emp.emp_id)

In [23]:
emp_filter.show()

+--------+------+------+
|emp_name|salary|emp_id|
+--------+------+------+
|   Alice| 60000|   101|
|     Bob| 75000|   102|
| Charlie| 65000|   103|
|   Diana| 55000|   104|
|    Evan| 50000|   105|
+--------+------+------+



In [12]:
emp_casted = emp.select(expr("emp_name as eid"),emp.emp_name,expr("cast(salary as int) as sal"),emp.role)

In [13]:
emp_casted.show()

+-------+--------+-----+---------+
|    eid|emp_name|  sal|     role|
+-------+--------+-----+---------+
|  Alice|   Alice|60000|  Manager|
|    Bob|     Bob|75000|Developer|
|Charlie| Charlie|65000|  Analyst|
|  Diana|   Diana|55000|   Tester|
|   Evan|    Evan|50000|Executive|
+-------+--------+-----+---------+



In [15]:
# more transformation function

# adding columns 
# using literals/static values
# renaming columns 
# removing columns 
# filtering and limit for data frame 
# structure transformation withcolumn, withcolumnrenamed,lit


In [16]:
emp.printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- role: string (nullable = true)
 |-- salary: string (nullable = true)



In [17]:
from pyspark.sql.functions import col,cast

In [18]:
emp_casted = emp.select("emp_id","emp_name","role",col("salary").cast("double"))

In [19]:
emp_casted.schema

StructType([StructField('emp_id', StringType(), True), StructField('emp_name', StringType(), True), StructField('role', StringType(), True), StructField('salary', DoubleType(), True)])

In [22]:
# adding columns

emp_tax = emp_casted.withColumn("tax",col("salary")*0.2)

In [23]:
emp_tax.show()

+------+--------+---------+-------+-------+
|emp_id|emp_name|     role| salary|    tax|
+------+--------+---------+-------+-------+
|   101|   Alice|  Manager|60000.0|12000.0|
|   102|     Bob|Developer|75000.0|15000.0|
|   103| Charlie|  Analyst|65000.0|13000.0|
|   104|   Diana|   Tester|55000.0|11000.0|
|   105|    Evan|Executive|50000.0|10000.0|
+------+--------+---------+-------+-------+



In [25]:
# literals
from pyspark.sql.functions import lit 

# adding two columns 
emp_new_cols = emp_tax.withColumn("columnsOne",lit(1)).withColumn("columnTwo",lit("2"))

In [26]:
emp_new_cols.show()

+------+--------+---------+-------+-------+----------+---------+
|emp_id|emp_name|     role| salary|    tax|columnsOne|columnTwo|
+------+--------+---------+-------+-------+----------+---------+
|   101|   Alice|  Manager|60000.0|12000.0|         1|        2|
|   102|     Bob|Developer|75000.0|15000.0|         1|        2|
|   103| Charlie|  Analyst|65000.0|13000.0|         1|        2|
|   104|   Diana|   Tester|55000.0|11000.0|         1|        2|
|   105|    Evan|Executive|50000.0|10000.0|         1|        2|
+------+--------+---------+-------+-------+----------+---------+



In [27]:
# renaming column 
# alternate :expr , selectexpr
emp_1 = emp_new_cols.withColumnRenamed("emp_id","eid")

In [28]:
emp_1.show()

+---+--------+---------+-------+-------+----------+---------+
|eid|emp_name|     role| salary|    tax|columnsOne|columnTwo|
+---+--------+---------+-------+-------+----------+---------+
|101|   Alice|  Manager|60000.0|12000.0|         1|        2|
|102|     Bob|Developer|75000.0|15000.0|         1|        2|
|103| Charlie|  Analyst|65000.0|13000.0|         1|        2|
|104|   Diana|   Tester|55000.0|11000.0|         1|        2|
|105|    Evan|Executive|50000.0|10000.0|         1|        2|
+---+--------+---------+-------+-------+----------+---------+



In [30]:
# column with spaces: not used in production
emp_2 = emp_new_cols.withColumnRenamed("columnsOne","column one")

emp_2.show()

+------+--------+---------+-------+-------+----------+---------+
|emp_id|emp_name|     role| salary|    tax|column one|columnTwo|
+------+--------+---------+-------+-------+----------+---------+
|   101|   Alice|  Manager|60000.0|12000.0|         1|        2|
|   102|     Bob|Developer|75000.0|15000.0|         1|        2|
|   103| Charlie|  Analyst|65000.0|13000.0|         1|        2|
|   104|   Diana|   Tester|55000.0|11000.0|         1|        2|
|   105|    Evan|Executive|50000.0|10000.0|         1|        2|
+------+--------+---------+-------+-------+----------+---------+



In [33]:
# remove columns 

emp_dropped = emp_new_cols.drop("columnTwo")

In [34]:
emp_dropped.show()

+------+--------+---------+-------+-------+----------+
|emp_id|emp_name|     role| salary|    tax|columnsOne|
+------+--------+---------+-------+-------+----------+
|   101|   Alice|  Manager|60000.0|12000.0|         1|
|   102|     Bob|Developer|75000.0|15000.0|         1|
|   103| Charlie|  Analyst|65000.0|13000.0|         1|
|   104|   Diana|   Tester|55000.0|11000.0|         1|
|   105|    Evan|Executive|50000.0|10000.0|         1|
+------+--------+---------+-------+-------+----------+



In [37]:
# filter data 

emp_.where("tax > 10000").show()

+------+--------+---------+-------+-------+----------+
|emp_id|emp_name|     role| salary|    tax|columnsOne|
+------+--------+---------+-------+-------+----------+
|   101|   Alice|  Manager|60000.0|12000.0|         1|
|   102|     Bob|Developer|75000.0|15000.0|         1|
|   103| Charlie|  Analyst|65000.0|13000.0|         1|
|   104|   Diana|   Tester|55000.0|11000.0|         1|
+------+--------+---------+-------+-------+----------+



In [39]:
# limit data 
emp_limit = emp_new_cols.limit(2)

In [40]:
emp_limit.show()

+------+--------+---------+-------+-------+----------+---------+
|emp_id|emp_name|     role| salary|    tax|columnsOne|columnTwo|
+------+--------+---------+-------+-------+----------+---------+
|   101|   Alice|  Manager|60000.0|12000.0|         1|        2|
|   102|     Bob|Developer|75000.0|15000.0|         1|        2|
+------+--------+---------+-------+-------+----------+---------+



In [41]:
# add multiple columns
# using dict 
columns = {
   "tax" : col("salary")*0.2,
    "one" : lit(1),
    "two" : lit(2)

}

emp_final = emp.withColumns(columns)

In [42]:
emp_final.show()

+------+--------+----------+---------+------+-------+---+---+
|emp_id|emp_name|department|     role|salary|    tax|one|two|
+------+--------+----------+---------+------+-------+---+---+
|   101|   Alice|        HR|  Manager| 60000|12000.0|  1|  2|
|   102|     Bob|        IT|Developer| 75000|15000.0|  1|  2|
|   103| Charlie|   Finance|  Analyst| 65000|13000.0|  1|  2|
|   104|   Diana|        IT|   Tester| 55000|11000.0|  1|  2|
|   105|    Evan|     Sales|Executive| 50000|10000.0|  1|  2|
+------+--------+----------+---------+------+-------+---+---+

