# SELECT VS WITHCOLUMN

## Raw dataframe

In [1]:
import findspark
# Locate the local Spark installation and make pyspark importable; this has
# to run before the pyspark import in the next cell.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# Reuse the already-running session if there is one, otherwise start a new
# one with default settings; `spark` is the entry point used by later cells.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/29 20:36:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Exam results for five students across five subjects (25 rows in total).
# Each flattened row is:
#   (first_name, last_name, subject, exam_1 .. exam_5, approved, attendance)
# The rows are grouped per student here and flattened by the comprehension,
# so the student's name is written only once.
data_students = [
    (first_name, last_name, *results)
    for (first_name, last_name), per_subject in [
        (("Alice", "Johnson"), [
            ("Math", 85, 72, 65, 88, 94, True, 95),
            ("English", 78, 84, 57, 91, 89, True, 88),
            ("History", 92, 76, 82, 93, 80, True, 97),
            ("Science", 67, 70, 85, 66, 73, True, 80),
            ("Art", 55, 59, 47, 60, 62, False, 70),
        ]),
        (("Bob", "Smith"), [
            ("Math", 72, 66, 81, 75, 83, True, 85),
            ("English", 81, 92, 74, 68, 77, True, 90),
            ("History", 60, 54, 69, 61, 65, True, 75),
            ("Science", 50, 48, 56, 53, 60, False, 65),
            ("Art", 58, 61, 63, 57, 55, True, 77),
        ]),
        (("Charlie", "Wood"), [
            ("Math", 93, 88, 79, 91, 85, True, 98),
            ("English", 89, 83, 87, 92, 90, True, 92),
            ("History", 74, 68, 77, 72, 70, True, 84),
            ("Science", 88, 91, 85, 87, 89, True, 91),
            ("Art", 79, 74, 81, 76, 78, True, 87),
        ]),
        (("Diane", "Brown"), [
            ("Math", 61, 55, 64, 58, 62, True, 82),
            ("English", 57, 49, 53, 61, 50, False, 68),
            ("History", 69, 63, 71, 66, 67, True, 80),
            ("Science", 73, 78, 70, 74, 72, True, 85),
            ("Art", 62, 58, 65, 60, 63, True, 76),
        ]),
        (("Eve", "Davis"), [
            ("Math", 54, 47, 52, 56, 50, False, 66),
            ("English", 48, 42, 45, 49, 44, False, 58),
            ("History", 63, 68, 61, 66, 62, True, 77),
            ("Science", 70, 73, 75, 69, 71, True, 83),
            ("Art", 66, 64, 69, 61, 67, True, 81),
        ]),
    ]
    for results in per_subject
]

# DDL-style schema string, one "<column> <TYPE>" entry per tuple position.
# NOTE(review): "suject" looks like a typo for "subject"; the spelling is kept
# because it is the column name shown in this notebook's displayed output.
schema_students = ", ".join(
    f"{column} {spark_type}"
    for column, spark_type in [
        ("first_name", "STRING"),
        ("last_name", "STRING"),
        ("suject", "STRING"),
        ("exam_1", "INTEGER"),
        ("exam_2", "INTEGER"),
        ("exam_3", "INTEGER"),
        ("exam_4", "INTEGER"),
        ("exam_5", "INTEGER"),
        ("approved", "BOOLEAN"),
        ("attendance", "INTEGER"),
    ]
)

In [4]:
# Raw DataFrame: the in-memory rows typed by the DDL schema string.
df_students = spark.createDataFrame(data_students, schema_students)

# truncate=False prints every cell value in full.
df_students.show(truncate=False)

                                                                                

+----------+---------+-------+------+------+------+------+------+--------+----------+
|first_name|last_name|suject |exam_1|exam_2|exam_3|exam_4|exam_5|approved|attendance|
+----------+---------+-------+------+------+------+------+------+--------+----------+
|Alice     |Johnson  |Math   |85    |72    |65    |88    |94    |true    |95        |
|Alice     |Johnson  |English|78    |84    |57    |91    |89    |true    |88        |
|Alice     |Johnson  |History|92    |76    |82    |93    |80    |true    |97        |
|Alice     |Johnson  |Science|67    |70    |85    |66    |73    |true    |80        |
|Alice     |Johnson  |Art    |55    |59    |47    |60    |62    |false   |70        |
|Bob       |Smith    |Math   |72    |66    |81    |75    |83    |true    |85        |
|Bob       |Smith    |English|81    |92    |74    |68    |77    |true    |90        |
|Bob       |Smith    |History|60    |54    |69    |61    |65    |true    |75        |
|Bob       |Smith    |Science|50    |48    |56    |53 

## Add new columns with multiple chained `withColumn` calls

In [5]:
from pyspark.sql.functions import col, lit, concat, current_date, when, round

In [6]:
# Variant 1: add the four derived columns with one chained expression.
# Each withColumn call returns a new DataFrame; df_students is not modified.
df_multi_with = (
    df_students
    .withColumn(
        "full_name",
        concat(col("first_name"), lit(" "), col("last_name")),
    )
    .withColumn(
        "average",
        # Mean of the five exam scores, rounded to two decimals.
        round(
            (col("exam_1") + col("exam_2") + col("exam_3") + col("exam_4") + col("exam_5")) / 5,
            2,
        ),
    )
    .withColumn(
        "final_exam_day",
        current_date() + 30,  # 30 days after the day the cell is run
    )
    .withColumn(
        "extra_point",
        # True only when attendance is strictly above 80.
        when(col("attendance") > 80, True).otherwise(False),
    )
)
df_multi_with.show()

                                                                                

+----------+---------+-------+------+------+------+------+------+--------+----------+-------------+-------+--------------+-----------+
|first_name|last_name| suject|exam_1|exam_2|exam_3|exam_4|exam_5|approved|attendance|    full_name|average|final_exam_day|extra_point|
+----------+---------+-------+------+------+------+------+------+--------+----------+-------------+-------+--------------+-----------+
|     Alice|  Johnson|   Math|    85|    72|    65|    88|    94|    true|        95|Alice Johnson|   80.8|    2024-09-28|       true|
|     Alice|  Johnson|English|    78|    84|    57|    91|    89|    true|        88|Alice Johnson|   79.8|    2024-09-28|       true|
|     Alice|  Johnson|History|    92|    76|    82|    93|    80|    true|        97|Alice Johnson|   84.6|    2024-09-28|       true|
|     Alice|  Johnson|Science|    67|    70|    85|    66|    73|    true|        80|Alice Johnson|   72.2|    2024-09-28|      false|
|     Alice|  Johnson|    Art|    55|    59|    47|    

                                                                                

## Add new columns by reassigning the DataFrame after each `withColumn`

In [7]:
# Variant 2: same four columns, but one statement per column, rebinding
# df_multi_df to the DataFrame returned by each withColumn call.

# Step 1 - "First Last" display name.
df_multi_df = df_students.withColumn(
    "full_name",
    concat(col("first_name"), lit(" "), col("last_name")),
)

# Step 2 - mean of the five exam scores, rounded to two decimals.
df_multi_df = df_multi_df.withColumn(
    "average",
    round(
        (col("exam_1") + col("exam_2") + col("exam_3") + col("exam_4") + col("exam_5")) / 5,
        2,
    ),
)

# Step 3 - date 30 days after the day the cell is run.
df_multi_df = df_multi_df.withColumn(
    "final_exam_day",
    current_date() + 30,
)

# Step 4 - flag set only when attendance is strictly above 80.
df_multi_df = df_multi_df.withColumn(
    "extra_point",
    when(col("attendance") > 80, True).otherwise(False),
)

df_multi_df.show()

                                                                                

+----------+---------+-------+------+------+------+------+------+--------+----------+-------------+-------+--------------+-----------+
|first_name|last_name| suject|exam_1|exam_2|exam_3|exam_4|exam_5|approved|attendance|    full_name|average|final_exam_day|extra_point|
+----------+---------+-------+------+------+------+------+------+--------+----------+-------------+-------+--------------+-----------+
|     Alice|  Johnson|   Math|    85|    72|    65|    88|    94|    true|        95|Alice Johnson|   80.8|    2024-09-28|       true|
|     Alice|  Johnson|English|    78|    84|    57|    91|    89|    true|        88|Alice Johnson|   79.8|    2024-09-28|       true|
|     Alice|  Johnson|History|    92|    76|    82|    93|    80|    true|        97|Alice Johnson|   84.6|    2024-09-28|       true|
|     Alice|  Johnson|Science|    67|    70|    85|    66|    73|    true|        80|Alice Johnson|   72.2|    2024-09-28|      false|
|     Alice|  Johnson|    Art|    55|    59|    47|    

## Select

In [8]:
# Variant 3: a single select that keeps every original column ("*") and
# appends the four derived columns, each named through alias().
df_select = df_students.select(
    "*",
    # "First Last" display name.
    concat(
        col("first_name"),
        lit(" "),
        col("last_name"),
    ).alias("full_name"),
    # Mean of the five exam scores, rounded to two decimals.
    round(
        (col("exam_1") + col("exam_2") + col("exam_3") + col("exam_4") + col("exam_5")) / 5,
        2,
    ).alias("average"),
    # Date 30 days after the day the cell is run.
    (current_date() + 30).alias("final_exam_day"),
    # True only when attendance is strictly above 80.
    when(col("attendance") > 80, True)
    .otherwise(False)
    .alias("extra_point"),
)
df_select.show()

+----------+---------+-------+------+------+------+------+------+--------+----------+-------------+-------+--------------+-----------+
|first_name|last_name| suject|exam_1|exam_2|exam_3|exam_4|exam_5|approved|attendance|    full_name|average|final_exam_day|extra_point|
+----------+---------+-------+------+------+------+------+------+--------+----------+-------------+-------+--------------+-----------+
|     Alice|  Johnson|   Math|    85|    72|    65|    88|    94|    true|        95|Alice Johnson|   80.8|    2024-09-28|       true|
|     Alice|  Johnson|English|    78|    84|    57|    91|    89|    true|        88|Alice Johnson|   79.8|    2024-09-28|       true|
|     Alice|  Johnson|History|    92|    76|    82|    93|    80|    true|        97|Alice Johnson|   84.6|    2024-09-28|       true|
|     Alice|  Johnson|Science|    67|    70|    85|    66|    73|    true|        80|Alice Johnson|   72.2|    2024-09-28|      false|
|     Alice|  Johnson|    Art|    55|    59|    47|    