`select()` is used to select a single column, multiple columns, columns by index, all columns from a list, and nested columns from a DataFrame. PySpark `select()` is a transformation, so it returns a new DataFrame containing only the selected columns.

In [1]:
import os

from pyspark.sql import SparkSession
# Build the Spark session for this notebook under the application name "SP"
# (getOrCreate reuses a session if one is already active).
spark = (
    SparkSession.builder
    .appName("SP")
    .getOrCreate()
)

In [2]:
# Read the daily website visitors CSV file.
# The location can be overridden with the VISITORS_CSV environment variable so
# the notebook is not tied to a single machine; when the variable is unset the
# original local path is used.
file_path = os.environ.get(
    "VISITORS_CSV",
    "C:\\Users\\pcc\\Desktop\\daily-website-visitors.csv",
)
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Replace the dots in the column names with underscores.
df = df.withColumnsRenamed({
    "Day.Of.Week": "Day_Of_Week",
    "Page.Loads": "Page_Loads",
    "Unique.Visits": "Unique_Visits",
    "First.Time.Visits": "First_Time_Visits",
    "Returning.Visits": "Returning_Visits",
})

df.show()

+---+---------+-----------+----------+----------+-------------+-----------------+----------------+
|Row|      Day|Day_Of_Week|      Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+---+---------+-----------+----------+----------+-------------+-----------------+----------------+
|  1|   Sunday|          1| 9/14/2014|      2146|         1582|             1430|             152|
|  2|   Monday|          2| 9/15/2014|      3621|         2528|             2297|             231|
|  3|  Tuesday|          3| 9/16/2014|      3698|         2630|             2352|             278|
|  4|Wednesday|          4| 9/17/2014|      3667|         2614|             2327|             287|
|  5| Thursday|          5| 9/18/2014|      3316|         2366|             2130|             236|
|  6|   Friday|          6| 9/19/2014|      2815|         1863|             1622|             241|
|  7| Saturday|          7| 9/20/2014|      1658|         1118|              985|             133|
|  8|   Su

# 1. Select Single & Multiple Columns from a PySpark DataFrame

In [3]:
# Pick one column by passing its name as a string; display the first 5 rows
df.select("Page_Loads").show(n=5)

+----------+
|Page_Loads|
+----------+
|      2146|
|      3621|
|      3698|
|      3667|
|      3316|
+----------+
only showing top 5 rows



In [4]:
# Pick several columns using attribute access on the DataFrame
df.select(df.Day, df.Page_Loads).show(n=5)

+---------+----------+
|      Day|Page_Loads|
+---------+----------+
|   Sunday|      2146|
|   Monday|      3621|
|  Tuesday|      3698|
|Wednesday|      3667|
| Thursday|      3316|
+---------+----------+
only showing top 5 rows



In [5]:
# Pick several columns using bracket (item) access on the DataFrame
df.select(df["Date"], df["Page_Loads"]).show(n=5)

+---------+----------+
|     Date|Page_Loads|
+---------+----------+
|9/14/2014|      2146|
|9/15/2014|      3621|
|9/16/2014|      3698|
|9/17/2014|      3667|
|9/18/2014|      3316|
+---------+----------+
only showing top 5 rows



In [7]:
# Build the column references with the col() function instead of going through df
from pyspark.sql.functions import col

df.select(col("Page_Loads"), col("Day")).show(n=5)

+----------+---------+
|Page_Loads|      Day|
+----------+---------+
|      2146|   Sunday|
|      3621|   Monday|
|      3698|  Tuesday|
|      3667|Wednesday|
|      3316| Thursday|
+----------+---------+
only showing top 5 rows



In [8]:
# Select columns by regular expression: every column whose name ends in "Visits".
# The previous pattern ended in `Visits*`, where `*` applied only to the final "s"
# (so "Visit" or "Visitss" would also match); anchoring with `$` states the
# intended suffix match explicitly.
df.select(df.colRegex("`^.*Visits$`")).show()

+-------------+-----------------+----------------+
|Unique_Visits|First_Time_Visits|Returning_Visits|
+-------------+-----------------+----------------+
|         1582|             1430|             152|
|         2528|             2297|             231|
|         2630|             2352|             278|
|         2614|             2327|             287|
|         2366|             2130|             236|
|         1863|             1622|             241|
|         1118|              985|             133|
|         1656|             1481|             175|
|         2586|             2312|             274|
|         3257|             2989|             268|
|         3175|             2891|             284|
|         3029|             2743|             286|
|         2249|             2033|             216|
|         1180|             1040|             140|
|         1806|             1613|             193|
|         2873|             2577|             296|
|         3032|             272

# 2. Select All Columns From List

In [13]:
# Select all columns from a list: df.columns is already a list of column names,
# so it can be passed to select() as-is. (The earlier identity comprehension was
# redundant, and its loop variable reused the name of the imported col() function.)
df.select(df.columns).show(5)
# "*" is the wildcard form that also selects every column
df.select("*").show(1)

+---+---------+-----------+---------+----------+-------------+-----------------+----------------+
|Row|      Day|Day_Of_Week|     Date|Page_Loads|Unique_Visits|First_Time_Visits|Returning_Visits|
+---+---------+-----------+---------+----------+-------------+-----------------+----------------+
|  1|   Sunday|          1|9/14/2014|      2146|         1582|             1430|             152|
|  2|   Monday|          2|9/15/2014|      3621|         2528|             2297|             231|
|  3|  Tuesday|          3|9/16/2014|      3698|         2630|             2352|             278|
|  4|Wednesday|          4|9/17/2014|      3667|         2614|             2327|             287|
|  5| Thursday|          5|9/18/2014|      3316|         2366|             2130|             236|
+---+---------+-----------+---------+----------+-------------+-----------------+----------------+
only showing top 5 rows

+---+------+-----------+---------+----------+-------------+-----------------+----------------

# 3. Select Columns by Index

In [14]:
# Slice the column-name list to take the first 3 columns, then show the top 3 rows
df.select(*df.columns[:3]).show(n=3)

+---+-------+-----------+
|Row|    Day|Day_Of_Week|
+---+-------+-----------+
|  1| Sunday|          1|
|  2| Monday|          2|
|  3|Tuesday|          3|
+---+-------+-----------+
only showing top 3 rows



In [16]:
# The slice [2:4] takes the columns at zero-based positions 2 and 3
# (the third and fourth columns; the end index is exclusive), then shows the top 3 rows
df.select(*df.columns[2:4]).show(n=3)

+-----------+---------+
|Day_Of_Week|     Date|
+-----------+---------+
|          1|9/14/2014|
|          2|9/15/2014|
|          3|9/16/2014|
+-----------+---------+
only showing top 3 rows



In [19]:
# Select Nested Struct Columns from PySpark

from pyspark.sql.types import StringType, StructField, StructType

# Each record is ((firstname, middlename, lastname), state, gender)
data = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M"),
]

# "name" is a struct of three nullable string fields;
# "state" and "gender" are plain nullable strings
schema = StructType(
    [
        StructField(
            'name',
            StructType(
                [
                    StructField('firstname', StringType(), True),
                    StructField('middlename', StringType(), True),
                    StructField('lastname', StringType(), True),
                ]
            ),
        ),
        StructField('state', StringType(), True),
        StructField('gender', StringType(), True),
    ]
)

# Build the DataFrame from the records and print its nested schema
df2 = spark.createDataFrame(data, schema)
df2.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)



In [21]:
# Selecting the struct column by its name keeps it as one nested struct column
df2.select("name")

DataFrame[name: struct<firstname:string,middlename:string,lastname:string>]

In [22]:
# Dot notation reaches individual fields of the struct; the resulting columns
# are named after the fields themselves (firstname, lastname)
df2.select("name.firstname","name.lastname")

DataFrame[firstname: string, lastname: string]

In [23]:
# "name.*" expands every field of the struct into its own top-level column
df2.select("name.*")

DataFrame[firstname: string, middlename: string, lastname: string]