In [1]:
from pyspark.sql import SparkSession

# Create the SparkSession used by the rest of the notebook, or reuse one that
# already exists in this kernel (getOrCreate). The master URL uses the
# spark:// scheme, i.e. it points at a standalone cluster master; the host
# name is specific to the environment this notebook was written in.
spark = (
    SparkSession.builder
    .appName("JupyterStandalo")
    .master("spark://8fa087ac675c:7077")
    # Executor sizing requested for this application.
    .config("spark.executor.instances", "3")
    .config("spark.executor.cores", "6")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/24 10:05:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Handle to the SparkContext that backs the session (the entry point for the
# lower-level RDD API). It is not referenced by the cells visible here.
sc = spark.sparkContext

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Sample rows, one tuple per student, in the same field order as the schema
# below. None marks a missing value (displayed as NULL by show()).
data2 = [
    ("Pulkit", 12, "CS32", 82, "Programming"),
    ("Ritika", 20, "CS32", 94, "Writing"),
    ("Atirikt", 4, "BB21", 78, None),
    ("Reshav", 18, None, 56, None),
]

# Explicit schema: column name, Spark type, and nullable=True for every field.
# (FloatType is imported above but not used by this schema.)
schema = StructType(
    [
        StructField("Name", StringType(), nullable=True),
        StructField("Roll Number", IntegerType(), nullable=True),
        StructField("Class ID", StringType(), nullable=True),
        StructField("Marks", IntegerType(), nullable=True),
        StructField("Extracurricular", StringType(), nullable=True),
    ]
)

# Build the DataFrame from the in-memory rows, using the `spark` session
# created in the first cell of the notebook.
df = spark.createDataFrame(data2, schema)

In [4]:
# Action: print the contents of the full DataFrame as a text table.
df.show()

                                                                                

+-------+-----------+--------+-----+---------------+
|   Name|Roll Number|Class ID|Marks|Extracurricular|
+-------+-----------+--------+-----+---------------+
| Pulkit|         12|    CS32|   82|    Programming|
| Ritika|         20|    CS32|   94|        Writing|
|Atirikt|          4|    BB21|   78|           NULL|
| Reshav|         18|    NULL|   56|           NULL|
+-------+-----------+--------+-----+---------------+



# 1: Select single or multiple columns

We can select one or more columns with the `select()` method by passing the name of each column we want.

In [5]:
# select() returns a new DataFrame holding only the named columns;
# the original `df` is not modified.
df1 = df.select("Name", "Marks", "Extracurricular")  # three columns
df2 = df.select("Name", "Marks")                     # two columns

In [6]:
# Display the three-column projection (Name, Marks, Extracurricular).
df1.show()

                                                                                

+-------+-----+---------------+
|   Name|Marks|Extracurricular|
+-------+-----+---------------+
| Pulkit|   82|    Programming|
| Ritika|   94|        Writing|
|Atirikt|   78|           NULL|
| Reshav|   56|           NULL|
+-------+-----+---------------+



In [7]:
# Display the two-column projection (Name, Marks).
df2.show()

+-------+-----+
|   Name|Marks|
+-------+-----+
| Pulkit|   82|
| Ritika|   94|
|Atirikt|   78|
| Reshav|   56|
+-------+-----+



In [8]:
# Mental model: each select() result is a new DataFrame (conceptually, a new RDD with a DataFrame built on top of it); the original df is left unchanged.

In [11]:
df.columns
# `columns` is neither a transformation nor an action: it is a plain attribute
# evaluated on the driver that returns the column names as a Python list.

['Name', 'Roll Number', 'Class ID', 'Marks', 'Extracurricular']

In [10]:
# select() also accepts a single Python list of column names instead of
# separate string arguments.
df.select(["Name", "Marks"]).show()

+-------+-----+
|   Name|Marks|
+-------+-----+
| Pulkit|   82|
| Ritika|   94|
|Atirikt|   78|
| Reshav|   56|
+-------+-----+



# 2. Select columns using indexing

Indexing provides an easy way of accessing columns inside a DataFrame. For a DataFrame with n columns, the indices run from 0 (the first column) to n-1 (the last column). `df.columns` returns the column names as a Python list, so we can index or slice that list and pass the result to `select()`.

In [12]:
# df.columns is a regular Python list, so it can be sliced: [:2] keeps the
# first two names ("Name", "Roll Number"), which are then passed to select().
df.select(df.columns[:2]).show()

+-------+-----------+
|   Name|Roll Number|
+-------+-----------+
| Pulkit|         12|
| Ritika|         20|
|Atirikt|          4|
| Reshav|         18|
+-------+-----------+



# 3. Select columns using attribute access (`df.Name`)


In [14]:
# Columns can be referenced as attributes of the DataFrame. This only works
# when the column name is a valid Python identifier, so a name containing a
# space such as "Roll Number" cannot be written this way.
df.select(df.Name, df.Marks).show()

+-------+-----+
|   Name|Marks|
+-------+-----+
| Pulkit|   82|
| Ritika|   94|
|Atirikt|   78|
| Reshav|   56|
+-------+-----+



# 4. Select columns using bracket notation (`df["Name"]`)

In [15]:
# Bracket notation looks a column up by its name string, so it also works
# for names that are not valid Python identifiers.
df.select(df["Name"], df["Marks"]).show()

+-------+-----+
|   Name|Marks|
+-------+-----+
| Pulkit|   82|
| Ritika|   94|
|Atirikt|   78|
| Reshav|   56|
+-------+-----+



# 5. Select columns using the `col()` function from the `pyspark.sql.functions` module

In [16]:
from pyspark.sql.functions import col

# col("<name>") builds a column reference from the name alone, without
# going through a particular DataFrame variable.
df.select(col("Name"), col("Marks")).show()

+-------+-----+
|   Name|Marks|
+-------+-----+
| Pulkit|   82|
| Ritika|   94|
|Atirikt|   78|
| Reshav|   56|
+-------+-----+



The `col()` function in PySpark is used to **refer to a column** in a DataFrame in a way that allows you to apply transformations or build expressions.

---

### 📌 Why use `col()`?

* It’s required when using **column expressions** in transformations like:

  * `select()`, `filter()`, `withColumn()`, etc.
* It gives **more flexibility** and is cleaner than using string column names directly.

---

### ✅ Basic Example

```python
from pyspark.sql.functions import col

df.select(col("name"), col("age") + 1).show()
```

* `col("age") + 1` adds 1 to each value in the `age` column.
* You can't write `"age" + 1` directly — `"age"` is a plain Python string there, so Python raises a `TypeError` before Spark is ever involved.

---

### 🆚 Without `col()` — works in some cases, but limited

```python
df.select("name", "age").show()       # okay
df.select("age" + 1).show()           # ❌ error
```

With `col()` you can write expressions:

```python
df.select(col("age") + 1).show()      # ✅ works
```

---

### 🔧 Used with other functions:

```python
from pyspark.sql.functions import upper, when

df.select(
    upper(col("name")).alias("NAME_UPPER"),
    when(col("age") >= 18, "Adult").otherwise("Minor").alias("status")
)
```

---

### 🧠 Summary

| Function             | Description                                  |
| -------------------- | -------------------------------------------- |
| `col("column_name")` | Refers to a DataFrame column for expressions |
| Used in              | `select()`, `filter()`, `withColumn()`, etc. |

It’s a **standard way to refer to and manipulate columns** safely and expressively in Spark DataFrame operations.

Related ways to refer to a column include `expr()` (which takes a SQL expression string) and `df["column"]` (bracket notation).


In [17]:
# Shut down the SparkSession at the end of the notebook.
spark.stop()