In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) a session against the standalone Spark master, setting
# executor instances=3, cores=6 and memory=2g for this notebook.
# NOTE(review): the df1.show() output further down logs executors exiting with
# code 137 -- confirm these resource settings fit the worker containers.
spark = (
    SparkSession.builder
    .appName("JupyterStandalo")
    .master("spark://8fa087ac675c:7077")
    .config("spark.executor.instances", "3")
    .config("spark.executor.cores", "6")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/24 11:23:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### ✅ `withColumn()` in Spark

`withColumn()` is a method in PySpark DataFrame used to:

* **Add a new column**, or
* **Replace an existing column**

by applying an **expression or transformation**.

---

### 📌 Syntax:

```python
df.withColumn("new_column_name", expression)
```

---

### 🔧 Examples

#### 1. **Add a new column by transforming an existing one**

```python
from pyspark.sql.functions import col

df.withColumn("age_plus_1", col("age") + 1).show()
```

#### 2. **Replace an existing column**

If the column already exists, it will be **overwritten**:

```python
df = df.withColumn("age", col("age") + 10)
```

---

### 🔁 Chaining `withColumn()` multiple times

```python
df = df.withColumn("age_plus_5", col("age") + 5) \
       .withColumn("is_adult", col("age") >= 18)
```

---

### ⚠️ Common mistakes

| Mistake                   | Fix                                    |
| ------------------------- | -------------------------------------- |
| Using `df.col("age")`     | Use `col("age")` or `df["age"]`        |
| Expecting in-place update | `withColumn()` returns a new DataFrame |

---

### 📚 Use Cases

* Creating flags or categories with `when()`, `otherwise()`
* Data normalization/scaling
* Type conversion
* Creating columns from other columns

Example with `when()`:

```python
from pyspark.sql.functions import when

df.withColumn("status", when(col("age") >= 18, "Adult").otherwise("Minor")).show()
```

---

Topics to explore next: examples with `UDF`s, conditionals, and string operations.


In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
# Sample student records: (name, roll number, class id, marks, extracurricular).
# None stands for a missing value.
data2 = [
    ("Pulkit", 12, "CS32", 82, "Programming"),
    ("Ritika", 20, "CS32", 94, "Writing"),
    ("Atirikt", 4, "BB21", 78, None),
    ("Reshav", 18, None, 56, None),
]

# Explicit schema; every column is declared nullable.
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Roll Number", IntegerType(), True)
    .add("Class ID", StringType(), True)
    .add("Marks", IntegerType(), True)
    .add("Extracurricular", StringType(), True)
)

# Build the DataFrame from the in-memory rows using the existing `spark` session.
df = spark.createDataFrame(data2, schema)

In [3]:
# Show the DataFrame built above to confirm its rows and column names.
df.show()

                                                                                

+-------+-----------+--------+-----+---------------+
|   Name|Roll Number|Class ID|Marks|Extracurricular|
+-------+-----------+--------+-----+---------------+
| Pulkit|         12|    CS32|   82|    Programming|
| Ritika|         20|    CS32|   94|        Writing|
|Atirikt|          4|    BB21|   78|           NULL|
| Reshav|         18|    NULL|   56|           NULL|
+-------+-----------+--------+-----+---------------+



# `withColumn()` leaves all the other columns unchanged

It checks the column name you pass in: if that column already exists it is modified, otherwise a new column is added.

Note: a new DataFrame is returned.

To update the same variable, reassign it: `df = df.withColumn(...)`.

In [4]:
from pyspark.sql.functions import col, upper

# "Class ID" already exists in df, so withColumn replaces its values (here with
# the upper-cased Name) instead of appending a column; df itself is unchanged.
df1 = df.withColumn("Class ID", upper(col("Name")))

In [5]:
# Display the result: the Class ID column now holds the upper-cased names.
df1.show()

25/06/24 11:29:19 ERROR TaskSchedulerImpl: Lost executor 1 on 172.20.0.3: Command exited with code 137
25/06/24 11:29:19 WARN TaskSetManager: Lost task 3.0 in stage 4.0 (TID 22) (172.20.0.3 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Command exited with code 137
25/06/24 11:29:21 ERROR TaskSchedulerImpl: Lost executor 2 on 172.20.0.2: Command exited with code 137
25/06/24 11:29:21 WARN TaskSetManager: Lost task 3.1 in stage 4.0 (TID 23) (172.20.0.2 executor 2): ExecutorLostFailure (executor 2 exited caused by one of the running tasks) Reason: Command exited with code 137
                                                                                

+-------+-----------+--------+-----+---------------+
|   Name|Roll Number|Class ID|Marks|Extracurricular|
+-------+-----------+--------+-----+---------------+
| Pulkit|         12|  PULKIT|   82|    Programming|
| Ritika|         20|  RITIKA|   94|        Writing|
|Atirikt|          4| ATIRIKT|   78|           NULL|
| Reshav|         18|  RESHAV|   56|           NULL|
+-------+-----------+--------+-----+---------------+



25/06/24 11:29:50 WARN TransportChannelHandler: Exception in connection from /172.20.0.4:37288
java.net.SocketException: Connection reset
	at java.base/sun.nio.ch.SocketChannelImpl.throwConnectionReset(SocketChannelImpl.java:394)
	at java.base/sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:426)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:255)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:356)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:796)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:732)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:658)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562)
	at io.netty.util.concurrent.SingleThreadEventExe

In [6]:
# when() can also be used inside withColumn for CASE-style conditional logic, like this:

In [8]:
from pyspark.sql.functions import when, col

# Add a "status" column: "Good" when Marks is at least 80, otherwise "Bad".
# The result is only displayed; df is not reassigned.
(
    df.withColumn(
        "status",
        when(col("Marks") >= 80, "Good").otherwise("Bad"),
    )
    .show()
)


                                                                                

+-------+-----------+--------+-----+---------------+------+
|   Name|Roll Number|Class ID|Marks|Extracurricular|status|
+-------+-----------+--------+-----+---------------+------+
| Pulkit|         12|    CS32|   82|    Programming|  Good|
| Ritika|         20|    CS32|   94|        Writing|  Good|
|Atirikt|          4|    BB21|   78|           NULL|   Bad|
| Reshav|         18|    NULL|   56|           NULL|   Bad|
+-------+-----------+--------+-----+---------------+------+



In [9]:
# For more complex logic than when()/otherwise() can express, we need to write a UDF.

In [10]:
# Shut down the Spark session now that the walkthrough is finished.
spark.stop()