# What will be covered:
* PySpark DataFrame.
* Reading the dataset.
* Checking the data types of the columns (schema).
* Selecting columns and indexing.
* Summary statistics with `describe()`, similar to pandas.
* Adding columns.
* Dropping columns.
* Renaming columns.

In [1]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Dataframe").getOrCreate()

In [4]:
spark

In [8]:
# Load test1.csv, treating its first row as the column names
reading = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv')
)
reading.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



In [9]:
reading.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [10]:
# Same read as before, but asking Spark to infer column types from the data
reading = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv', inferSchema=True)
)
reading.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [13]:
# Read the CSV in a single call: `header=True` takes column names from the first
# row and `inferSchema=True` infers column types. The extra
# `.option('header', 'true')` was redundant because `header=True` sets the same option.
reading = spark.read.csv('test1.csv', header=True, inferSchema=True)
reading.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



In [14]:
type(reading)

pyspark.sql.dataframe.DataFrame

In [15]:
reading.columns

['Name', 'age', 'Experience']

In [17]:
reading.head(3)

[Row(Name='Krish', age=31, Experience=10),
 Row(Name='Samantha', age=30, Experience=8),
 Row(Name='Mosh', age=29, Experience=4)]

In [18]:
reading.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



In [19]:
reading.select("age")

DataFrame[age: int]

In [20]:
reading.select("age").show()

+---+
|age|
+---+
| 31|
| 30|
| 29|
+---+



In [21]:
reading.select(["age", "Name"]).show()

+---+--------+
|age|    Name|
+---+--------+
| 31|   Krish|
| 30|Samantha|
| 29|    Mosh|
+---+--------+



In [23]:
reading.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

In [25]:
reading.describe().show()

+-------+--------+----+-----------------+
|summary|    Name| age|       Experience|
+-------+--------+----+-----------------+
|  count|       3|   3|                3|
|   mean|    NULL|30.0|7.333333333333333|
| stddev|    NULL| 1.0|3.055050463303893|
|    min|   Krish|  29|                4|
|    max|Samantha|  31|               10|
+-------+--------+----+-----------------+



                                                                                

In [30]:
### Adding columns in DataFrame
rr = reading.withColumn("Experience after 2 year", reading['Experience']+2)

In [32]:
rr.show()

+--------+---+----------+-----------------------+
|    Name|age|Experience|Experience after 2 year|
+--------+---+----------+-----------------------+
|   Krish| 31|        10|                     12|
|Samantha| 30|         8|                     10|
|    Mosh| 29|         4|                      6|
+--------+---+----------+-----------------------+



In [37]:
### Dropping columns
rr_drop = rr.drop('Experience after 2 year')
rr_drop.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



In [None]:
### Rename the columns
reading.withColumnRenamed('Name','New Name').show()

+--------+---+----------+
|New Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



24/06/13 02:20:17 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 262762 ms exceeds timeout 120000 ms
24/06/13 02:20:18 WARN SparkContext: Killing executors is not supported by current scheduler.
24/06/13 02:20:20 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at o

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Test").getOrCreate()

In [4]:
## Creating a dataframe
df = spark.createDataFrame([('Silas', 10),('Matthew',11),('John',9)],['name','age'])
df.show()

                                                                                

+-------+---+
|   name|age|
+-------+---+
|  Silas| 10|
|Matthew| 11|
|   John|  9|
+-------+---+



# Adding to the DataFrame

### Creating DataFrame

In [6]:
from datetime import date, datetime
# Full timestamp first, then only the HH:MM:SS part.
print(datetime.now())

# `strftime` already returns a str, so no f-string is needed. The previous form
# reused double quotes inside the f-string, which is a SyntaxError before Python 3.12.
print(datetime.now().strftime("%H:%M:%S"))

2024-06-26 12:01:17.326852
12:01:17


In [43]:
from datetime import datetime, date, timedelta
import pandas as pd
from pyspark import Row
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("example").getOrCreate()
# Read the clock once so the three `d` values are exact offsets of the same
# instant; separate `datetime.now()` calls can straddle a second boundary.
now = datetime.now()
time_format = "%H:%M:%S"

# Plain `strftime` calls replace the f-strings: reusing double quotes inside an
# f-string is a SyntaxError before Python 3.12, and `strftime` already returns a str.
df = spark.createDataFrame([
    Row(a=1, b=2, c="Boy", d=now.strftime(time_format), e=date(2024, 11, 24)),
    Row(a=3, b=6, c="Bike", d=(now + timedelta(hours=5)).strftime(time_format), e=date(2023, 3, 24)),
    Row(a=2, b=8, c="Hen", d=(now - timedelta(minutes=30)).strftime(time_format), e=date(1998, 11, 19)),
])

In [33]:
df.show()
df.printSchema()

+---+---+----+--------+----------+
|  a|  b|   c|       d|         e|
+---+---+----+--------+----------+
|  1|  2| Boy|14:33:47|2024-11-24|
|  3|  6|Bike|19:33:47|2023-03-24|
|  2|  8| Boy|14:03:47|1998-11-19|
+---+---+----+--------+----------+

root
 |-- a: long (nullable = true)
 |-- b: long (nullable = true)
 |-- c: string (nullable = true)
 |-- d: string (nullable = true)
 |-- e: date (nullable = true)



In [39]:
df.collect()

[Row(a=1, b=2, c='Boy', d='14:33:47', e=datetime.date(2024, 11, 24)),
 Row(a=3, b=6, c='Bike', d='19:33:47', e=datetime.date(2023, 3, 24)),
 Row(a=2, b=8, c='Boy', d='14:03:47', e=datetime.date(1998, 11, 19))]

### Selecting and Accessing Data

In [40]:
df.a

Column<'a'>

In [44]:
from pyspark.sql import Column
from pyspark.sql.functions import upper

type(df.c) == type(upper(df.c)) == type(df.c.isNull())

True

In [45]:
df.select(df.c).show()

+----+
|   c|
+----+
| Boy|
|Bike|
| Hen|
+----+



#### Adding a Derived Column

In [47]:
# Assigning new column instance

df.withColumn("upper_c", upper(df.c)).show()

+---+---+----+--------+----------+-------+
|  a|  b|   c|       d|         e|upper_c|
+---+---+----+--------+----------+-------+
|  1|  2| Boy|14:35:30|2024-11-24|    BOY|
|  3|  6|Bike|19:35:30|2023-03-24|   BIKE|
|  2|  8| Hen|14:05:30|1998-11-19|    HEN|
+---+---+----+--------+----------+-------+



In [51]:
# Manipulating Numbers (Integers and Floats)
df.withColumn("Adding a and b", df.a+df.b).show()

+---+---+----+--------+----------+--------------+
|  a|  b|   c|       d|         e|Adding a and b|
+---+---+----+--------+----------+--------------+
|  1|  2| Boy|14:35:30|2024-11-24|             3|
|  3|  6|Bike|19:35:30|2023-03-24|             9|
|  2|  8| Hen|14:05:30|1998-11-19|            10|
+---+---+----+--------+----------+--------------+



In [54]:
# Manipulating dates: add a Python `timedelta` of 25 days to the date column `e`.
# The new column is displayed with a time part (e.g. `2024-12-19 00:00:00`),
# so it is no longer shown as a plain date like `e`.
# NOTE(review): `pyspark.sql.functions.date_add(df.e, 25)` presumably keeps a
# plain date result — confirm before switching.
df.withColumn("Adding more Date", df.e+timedelta(days=25)).show()

+---+---+----+--------+----------+-------------------+
|  a|  b|   c|       d|         e|   Adding more Date|
+---+---+----+--------+----------+-------------------+
|  1|  2| Boy|14:35:30|2024-11-24|2024-12-19 00:00:00|
|  3|  6|Bike|19:35:30|2023-03-24|2023-04-18 00:00:00|
|  2|  8| Hen|14:05:30|1998-11-19|1998-12-14 00:00:00|
+---+---+----+--------+----------+-------------------+



In [53]:
# Multiple Manipulations
df.withColumn("Adding more Date", df.e+timedelta(days=25)).withColumn("Adding a and b", df.a+df.b).show()

+---+---+----+--------+----------+-------------------+--------------+
|  a|  b|   c|       d|         e|   Adding more Date|Adding a and b|
+---+---+----+--------+----------+-------------------+--------------+
|  1|  2| Boy|14:35:30|2024-11-24|2024-12-19 00:00:00|             3|
|  3|  6|Bike|19:35:30|2023-03-24|2023-04-18 00:00:00|             9|
|  2|  8| Hen|14:05:30|1998-11-19|1998-12-14 00:00:00|            10|
+---+---+----+--------+----------+-------------------+--------------+



### Applying a Function

In [62]:
import pandas as pd
import pyspark
# NOTE(review): the wildcard import puts every public name of
# `pyspark.sql.functions` into the notebook namespace; some of them (e.g. `sum`,
# `min`, `max`) presumably shadow Python builtins — prefer explicit imports.
from pyspark.sql.functions import *


# List every name available in the functions module.
print(dir(pyspark.sql.functions))



In [None]:
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    """Return ``series`` with 1 added to every element.

    Declared as a pandas UDF with a ``'long'`` return type.

    Args:
        series: Input values as a pandas Series.

    Returns:
        A pandas Series holding ``series + 1``.
    """
    # Previously `series + 2`, which contradicted the function name and its comment.
    return series + 1

df.select(pandas_plus_one(df.a)).show()

In [74]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for the file I/O examples
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)


In [79]:
## Reading CSV file
df = spark.read.csv("test1.csv", inferSchema=True, header=True)
df.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



In [None]:
## Writing a CSV file
# Spark writes a directory of part files at this path, not a single file.
# `mode="overwrite"` keeps the cell re-runnable: with the default save mode the
# write fails once the path already exists.
df.write.csv("Write_csv.csv", mode="overwrite", header=True)

### Working with SQL

In [82]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for the SQL examples
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)

# Load the sample CSV with a header row and inferred column types
df = spark.read.csv("test1.csv", inferSchema=True, header=True)


In [None]:
df.createOrReplaceTempView("tableA")
spark.sql("SELECT COUNT(*) AS Total FROM tableA").show()

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("local").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/02 15:43:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.stop()

In [4]:
# Connecting to Spark Connect server
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

### Building a Pyspark Application

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Creating a SparkSession
spark = SparkSession.builder.appName("Testing PySpark Example").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/02 16:17:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 51438)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/socketserver.py", line 747, in __init__
    self.handle()
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/Library/Frameworks/Python.framew

In [2]:
# Sample records; the irregular spacing inside each name is intentional
# (it is the input for the space-cleaning example that follows).
sample_data = [
    {"name": "John    D.", "age": 30},
    {"name": "Alice   G.", "age": 25},
    {"name": "Bob  T.", "age": 35},
    {"name": "Eve   A.", "age": 28},
]

df = spark.createDataFrame(sample_data)

In [3]:
df.show()

                                                                                

+---+----------+
|age|      name|
+---+----------+
| 30|John    D.|
| 25|Alice   G.|
| 35|   Bob  T.|
| 28|  Eve   A.|
+---+----------+



In [None]:
from pyspark.sql.functions import col, regexp_replace

# Removing additional spaces in name
def remove_extra_spaces(df, column_name):
    # Remove extra spaces from the specified column
    df_transforom