# What will be covered:
* PySpark DataFrame.
* Reading The Dataset.
* Checking the data types of the columns (schema).
* Selecting Columns and Indexing.
* Summary statistics with `describe()` (similar to pandas).
* Adding Columns.
* Dropping columns.
* Renaming columns.

In [1]:
from pyspark.sql import SparkSession

In [3]:
# Build the SparkSession used by every cell below; the application is
# registered under the name "Dataframe".
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [4]:
# Bare expression: lets the notebook display the session object.
spark

In [8]:
# Reading the dataset
# The 'header' option makes the reader use the first line of the file as
# column names. No schema inference is requested here, so every column is
# loaded as a string (see the printSchema() output further down).
reading = spark.read.option('header', 'true').csv('test1.csv')
reading.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



In [9]:
# Print each column's name, data type and nullability as a tree.
reading.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [10]:
# Re-read the file with inferSchema=True so numeric columns are typed as
# integers instead of the default string type.
reading = spark.read.option('header', 'true').csv('test1.csv', inferSchema=True)
reading.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [13]:
# Read the CSV using keyword arguments only: header=True takes the column
# names from the first line, inferSchema=True derives the column types from
# the data. A separate .option('header', 'true') call would be redundant.
reading = spark.read.csv('test1.csv', header=True, inferSchema=True)
reading.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



In [14]:
# Confirm that the reader returned a PySpark DataFrame.
type(reading)

pyspark.sql.dataframe.DataFrame

In [15]:
# List of column names, in order.
reading.columns

['Name', 'age', 'Experience']

In [17]:
# Fetch the first 3 rows as a list of Row objects.
reading.head(3)

[Row(Name='Krish', age=31, Experience=10),
 Row(Name='Samantha', age=30, Experience=8),
 Row(Name='Mosh', age=29, Experience=4)]

In [18]:
# Print the rows as a formatted text table.
reading.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



In [19]:
# select() returns a new DataFrame holding only the requested column; it does
# not print any rows by itself (use .show() for that).
reading.select("age")

DataFrame[age: int]

In [20]:
# Select a single column and print its rows.
reading.select("age").show()

+---+
|age|
+---+
| 31|
| 30|
| 29|
+---+



In [21]:
# Select several columns at once; the output follows the order given here,
# not the order in the source DataFrame.
reading.select("age", "Name").show()

+---+--------+
|age|    Name|
+---+--------+
| 31|   Krish|
| 30|Samantha|
| 29|    Mosh|
+---+--------+



In [23]:
# List of (column name, type name) tuples.
reading.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

In [25]:
# describe() builds a summary DataFrame (count, mean, stddev, min, max) for
# the columns; show() prints it.
reading.describe().show()

+-------+--------+----+-----------------+
|summary|    Name| age|       Experience|
+-------+--------+----+-----------------+
|  count|       3|   3|                3|
|   mean|    NULL|30.0|7.333333333333333|
| stddev|    NULL| 1.0|3.055050463303893|
|    min|   Krish|  29|                4|
|    max|Samantha|  31|               10|
+-------+--------+----+-----------------+



                                                                                

In [30]:
### Adding columns in DataFrame
# withColumn() returns a new DataFrame with the extra column; `reading`
# itself is not modified, so the result is bound to a new name.
rr = reading.withColumn(
    "Experience after 2 year",
    reading['Experience'] + 2,
)

In [32]:
# Print the DataFrame that includes the newly added column.
rr.show()

+--------+---+----------+-----------------------+
|    Name|age|Experience|Experience after 2 year|
+--------+---+----------+-----------------------+
|   Krish| 31|        10|                     12|
|Samantha| 30|         8|                     10|
|    Mosh| 29|         4|                      6|
+--------+---+----------+-----------------------+



In [37]:
### Dropping columns
# drop() returns a new DataFrame without the named column; `rr` keeps it.
rr_drop = rr.drop('Experience after 2 year')
rr_drop.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



In [None]:
### Rename the columns
# withColumnRenamed(existing, new) returns a new DataFrame with the column
# renamed. The result is only displayed here, not assigned, so `reading`
# still has a column called 'Name'.
reading.withColumnRenamed('Name','New Name').show()

+--------+---+----------+
|New Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Samantha| 30|         8|
|    Mosh| 29|         4|
+--------+---+----------+



24/06/13 02:20:17 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 262762 ms exceeds timeout 120000 ms
24/06/13 02:20:18 WARN SparkContext: Killing executors is not supported by current scheduler.
24/06/13 02:20:20 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at o