Importing PySpark

In [1]:
import pyspark

Initializing Session

In [6]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one
# registered under the application name "Dataframe".
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [7]:
# Bare expression: shows the session object as this cell's result.
spark

Loading Data

In [61]:
# Read the CSV, taking column names from the first row and letting Spark
# infer each column's type from the data.
df = spark.read.csv("../data/test1.csv", header=True, inferSchema=True)

About the Data

In [62]:
# Print the column names, inferred types and nullability as a tree.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [63]:
# Column names as a plain Python list.
df.columns

['Name', 'Age', 'Experience']

In [64]:
# First three rows, returned to the driver as a list of Row objects.
df.head(3)

[Row(Name='Priyanshu Joshi', Age=45, Experience=15),
 Row(Name='Ayush Nagar', Age=30, Experience=5),
 Row(Name='Harsh Kumar Shakya', Age=28, Experience=11)]

In [65]:
# Print the rows as a formatted text table.
df.show()

+-------------------+---+----------+
|               Name|Age|Experience|
+-------------------+---+----------+
|    Priyanshu Joshi| 45|        15|
|        Ayush Nagar| 30|         5|
| Harsh Kumar Shakya| 28|        11|
|    Krishna Agrawal| 32|         2|
|Asmit Singh Mandoor| 42|        16|
+-------------------+---+----------+



In [66]:
# Project a single column and print it.
df.select("Name").show()

+-------------------+
|               Name|
+-------------------+
|    Priyanshu Joshi|
|        Ayush Nagar|
| Harsh Kumar Shakya|
|    Krishna Agrawal|
|Asmit Singh Mandoor|
+-------------------+



In [67]:
# Project the two numeric columns and print them.
df.select("Age", "Experience").show()

+---+----------+
|Age|Experience|
+---+----------+
| 45|        15|
| 30|         5|
| 28|        11|
| 32|         2|
| 42|        16|
+---+----------+



In [68]:
# (column name, type name) pairs for every column.
df.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [69]:
# Summary statistics (count, mean, stddev, min, max) per column; mean and
# stddev come back as NULL for the string column.
df.describe().show()

+-------+-------------------+-----------------+---------------+
|summary|               Name|              Age|     Experience|
+-------+-------------------+-----------------+---------------+
|  count|                  5|                5|              5|
|   mean|               NULL|             35.4|            9.8|
| stddev|               NULL|7.602631123499285|6.1400325732035|
|    min|Asmit Singh Mandoor|               28|              2|
|    max|    Priyanshu Joshi|               45|             16|
+-------+-------------------+-----------------+---------------+



Adding Column

In [72]:
# Derive a new column as Age minus Experience. withColumn returns a new
# DataFrame, so the result is bound back to `df`.
df = df.withColumn("Starting of Career", df.Age - df.Experience)
df.show()

+-------------------+---+----------+------------------+
|               Name|Age|Experience|Starting of Career|
+-------------------+---+----------+------------------+
|    Priyanshu Joshi| 45|        15|                30|
|        Ayush Nagar| 30|         5|                25|
| Harsh Kumar Shakya| 28|        11|                17|
|    Krishna Agrawal| 32|         2|                30|
|Asmit Singh Mandoor| 42|        16|                26|
+-------------------+---+----------+------------------+



In [73]:
from pyspark.sql.functions import lit

# lit(None) builds a constant column holding no value. Without a cast its
# type is reported as `void` by printSchema(), and every row shows as NULL.
df=df.withColumn("Null", lit(None))
df.show()

+-------------------+---+----------+------------------+----+
|               Name|Age|Experience|Starting of Career|Null|
+-------------------+---+----------+------------------+----+
|    Priyanshu Joshi| 45|        15|                30|NULL|
|        Ayush Nagar| 30|         5|                25|NULL|
| Harsh Kumar Shakya| 28|        11|                17|NULL|
|    Krishna Agrawal| 32|         2|                30|NULL|
|Asmit Singh Mandoor| 42|        16|                26|NULL|
+-------------------+---+----------+------------------+----+



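Note: unlike pandas DataFrames, PySpark DataFrames are immutable. Methods such as `withColumn`, `withColumnRenamed` and `drop` return a new DataFrame instead of modifying the existing one, which is why each result is assigned back to `df`.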

In [74]:
# Re-check the schema now that the two extra columns have been added.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Starting of Career: integer (nullable = true)
 |-- Null: void (nullable = true)



In [75]:
# Rename the placeholder column; the renamed DataFrame is bound back to `df`.
df=df.withColumnRenamed("Null","Literally Null")
df.show()

+-------------------+---+----------+------------------+--------------+
|               Name|Age|Experience|Starting of Career|Literally Null|
+-------------------+---+----------+------------------+--------------+
|    Priyanshu Joshi| 45|        15|                30|          NULL|
|        Ayush Nagar| 30|         5|                25|          NULL|
| Harsh Kumar Shakya| 28|        11|                17|          NULL|
|    Krishna Agrawal| 32|         2|                30|          NULL|
|Asmit Singh Mandoor| 42|        16|                26|          NULL|
+-------------------+---+----------+------------------+--------------+



In [82]:
# Remove the placeholder column again; the result is bound back to `df`.
df=df.drop("Literally Null")
df.show()

+-------------------+---+----------+------------------+
|               Name|Age|Experience|Starting of Career|
+-------------------+---+----------+------------------+
|    Priyanshu Joshi| 45|        15|                30|
|        Ayush Nagar| 30|         5|                25|
| Harsh Kumar Shakya| 28|        11|                17|
|    Krishna Agrawal| 32|         2|                30|
|Asmit Singh Mandoor| 42|        16|                26|
+-------------------+---+----------+------------------+



In [83]:
# Shut down the Spark session now that the walkthrough is finished.
spark.stop()