# PySpark Dataframes. 

In [1]:
# Importing the Spark Session. 
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session for this notebook (getOrCreate reuses a session if one is already active).
spark = (
    SparkSession.builder
    .appName('Dataframes')
    .getOrCreate()
)

In [3]:
# General info about the session: evaluating the object as the cell's last
# expression makes the notebook display its summary.
spark

## Reading a Spark Dataframe

In [4]:
# Load the CSV: treat the first row as column names and let Spark infer
# each column's type instead of reading everything as strings.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv', inferSchema=True)
)
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [5]:
# Check the schema: prints a tree with each column's name, type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# Alternative way to read the dataset: pass the reader options directly
# to csv() as keyword arguments instead of chaining .option().
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



## Indexing a Spark Dataframe

In [7]:
# Show the columns: returns the column names as a plain Python list.
df_pyspark.columns

['Name', 'age', 'Experience', 'Salary']

In [8]:
# Select the head rows: returns the first 3 rows as a list of Row objects.
df_pyspark.head(3)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000)]

In [9]:
# Display the DataFrame as a formatted text table.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [10]:
# Select a specific column and display the resulting single-column DataFrame.
df_pyspark.select('Name').show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [11]:
# Select several columns at once by passing each column name as its own argument.
df_pyspark.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



In [12]:
# Check the data types: returns a list of (column name, type name) tuples.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

## Basic Data Exploration. 

In [13]:
# Describe the data: count, mean, stddev, min and max for every column.
# For the string column `Name`, mean and stddev come back as null.
df_pyspark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  null|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  null| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [14]:
# Add a derived column holding each person's experience plus two years.
# withColumn returns a new DataFrame, so the result is assigned back.
df_pyspark = df_pyspark.withColumn(
    'Experience A2Y',
    df_pyspark['Experience'] + 2,
)

In [15]:
# Display the DataFrame to confirm the 'Experience A2Y' column was added.
df_pyspark.show()

+---------+---+----------+------+--------------+
|     Name|age|Experience|Salary|Experience A2Y|
+---------+---+----------+------+--------------+
|    Krish| 31|        10| 30000|            12|
|Sudhanshu| 30|         8| 25000|            10|
|    Sunny| 29|         4| 20000|             6|
|     Paul| 24|         3| 20000|             5|
|   Harsha| 21|         1| 15000|             3|
|  Shubham| 23|         2| 18000|             4|
+---------+---+----------+------+--------------+



In [16]:
# Drop a column. The result is assigned back so later cells see the
# DataFrame without 'Experience A2Y'.
df_pyspark = df_pyspark.drop('Experience A2Y')

In [17]:
# Display the DataFrame to confirm the 'Experience A2Y' column is gone.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [18]:
# Rename the 'Name' column to 'First Name'; the renamed DataFrame is assigned back.
df_pyspark = df_pyspark.withColumnRenamed(
    'Name',
    'First Name',
)

In [19]:
# Display the DataFrame to confirm 'Name' now appears as 'First Name'.
df_pyspark.show()

+----------+---+----------+------+
|First Name|age|Experience|Salary|
+----------+---+----------+------+
|     Krish| 31|        10| 30000|
| Sudhanshu| 30|         8| 25000|
|     Sunny| 29|         4| 20000|
|      Paul| 24|         3| 20000|
|    Harsha| 21|         1| 15000|
|   Shubham| 23|         2| 18000|
+----------+---+----------+------+

