## This Notebook covers -
- PySpark Dataframes
- Reading the dataset
- Checking the datatypes
- Selecting Columns and Indexing
- Summary statistics with `describe()` (similar to Pandas)
- Adding Columns
- Dropping Columns
- Renaming Columns

In [20]:
from pyspark.sql import SparkSession

In [21]:
# Configure a builder with the application name 'DataFrame', then obtain the session from it
session_builder = SparkSession.builder.appName('DataFrame')
spark = session_builder.getOrCreate()

In [22]:
# Display the session summary.
# When run locally there is only a single master node.
spark

In [23]:
## Read the dataset - Method 1: set the 'header' option, then call csv()
# The Windows path is written as a raw string (r'...') so the backslashes stay
# literal. In a normal string, '\P', '\D' and '\S' are invalid escape sequences
# that newer Python versions warn about.
# No schema inference is requested here, so every column is loaded as a string.
# NOTE(review): this is an absolute local path - adjust it for your machine.
df_pyspark = spark.read.option('header','true').csv(r'E:\Programming Career\Pyspark\Pyspark-Introduction\Dataset\Sample_data.csv')

In [24]:
# Print the rows of the DataFrame as a text table
df_pyspark.show()

+----------+---+------+-----------+
|      Name|Age|Gender|      State|
+----------+---+------+-----------+
|   Ramudas| 27|     M|  Karnataka|
|   Saturup| 27|     M|  Karnataka|
|     Amita| 31|     F|West Bengal|
|     Deena| 27|     F|  Karnataka|
|     Sanui| 28|     M|Maharashtra|
|    Tamaka| 26|     F|Maharashtra|
|   Samurai| 25|     F|Maharashtra|
|Shishimanu| 28|     M|West Bengal|
+----------+---+------+-----------+



In [25]:
## Check the schema
# The schema was not inferred on this read: every column, including Age, is reported as string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- State: string (nullable = true)



In [26]:
## Read the dataset again, this time asking Spark to infer column types
# The Windows path is written as a raw string (r'...') so the backslashes stay
# literal. In a normal string, '\P', '\D' and '\S' are invalid escape sequences
# that newer Python versions warn about.
# NOTE(review): this is an absolute local path - adjust it for your machine.
df_pyspark = spark.read.option('header','true').csv(r'E:\Programming Career\Pyspark\Pyspark-Introduction\Dataset\Sample_data.csv', inferSchema = True)

In [27]:
## Check the schema
# With inferSchema = True the schema is inferred: Age is now reported as integer.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- State: string (nullable = true)



In [28]:
## Read the dataset - Method 2: pass header and inferSchema directly to csv()
# The Windows path is written as a raw string (r'...') so the backslashes stay
# literal. In a normal string, '\P', '\D' and '\S' are invalid escape sequences
# that newer Python versions warn about.
# NOTE(review): this is an absolute local path - adjust it for your machine.
df_pyspark = spark.read.csv(r'E:\Programming Career\Pyspark\Pyspark-Introduction\Dataset\Sample_data.csv', header = True, inferSchema = True)
df_pyspark.show()

+----------+---+------+-----------+
|      Name|Age|Gender|      State|
+----------+---+------+-----------+
|   Ramudas| 27|     M|  Karnataka|
|   Saturup| 27|     M|  Karnataka|
|     Amita| 31|     F|West Bengal|
|     Deena| 27|     F|  Karnataka|
|     Sanui| 28|     M|Maharashtra|
|    Tamaka| 26|     F|Maharashtra|
|   Samurai| 25|     F|Maharashtra|
|Shishimanu| 28|     M|West Bengal|
+----------+---+------+-----------+



In [29]:
## Check the schema
# Schema inferred again via the inferSchema keyword: Age is reported as integer.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- State: string (nullable = true)



In [30]:
# Inspect the object's type: it is a PySpark DataFrame (pyspark.sql.dataframe.DataFrame)
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [31]:
# Get the column names as a list of strings
df_pyspark.columns

['Name', 'Age', 'Gender', 'State']

In [32]:
# Get the first row; with no argument, head() returns a single Row object
df_pyspark.head()

Row(Name='Ramudas', Age=27, Gender='M', State='Karnataka')

In [33]:
## Selecting a single column
# Keep the projection in a variable so it can be both shown and inspected
name_only = df_pyspark.select('Name')
name_only.show()

# select() gives back a DataFrame, even for one column
type(name_only)

+----------+
|      Name|
+----------+
|   Ramudas|
|   Saturup|
|     Amita|
|     Deena|
|     Sanui|
|    Tamaka|
|   Samurai|
|Shishimanu|
+----------+



pyspark.sql.dataframe.DataFrame

In [34]:
## Selecting multiple columns
# Name the columns of interest first, then project and display them
wanted_columns = ['Name', 'Age']
df_pyspark.select(wanted_columns).show()

+----------+---+
|      Name|Age|
+----------+---+
|   Ramudas| 27|
|   Saturup| 27|
|     Amita| 31|
|     Deena| 27|
|     Sanui| 28|
|    Tamaka| 26|
|   Samurai| 25|
|Shishimanu| 28|
+----------+---+



In [35]:
## Indexing by name only returns a Column object (no data is displayed);
## use select(...).show() to see the values
df_pyspark['Name']

Column<'Name'>

In [36]:
# List of (column name, data type) pairs
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Gender', 'string'), ('State', 'string')]

In [37]:
# describe() returns a new DataFrame of summary statistics; call .show() on it to print the table
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Gender: string, State: string]

In [38]:
## Print the summary statistics.
## For string columns, min and max follow alphabetical order, while mean and stddev are null.
df_pyspark.describe().show()

+-------+------+------------------+------+-----------+
|summary|  Name|               Age|Gender|      State|
+-------+------+------------------+------+-----------+
|  count|     8|                 8|     8|          8|
|   mean|  null|            27.375|  null|       null|
| stddev|  null|1.7677669529663689|  null|       null|
|    min| Amita|                25|     F|  Karnataka|
|    max|Tamaka|                31|     M|West Bengal|
+-------+------+------------------+------+-----------+



In [39]:
## Adding a column
# withColumn returns a new DataFrame containing the extra column; the result is not assigned here
df_pyspark.withColumn('Age after 5 years',df_pyspark['Age'] + 5)

DataFrame[Name: string, Age: int, Gender: string, State: string, Age after 5 years: int]

In [40]:
## Adding a column and showing the result
df_pyspark.withColumn('Age after 5 years',df_pyspark['Age'] + 5).show()

+----------+---+------+-----------+-----------------+
|      Name|Age|Gender|      State|Age after 5 years|
+----------+---+------+-----------+-----------------+
|   Ramudas| 27|     M|  Karnataka|               32|
|   Saturup| 27|     M|  Karnataka|               32|
|     Amita| 31|     F|West Bengal|               36|
|     Deena| 27|     F|  Karnataka|               32|
|     Sanui| 28|     M|Maharashtra|               33|
|    Tamaka| 26|     F|Maharashtra|               31|
|   Samurai| 25|     F|Maharashtra|               30|
|Shishimanu| 28|     M|West Bengal|               33|
+----------+---+------+-----------+-----------------+



In [41]:
# withColumn is not an in-place operation: df_pyspark is unchanged here.
# The result needs to be assigned to a DataFrame variable to keep the new column.
df_pyspark.show()

+----------+---+------+-----------+
|      Name|Age|Gender|      State|
+----------+---+------+-----------+
|   Ramudas| 27|     M|  Karnataka|
|   Saturup| 27|     M|  Karnataka|
|     Amita| 31|     F|West Bengal|
|     Deena| 27|     F|  Karnataka|
|     Sanui| 28|     M|Maharashtra|
|    Tamaka| 26|     F|Maharashtra|
|   Samurai| 25|     F|Maharashtra|
|Shishimanu| 28|     M|West Bengal|
+----------+---+------+-----------+



In [42]:
# Build the derived column expression first, then attach it and keep the result
age_in_five_years = df_pyspark['Age'] + 5
df_pyspark = df_pyspark.withColumn('Age after 5 years', age_in_five_years)

In [43]:
# The reassigned DataFrame now includes the 'Age after 5 years' column
df_pyspark.show()

+----------+---+------+-----------+-----------------+
|      Name|Age|Gender|      State|Age after 5 years|
+----------+---+------+-----------+-----------------+
|   Ramudas| 27|     M|  Karnataka|               32|
|   Saturup| 27|     M|  Karnataka|               32|
|     Amita| 31|     F|West Bengal|               36|
|     Deena| 27|     F|  Karnataka|               32|
|     Sanui| 28|     M|Maharashtra|               33|
|    Tamaka| 26|     F|Maharashtra|               31|
|   Samurai| 25|     F|Maharashtra|               30|
|Shishimanu| 28|     M|West Bengal|               33|
+----------+---+------+-----------+-----------------+



In [44]:
## Dropping a column
## Again not an in-place operation: the result must be assigned to a DataFrame variable to persist
df_pyspark.drop('Age after 5 years').show()

+----------+---+------+-----------+
|      Name|Age|Gender|      State|
+----------+---+------+-----------+
|   Ramudas| 27|     M|  Karnataka|
|   Saturup| 27|     M|  Karnataka|
|     Amita| 31|     F|West Bengal|
|     Deena| 27|     F|  Karnataka|
|     Sanui| 28|     M|Maharashtra|
|    Tamaka| 26|     F|Maharashtra|
|   Samurai| 25|     F|Maharashtra|
|Shishimanu| 28|     M|West Bengal|
+----------+---+------+-----------+



In [45]:
## Drop the column and assign the result back to the DataFrame variable so the change is kept
df_pyspark = df_pyspark.drop('Age after 5 years')
# Display the DataFrame without the dropped column
df_pyspark.show()

+----------+---+------+-----------+
|      Name|Age|Gender|      State|
+----------+---+------+-----------+
|   Ramudas| 27|     M|  Karnataka|
|   Saturup| 27|     M|  Karnataka|
|     Amita| 31|     F|West Bengal|
|     Deena| 27|     F|  Karnataka|
|     Sanui| 28|     M|Maharashtra|
|    Tamaka| 26|     F|Maharashtra|
|   Samurai| 25|     F|Maharashtra|
|Shishimanu| 28|     M|West Bengal|
+----------+---+------+-----------+



In [46]:
## Renaming a column ('Gender' -> 'Sex')
## Not an in-place operation: only the displayed result carries the new name
df_pyspark.withColumnRenamed('Gender','Sex').show()

+----------+---+---+-----------+
|      Name|Age|Sex|      State|
+----------+---+---+-----------+
|   Ramudas| 27|  M|  Karnataka|
|   Saturup| 27|  M|  Karnataka|
|     Amita| 31|  F|West Bengal|
|     Deena| 27|  F|  Karnataka|
|     Sanui| 28|  M|Maharashtra|
|    Tamaka| 26|  F|Maharashtra|
|   Samurai| 25|  F|Maharashtra|
|Shishimanu| 28|  M|West Bengal|
+----------+---+---+-----------+



In [47]:
# Assign the renamed result back to the DataFrame variable so the new column name is kept
df_pyspark = df_pyspark.withColumnRenamed('Gender','Sex')

In [48]:
# The 'Gender' column now appears as 'Sex'
df_pyspark.show()

+----------+---+---+-----------+
|      Name|Age|Sex|      State|
+----------+---+---+-----------+
|   Ramudas| 27|  M|  Karnataka|
|   Saturup| 27|  M|  Karnataka|
|     Amita| 31|  F|West Bengal|
|     Deena| 27|  F|  Karnataka|
|     Sanui| 28|  M|Maharashtra|
|    Tamaka| 26|  F|Maharashtra|
|   Samurai| 25|  F|Maharashtra|
|Shishimanu| 28|  M|West Bengal|
+----------+---+---+-----------+

