# This Tutorial will cover:
- PySpark Dataframes
- Reading Dataset
- Checking Datatypes of column (Schema)
- Selecting columns and indexing
- Using the describe() option, similar to pandas
- Add columns
- Dropping columns
- Renaming columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build a SparkSession for this notebook with the application name
# 'Dataframe'; getOrCreate() hands back an existing session if there is one.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Evaluate the session object as the cell's last expression so the notebook
# displays it.
spark

In [7]:
# Read the semicolon-separated CSV file, using its first line as the header.
# No schema inference is requested here, so every column is typed as string.
(
    spark.read
    .option('header', 'true')
    .csv('test2.csv', sep=';')
)

DataFrame[Name: string, Age: string, Experience: string]

In [8]:
# Same read as a one-off expression, but print the rows as a table
# instead of showing only the DataFrame's repr.
(
    spark.read
    .option('header', 'true')
    .csv('test2.csv', sep=';')
    .show()
)

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|  Max| 30|        10|
|Abdel| 32|         8|
| Manu| 31|         4|
+-----+---+----------+



In [10]:
# Keep the loaded DataFrame around for the following cells.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test2.csv', sep=';')
)

In [20]:
# Print each column's name, data type and nullability.
# NOTE(review): the output recorded below lists Age and Experience as
# integer, yet the preceding read does not pass inferSchema and its repr
# shows every column as string. The execution counter (In [20]) suggests this
# cell was re-run after the inferSchema cells further down -- confirm and
# re-run the notebook top to bottom.
df_pyspark.printSchema()  # shows types of columns

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [13]:
# Without inferSchema=True every value is read as a string; with it, Spark
# infers each column's type from the data (Age and Experience become integer).
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test2.csv', sep=';', inferSchema=True)
)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [17]:
# Shorter form: pass the header and inferSchema settings straight to csv()
# as keyword arguments instead of chaining option().
df_pyspark = spark.read.csv(
    'test2.csv',
    sep=';',
    header=True,
    inferSchema=True,
)
df_pyspark.show()
df_pyspark.printSchema()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|  Max| 30|        10|
|Abdel| 32|         8|
| Manu| 31|         4|
+-----+---+----------+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [19]:
# Inspect the object's class: the reader returns a
# pyspark.sql.dataframe.DataFrame (not a pandas DataFrame).
type(df_pyspark) # dataframes are data structures

pyspark.sql.dataframe.DataFrame

In [24]:
# Fetch the first three rows; unlike pandas, the result is a plain Python
# list of Row objects rather than a DataFrame.
df_pyspark.head(3) # returns a list of Row objects

[Row(Name='Max', Age=30, Experience=10),
 Row(Name='Abdel', Age=32, Experience=8),
 Row(Name='Manu', Age=31, Experience=4)]

In [28]:
# Select a single column by name and print it as a table.
df_pyspark.select('Name').show() # select column

+-----+
| Name|
+-----+
|  Max|
|Abdel|
| Manu|
+-----+



In [30]:
# select() returns a DataFrame, even when only one column is requested.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [32]:
# Select several columns at once by passing a list of column names.
df_pyspark.select(['Name', 'Experience']).show()

+-----+----------+
| Name|Experience|
+-----+----------+
|  Max|        10|
|Abdel|         8|
| Manu|         4|
+-----+----------+



In [34]:
# Unlike pandas, indexing with a column name yields a Column object, not
# the column's data; use select() to get a DataFrame that can be shown.
df_pyspark['Name'] # returns a Column; use select() to view the data

Column<'Name'>

In [36]:
# List the columns as (column name, type name) tuples.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [37]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# describe() itself returns a DataFrame, hence the show(). For the string
# column Name, mean and stddev are null.
df_pyspark.describe().show()

+-------+-----+----+-----------------+
|summary| Name| Age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| null|31.0|7.333333333333333|
| stddev| null| 1.0|3.055050463303893|
|    min|Abdel|  30|                4|
|    max|  Max|  32|               10|
+-------+-----+----+-----------------+



In [40]:
# Add a derived column: the existing Experience value plus 2.
# withColumn() returns a new DataFrame, so the name is rebound to it.
df_pyspark = df_pyspark.withColumn(
    'Experience after 2 years',
    df_pyspark['Experience'] + 2,
)

In [41]:
# Print the DataFrame to check the newly added column.
df_pyspark.show()

+-----+---+----------+------------------------+
| Name|Age|Experience|Experience after 2 years|
+-----+---+----------+------------------------+
|  Max| 30|        10|                      12|
|Abdel| 32|         8|                      10|
| Manu| 31|         4|                       6|
+-----+---+----------+------------------------+



In [42]:
# Remove the derived column again. drop() returns a new DataFrame,
# so the result is assigned back to the same name.
df_pyspark = df_pyspark.drop(
    'Experience after 2 years',
)
df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|  Max| 30|        10|
|Abdel| 32|         8|
| Manu| 31|         4|
+-----+---+----------+



In [44]:
# Rename the 'Name' column to 'New Name'; the other columns are unchanged.
df_pyspark = df_pyspark.withColumnRenamed(
    'Name',
    'New Name',
)
df_pyspark.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|     Max| 30|        10|
|   Abdel| 32|         8|
|    Manu| 31|         4|
+--------+---+----------+

