# PySpark fundamentals

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get the active SparkSession, or create one, under the application name "Basics".
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [3]:
# Read people.json into a DataFrame. The file is expected in the same directory as this
# notebook; otherwise specify its full path.
df = spark.read.json('people.json')
df.show()

+---+---------+------+--------+----------+
|age|firstName|gender|lastName|    number|
+---+---------+------+--------+----------+
| 28|      Joe|  male| Jackson|7349282382|
| 32|    James|  male|   Smith|5678568567|
| 24|    Emily|female|   Jones| 456754675|
+---+---------+------+--------+----------+



In [4]:
# printSchema() prints the DataFrame's schema: each field's name, its data type and whether it is nullable.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- number: string (nullable = true)



In [5]:
# List the column names of the DataFrame.
df.columns

['age', 'firstName', 'gender', 'lastName', 'number']

In [6]:
# describe() returns a new DataFrame holding summary statistics; it does not describe the datatypes of df.
# The output below is only the repr of that summary DataFrame, in which every column
# (including age) is typed string.
df.describe()

DataFrame[summary: string, age: string, firstName: string, gender: string, lastName: string, number: string]

In [7]:
# Calling show() on the result of describe() prints the count, mean, stddev, min and max of each column.
df.describe().show()

+-------+----+---------+------+--------+--------------------+
|summary| age|firstName|gender|lastName|              number|
+-------+----+---------+------+--------+--------------------+
|  count|   3|        3|     3|       3|                   3|
|   mean|28.0|     null|  null|    null| 4.494868541333333E9|
| stddev| 4.0|     null|  null|    null|3.5954963302739053E9|
|    min|  24|    Emily|female| Jackson|           456754675|
|    max|  32|      Joe|  male|   Smith|          7349282382|
+-------+----+---------+------+--------+--------------------+



### Creating a user-defined schema with data types

In [8]:
# Import the required methods from the package pyspark.sql.types
from pyspark.sql.types import (StructType, StructField, 
                               IntegerType, StringType)

In [9]:
# Define the schema as a list of StructField(name, data type, nullable) entries for the
# columns age, firstName, gender, lastName and number.
# age is declared as an integer here.
data_schema = [
    StructField('age', IntegerType(), True),
    StructField('firstName', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('lastName', StringType(), True),
    StructField('number', StringType(), True),
]

In [10]:
# Once the field list is defined, wrap it in a StructType to obtain the schema object.
final_struc = StructType(data_schema)

In [11]:
# Read the JSON file again, this time applying the user-defined schema.
# Note: this rebinds df, replacing the DataFrame loaded earlier in the notebook.
df = spark.read.json('people.json', schema=final_struc)

In [12]:
# Print the schema resulting from the user-defined structure; age is now an integer.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- number: string (nullable = true)



In [13]:
# Display the data.
df.show()

+---+---------+------+--------+----------+
|age|firstName|gender|lastName|    number|
+---+---------+------+--------+----------+
| 28|      Joe|  male| Jackson|7349282382|
| 32|    James|  male|   Smith|5678568567|
| 24|    Emily|female|   Jones| 456754675|
+---+---------+------+--------+----------+



In [14]:
# df['age'] returns a Column object, not the column's data.
print(df['age'])

# type() confirms that it is a pyspark.sql.column.Column.
print(type(df['age']))

Column<'age'>
<class 'pyspark.sql.column.Column'>


In [15]:
# To display the data of a column, select it with select(), much like a SQL SELECT.
df.select('age').show()

# type() confirms that select() returns a pyspark.sql.dataframe.DataFrame.
print(type(df.select('age')))

+---+
|age|
+---+
| 28|
| 32|
| 24|
+---+

<class 'pyspark.sql.dataframe.DataFrame'>


In [16]:
# To display multiple columns, pass all of their names to select().
df.select('age','gender').show()

+---+------+
|age|gender|
+---+------+
| 28|  male|
| 32|  male|
| 24|female|
+---+------+



In [17]:
# head(n) returns the first n rows as a list of Row objects.
# Index the result like any Python list: [0] is the first row, [1] is the second.
# The rows are fetched once and reused instead of calling head(2) three times.
first_two_rows = df.head(2)

print(first_two_rows)
print('------------------------------------')
print(first_two_rows[0])
print('------------------------------------')
print(type(first_two_rows[0]))

[Row(age=28, firstName='Joe', gender='male', lastName='Jackson', number='7349282382'), Row(age=32, firstName='James', gender='male', lastName='Smith', number='5678568567')]
------------------------------------
Row(age=28, firstName='Joe', gender='male', lastName='Jackson', number='7349282382')
------------------------------------
<class 'pyspark.sql.types.Row'>


In [18]:
# To display only the first two rows of the DataFrame, use show(2).
df.show(2)

+---+---------+------+--------+----------+
|age|firstName|gender|lastName|    number|
+---+---------+------+--------+----------+
| 28|      Joe|  male| Jackson|7349282382|
| 32|    James|  male|   Smith|5678568567|
+---+---------+------+--------+----------+
only showing top 2 rows



In [19]:
# withColumn() returns a DataFrame with an additional column; here 'newAge' is twice 'age'.
# Remember: df itself is not changed. The result is lost unless it is assigned to a variable.
df.withColumn('newAge',df['age']*2).show()

+---+---------+------+--------+----------+------+
|age|firstName|gender|lastName|    number|newAge|
+---+---------+------+--------+----------+------+
| 28|      Joe|  male| Jackson|7349282382|    56|
| 32|    James|  male|   Smith|5678568567|    64|
| 24|    Emily|female|   Jones| 456754675|    48|
+---+---------+------+--------+----------+------+



In [20]:
# As seen below, the new column newAge was not added to the DataFrame df itself.
df.show()

+---+---------+------+--------+----------+
|age|firstName|gender|lastName|    number|
+---+---------+------+--------+----------+
| 28|      Joe|  male| Jackson|7349282382|
| 32|    James|  male|   Smith|5678568567|
| 24|    Emily|female|   Jones| 456754675|
+---+---------+------+--------+----------+



In [21]:
# Assign the DataFrame with the new column to df_new.
# show() only prints the table and returns None, so it must not be part of the assignment:
# keep the DataFrame returned by withColumn() and display it in a separate call.
df_new = df.withColumn('newAge', df['age'] * 2)
df_new.show()

+---+---------+------+--------+----------+------+
|age|firstName|gender|lastName|    number|newAge|
+---+---------+------+--------+----------+------+
| 28|      Joe|  male| Jackson|7349282382|    56|
| 32|    James|  male|   Smith|5678568567|    64|
| 24|    Emily|female|   Jones| 456754675|    48|
+---+---------+------+--------+----------+------+



In [22]:
# withColumnRenamed() returns a DataFrame in which an existing column is renamed ('age' -> 'new_age').
# Remember: df itself is not changed. The result is lost unless it is assigned to a variable.
df.withColumnRenamed('age','new_age').show()

+-------+---------+------+--------+----------+
|new_age|firstName|gender|lastName|    number|
+-------+---------+------+--------+----------+
|     28|      Joe|  male| Jackson|7349282382|
|     32|    James|  male|   Smith|5678568567|
|     24|    Emily|female|   Jones| 456754675|
+-------+---------+------+--------+----------+



In [23]:
# As seen below, the rename to new_age was not applied to df itself; the column is still called age.
df.show()

+---+---------+------+--------+----------+
|age|firstName|gender|lastName|    number|
+---+---------+------+--------+----------+
| 28|      Joe|  male| Jackson|7349282382|
| 32|    James|  male|   Smith|5678568567|
| 24|    Emily|female|   Jones| 456754675|
+---+---------+------+--------+----------+



In [24]:
# Assign the DataFrame with the renamed column to df_new1.
# show() only prints the table and returns None, so it must not be part of the assignment:
# keep the DataFrame returned by withColumnRenamed() and display it in a separate call.
df_new1 = df.withColumnRenamed('age', 'new_age')
df_new1.show()

+-------+---------+------+--------+----------+
|new_age|firstName|gender|lastName|    number|
+-------+---------+------+--------+----------+
|     28|      Joe|  male| Jackson|7349282382|
|     32|    James|  male|   Smith|5678568567|
|     24|    Emily|female|   Jones| 456754675|
+-------+---------+------+--------+----------+



### Creating a temporary view from the DataFrame to query it like a DB2/SQL table

In [25]:
# Register the DataFrame as a temporary view named 'people' (replacing any existing view of that name).
df.createOrReplaceTempView('people')

In [26]:
# The 'people' view can now be queried with SQL through spark.sql(), which returns a DataFrame.
results = spark.sql("select * from people")
results.show()

+---+---------+------+--------+----------+
|age|firstName|gender|lastName|    number|
+---+---------+------+--------+----------+
| 28|      Joe|  male| Jackson|7349282382|
| 32|    James|  male|   Smith|5678568567|
| 24|    Emily|female|   Jones| 456754675|
+---+---------+------+--------+----------+



In [27]:
# The query can also filter rows with a WHERE clause, just like a regular SQL query.
results = spark.sql("select * from people where age > 25")
results.show()

+---+---------+------+--------+----------+
|age|firstName|gender|lastName|    number|
+---+---------+------+--------+----------+
| 28|      Joe|  male| Jackson|7349282382|
| 32|    James|  male|   Smith|5678568567|
+---+---------+------+--------+----------+

