In [1]:
# Display the SparkContext currently bound to `sc`
# (presumably pre-created by the PySpark-enabled kernel — confirm for your setup).
sc

In [2]:
# Display the SparkSession currently bound to `spark`
# (presumably pre-created by the PySpark-enabled kernel — confirm for your setup).
spark

In [7]:
# Stop the running SparkContext: the next cell builds a new one with a custom
# configuration, and PySpark refuses to run two SparkContexts at once.
sc.stop()

In [8]:
from pyspark import SparkConf, SparkContext

# Build the configuration one setting at a time:
#   setAppName() sets the application name.
#   setMaster()  sets the master URL; 'local[2]' runs Spark locally with 2 worker threads.
config = SparkConf()
config.setAppName("AdvancedSession")
config.setMaster('local[2]')

# Start a fresh SparkContext from that configuration.
sc = SparkContext(conf=config)

In [9]:
# Display the newly created SparkContext to confirm its master and app name.
sc

In [10]:
from pyspark.sql import SparkSession
# getOrCreate() returns an existing SparkSession when one is available, otherwise it builds one.
# NOTE(review): a SparkContext named "AdvancedSession" already exists at this point, so the
# 'SQLSession' app name probably does not replace it — confirm in the Spark UI.
spark = SparkSession.builder.appName('SQLSession').getOrCreate()

In [11]:
# Load People.json from the local filesystem and let Spark infer the schema.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
people_df = spark.read.format("json").load("file:///home/hadoop/Downloads/People.json")

In [14]:
# Preview the first 3 rows of the DataFrame.
people_df.show(3)

+---------+-----------+----------+------+---+---------+------+
|     city|    country|first_name|gender| id|last_name|salary|
+---------+-----------+----------+------+---+---------+------+
|Mulyosari|  Indonesia|     Valma|Female|  1|     Sans|983107|
|  Niihama|      Japan|     Paolo|  Male|  2|   Kiddie|649173|
|Dū Qal‘ah|Afghanistan|    Miltie|  Male|  3| De Zuani|352898|
+---------+-----------+----------+------+---+---------+------+
only showing top 3 rows



In [13]:
# Print the column names, types and nullability of the DataFrame.
people_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: long (nullable = true)



#### 1. Create a user-defined schema for the DataFrame fields

In [21]:
# Import only the Spark SQL functions this notebook uses. A wildcard import of
# pyspark.sql.functions would rebind Python built-ins such as max, min, sum and round
# to Spark column functions for the rest of the session.
# Add further names here if later cells need them.
from pyspark.sql.functions import col, concat, lit
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [47]:
# Explicit schema for People.json: it fixes the column order and declares `id` and
# `salary` as integers. Every field is nullable (third argument = True).
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("city", StringType(), True)
    .add("country", StringType(), True)
)

In [48]:
# Re-read People.json, this time applying the user-defined schema instead of inferring one.
people_df = spark.read.json("file:///home/hadoop/Downloads/People.json", schema=schema)

In [49]:
# Print the schema again to check that the user-defined column order and types were applied.
people_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)



In [50]:
# Preview the first 10 rows read with the user-defined schema.
people_df.show(10)

+---+----------+---------+------+------+------------------+------------+
| id|first_name|last_name|gender|salary|              city|     country|
+---+----------+---------+------+------+------------------+------------+
|  1|     Valma|     Sans|Female|983107|         Mulyosari|   Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173|           Niihama|       Japan|
|  3|    Miltie| De Zuani|  Male|352898|         Dū Qal‘ah| Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398|            Iberia|        Peru|
|  5| Reinaldos|   Keeffe|  Male|440989|          La Ronge|      Canada|
|  6|        Eb|Schwanden|  Male|274126|      Kuala Lumpur|    Malaysia|
|  7|    Alleyn|   Paddon|  Male|681914|         Al Qurayn|Saudi Arabia|
|  8|   Baryram|     Yell|  Male|250748|           Jixiang|       China|
|  9|     Cammy|     Axel|Female|221750|Thị Trấn Phong Thổ|     Vietnam|
| 10|       Erl|  Caldera|  Male|680801|        Kotatengah|   Indonesia|
+---+----------+---------+------+------+-----------

In [31]:
# bank_edited.json is arranged as a list of dictionaries in which each key-value pair
# sits on its own line. Spark's JSON reader expects one complete record per line by
# default, so reading this file that way fails; the "multiLine" option lets a record
# span several lines.
bank_data = (
    spark.read
    .option("multiLine", True)
    .json("file:///home/hadoop/Downloads/bank_edited.json")
)
bank_data.show(10)

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [32]:
# Print the inferred schema of the bank data.
bank_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



#### 2. Typecasting a single column from long to int

In [66]:
# withColumn(name, column) adds a column, or replaces it when `name` already exists.
# DataFrames are immutable: withColumn() returns a NEW DataFrame and leaves bank_data
# untouched, so the result is kept in its own variable instead of being discarded.
bank_data_age_int = bank_data.withColumn("age", col("age").cast(IntegerType()))  # cast long -> int

# Display the resulting DataFrame signature; `age` is now int.
bank_data_age_int

DataFrame[age: int, balance: bigint, campaign: bigint, contact: string, day: bigint, default: string, duration: bigint, education: string, housing: string, job: string, loan: string, marital: string, month: string, pdays: bigint, poutcome: string, previous: bigint, y: string]

#### 3. Creating a new column from two string columns

In [56]:
from pyspark.sql.functions import col, concat, lit

# Full_Name = first_name + " " + last_name.
# Note: concat() yields NULL for a row when any of its inputs is NULL.
people_df = people_df.withColumn(
    "Full_Name",
    concat(col("first_name"), lit(" "), col("last_name")),
)

In [41]:
# Preview the first 3 rows to check the new Full_Name column.
people_df.show(3)

+---+----------+---------+------+---------+-----------+---------------+
| id|first_name|last_name|gender|     city|    country|      Full_Name|
+---+----------+---------+------+---------+-----------+---------------+
|  1|     Valma|     Sans|Female|Mulyosari|  Indonesia|     Valma Sans|
|  2|     Paolo|   Kiddie|  Male|  Niihama|      Japan|   Paolo Kiddie|
|  3|    Miltie| De Zuani|  Male|Dū Qal‘ah|Afghanistan|Miltie De Zuani|
+---+----------+---------+------+---------+-----------+---------------+
only showing top 3 rows



#### 4.Renaming existing column

In [51]:
# Rename the `salary` column to `income`; the other columns are left as they are.
# withColumnRenamed() returns a new DataFrame, so the result is assigned back to people_df.
people_df = people_df.withColumnRenamed("salary","income")

In [52]:
# Preview the first 3 rows to check that `salary` now appears as `income`.
people_df.show(3)

+---+----------+---------+------+------+---------+-----------+
| id|first_name|last_name|gender|income|     city|    country|
+---+----------+---------+------+------+---------+-----------+
|  1|     Valma|     Sans|Female|983107|Mulyosari|  Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173|  Niihama|      Japan|
|  3|    Miltie| De Zuani|  Male|352898|Dū Qal‘ah|Afghanistan|
+---+----------+---------+------+------+---------+-----------+
only showing top 3 rows



#### 5.limit()

In [46]:
# limit(5) returns a new DataFrame holding at most 5 rows; show() then prints it.
people_df.limit(5).show()

+---+----------+---------+------+---------+-----------+----------------+
| id|first_name|last_name|gender|     city|    Country|       Full_Name|
+---+----------+---------+------+---------+-----------+----------------+
|  1|     Valma|     Sans|Female|Mulyosari|  Indonesia|      Valma Sans|
|  2|     Paolo|   Kiddie|  Male|  Niihama|      Japan|    Paolo Kiddie|
|  3|    Miltie| De Zuani|  Male|Dū Qal‘ah|Afghanistan| Miltie De Zuani|
|  4|    Jarrid| Dalziell|  Male|   Iberia|       Peru| Jarrid Dalziell|
|  5| Reinaldos|   Keeffe|  Male| La Ronge|     Canada|Reinaldos Keeffe|
+---+----------+---------+------+---------+-----------+----------------+



#### 6. orderBy()
  * Arrange rows in ascending or descending order

In [53]:
# Sort by one column (income, lowest first) and print the first 5 rows.
people_df.orderBy('income', ascending=True).show(5)

+---+----------+---------+------+------+------------+---------+
| id|first_name|last_name|gender|income|        city|  country|
+---+----------+---------+------+------+------------+---------+
| 93|      Cory|    Prigg|  Male| 12876|     Gondang|Indonesia|
|590|      Flem| Tumielli|  Male| 13347| Debre Zeyit| Ethiopia|
|192|       Odo|  Conyers|  Male| 15555|  Raffingora| Zimbabwe|
|407|  Barbabas|Ballingal|  Male| 18598|Beringinjaya|Indonesia|
|297|     Daron|   Melato|Female| 19881|      Phayao| Thailand|
+---+----------+---------+------+------+------------+---------+
only showing top 5 rows



In [57]:
# Sort by several columns: country A-Z, then income from highest to lowest within each
# country. The `ascending` list is matched to the columns by position.
people_df.orderBy('country', 'income', ascending=[True, False]).show()

+---+----------+------------+------+------+------------------+--------------+-------------------+
| id|first_name|   last_name|gender|income|              city|       country|          Full_Name|
+---+----------+------------+------+------+------------------+--------------+-------------------+
|490|  Cathlene|    Gatfield|Female|981605|           Mīrābād|   Afghanistan|  Cathlene Gatfield|
|448|      Yuri|     Duggary|  Male|414107|     Sang-e Māshah|   Afghanistan|       Yuri Duggary|
|  3|    Miltie|    De Zuani|  Male|352898|         Dū Qal‘ah|   Afghanistan|    Miltie De Zuani|
|155|    Guntar|    Langmuir|  Male|290613|             Khōst|   Afghanistan|    Guntar Langmuir|
|983|      Tiff|     Dreakin|Female|208548|             Āsmār|   Afghanistan|       Tiff Dreakin|
|290|     Myles|      Britch|  Male|191508|         Dū Laīnah|   Afghanistan|       Myles Britch|
|419|   Ezekiel|   Fleetwood|  Male|163113|      Barakī Barak|   Afghanistan|  Ezekiel Fleetwood|
|701|    Gerrie|    

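#### Temporary view
  * createOrReplaceTempView()
  * Registers the DataFrame under a name so it can be queried with spark.sql(). The view is not materialized: each query is still evaluated against the underlying DataFrame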

In [74]:
# Register bank_data as the temporary view 'bankdata' so it can be queried with spark.sql().
# An existing temporary view with the same name is replaced.
bank_data.createOrReplaceTempView('bankdata')

In [60]:
# Query the temporary view with SQL and print the first 5 rows.
spark.sql('select * from bankdata').show(5)

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan|marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no|married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no| single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes|married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no|married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown|  no| si

In [70]:
# Count the rows in the view; the result is a one-row DataFrame with a `count` column.
spark.sql('select count(*) as count from bankdata').show()

+-----+
|count|
+-----+
|45211|
+-----+



In [68]:
# Print the schema of bank_data. withColumn() returns a new DataFrame rather than
# modifying bank_data, so `age` is still reported as long here.
bank_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



  * Show the top 10 youngest employees with the maximum balance

In [78]:
# Sort by age (youngest first) and, within the same age, by balance (highest first);
# show(10) prints the first 10 rows of that ordering.
spark.sql('select age, balance from bankdata order by age asc ,balance desc').show(10)

+---+-------+
|age|balance|
+---+-------+
| 18|   1944|
| 18|    608|
| 18|    608|
| 18|    438|
| 18|    348|
| 18|    156|
| 18|    108|
| 18|    108|
| 18|    108|
| 18|     35|
+---+-------+
only showing top 10 rows



In [85]:
# Alternative reading of the task: one row per age holding that age's highest balance,
# ordered from the youngest age upwards; show(10) prints the 10 youngest ages.
spark.sql('select age, max(balance) as max_balance from bankdata group by age order by age asc ').show(10)

+---+-----------+
|age|max_balance|
+---+-----------+
| 18|       1944|
| 19|       5368|
| 20|       8860|
| 21|       8278|
| 22|      10971|
| 23|      19690|
| 24|      23878|
| 25|      16874|
| 26|      24299|
| 27|      24025|
+---+-----------+
only showing top 10 rows



 * Show the 5 jobs with the lowest minimum balance (the query aliases `min(balance)` as `salary`)

In [87]:
# For each job take the minimum balance, then list the 5 jobs whose minimum is lowest.
# NOTE(review): the `salary` alias actually holds min(balance); the view has no salary column.
spark.sql('select job, min(balance) as salary from bankdata group by job order by salary asc ').show(5)

+-------------+------+
|          job|salary|
+-------------+------+
|  blue-collar| -8019|
|   management| -6847|
|self-employed| -3313|
|   technician| -2827|
|     services| -2122|
+-------------+------+
only showing top 5 rows

