In [1]:
from pyspark.sql import SparkSession
import os

# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .appName("simpleapp")
    .master("local[*]")
    .getOrCreate()
)

# Three (name, age, country) records to experiment with.
data = [
    ("Metfula", 25, "Eswatini"),
    ("Elliot", 28, "Zimbabwe"),
    ("Aina", 30, "Namibia"),
]

# Only column names are supplied here; Spark infers each column's type.
df = spark.createDataFrame(data, schema=["Name", "Age", "Country"])

# Print the rows as a text table.
df.show()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/17 15:19:25 WARN Utils: Your hostname, brian-VirtualBox, resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/09/17 15:19:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/17 15:19:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+-------+---+--------+
|   Name|Age| Country|
+-------+---+--------+
|Metfula| 25|Eswatini|
| Elliot| 28|Zimbabwe|
|   Aina| 30| Namibia|
+-------+---+--------+



In [2]:
# Print the column names together with the types Spark inferred for them.
df.printSchema()


root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Country: string (nullable = true)



## Example Two
- Creating a DataFrame using a list of lists

In [3]:
# Column names, in the same order as the values inside each row below.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Each inner list is one row of the DataFrame.
data = [
    ["James", "", "Smith", "1991-04-01", "M", 2000],
    ["Micheal", "", "Rose", "2000-05-19", "M", 43000],
    ["Maria", "Happiness", "Williams", "2002-01-24", "F", 25000],
    ["Jen", "Promise", "Mary", "2012-02-06", "F", -30],
    ["Robert", "Ncedo", "Brown", "2007-12-13", "M", 0],
]

# Build the DataFrame and inspect the inferred schema.
dataFrame = spark.createDataFrame(data, columns)
dataFrame.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [4]:
# Display the rows of the DataFrame as a text table.
dataFrame.show()

                                                                                

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  2000|
|  Micheal|          |    Rose|2000-05-19|     M| 43000|
|    Maria| Happiness|Williams|2002-01-24|     F| 25000|
|      Jen|   Promise|    Mary|2012-02-06|     F|   -30|
|   Robert|     Ncedo|   Brown|2007-12-13|     M|     0|
+---------+----------+--------+----------+------+------+



In [5]:
# Preview the schema with dob renamed to DateofBirth.
# The renamed DataFrame is not assigned to anything, so dataFrame keeps its dob column.
dataFrame.withColumnRenamed("dob", "DateofBirth").printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- DateofBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [6]:
# dataFrame still has the dob column: the rename above returned a new DataFrame
# and was never assigned back.
dataFrame.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  2000|
|  Micheal|          |    Rose|2000-05-19|     M| 43000|
|    Maria| Happiness|Williams|2002-01-24|     F| 25000|
|      Jen|   Promise|    Mary|2012-02-06|     F|   -30|
|   Robert|     Ncedo|   Brown|2007-12-13|     M|     0|
+---------+----------+--------+----------+------+------+



                                                                                

In [7]:
# Show the data without the gender column.
# The result is only displayed, not assigned, so dataFrame itself is unchanged.
dataFrame.drop("gender").show()

                                                                                

+---------+----------+--------+----------+------+
|firstname|middlename|lastname|       dob|salary|
+---------+----------+--------+----------+------+
|    James|          |   Smith|1991-04-01|  2000|
|  Micheal|          |    Rose|2000-05-19| 43000|
|    Maria| Happiness|Williams|2002-01-24| 25000|
|      Jen|   Promise|    Mary|2012-02-06|   -30|
|   Robert|     Ncedo|   Brown|2007-12-13|     0|
+---------+----------+--------+----------+------+



In [8]:
# truncate=False prints every cell value in full (values are left-aligned in this mode).
dataFrame.show(truncate=False)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |2000  |
|Micheal  |          |Rose    |2000-05-19|M     |43000 |
|Maria    |Happiness |Williams|2002-01-24|F     |25000 |
|Jen      |Promise   |Mary    |2012-02-06|F     |-30   |
|Robert   |Ncedo     |Brown   |2007-12-13|M     |0     |
+---------+----------+--------+----------+------+------+



In [9]:
from pyspark.sql.functions import lit, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType



In [10]:
# Narrow salary from long to integer and multiply every value by 100.
# NOTE: dataFrame is reassigned from itself, so running this cell a second
# time multiplies salary by another 100.
dataFrame = dataFrame.withColumn(
    "salary",
    col("salary").cast("Integer") * 100,
)
dataFrame.show()

+---------+----------+--------+----------+------+-------+
|firstname|middlename|lastname|       dob|gender| salary|
+---------+----------+--------+----------+------+-------+
|    James|          |   Smith|1991-04-01|     M| 200000|
|  Micheal|          |    Rose|2000-05-19|     M|4300000|
|    Maria| Happiness|Williams|2002-01-24|     F|2500000|
|      Jen|   Promise|    Mary|2012-02-06|     F|  -3000|
|   Robert|     Ncedo|   Brown|2007-12-13|     M|      0|
+---------+----------+--------+----------+------+-------+



In [11]:
# Confirm that salary is now an integer column after the cast.
dataFrame.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [12]:

# Add two constant-valued columns; lit() wraps a Python value as a Column.
dataFrame = (
    dataFrame
    .withColumn("country", lit("Eswatini"))
    .withColumn("region", lit("Manzini"))
)
dataFrame.show(truncate=False)

+---------+----------+--------+----------+------+-------+--------+-------+
|firstname|middlename|lastname|dob       |gender|salary |country |region |
+---------+----------+--------+----------+------+-------+--------+-------+
|James    |          |Smith   |1991-04-01|M     |200000 |Eswatini|Manzini|
|Micheal  |          |Rose    |2000-05-19|M     |4300000|Eswatini|Manzini|
|Maria    |Happiness |Williams|2002-01-24|F     |2500000|Eswatini|Manzini|
|Jen      |Promise   |Mary    |2012-02-06|F     |-3000  |Eswatini|Manzini|
|Robert   |Ncedo     |Brown   |2007-12-13|M     |0      |Eswatini|Manzini|
+---------+----------+--------+----------+------+-------+--------+-------+



                                                                                

In [13]:
# Remove the region column; reassigning dataFrame makes the drop permanent.
dataFrame = dataFrame.drop("region")
dataFrame.show(truncate=False)

                                                                                

+---------+----------+--------+----------+------+-------+--------+
|firstname|middlename|lastname|dob       |gender|salary |country |
+---------+----------+--------+----------+------+-------+--------+
|James    |          |Smith   |1991-04-01|M     |200000 |Eswatini|
|Micheal  |          |Rose    |2000-05-19|M     |4300000|Eswatini|
|Maria    |Happiness |Williams|2002-01-24|F     |2500000|Eswatini|
|Jen      |Promise   |Mary    |2012-02-06|F     |-3000  |Eswatini|
|Robert   |Ncedo     |Brown   |2007-12-13|M     |0      |Eswatini|
+---------+----------+--------+----------+------+-------+--------+



In [14]:
# truncate=True is show()'s default behaviour: long cell values are shortened
# and the columns are right-aligned.
dataFrame.show(truncate=True)

+---------+----------+--------+----------+------+-------+--------+
|firstname|middlename|lastname|       dob|gender| salary| country|
+---------+----------+--------+----------+------+-------+--------+
|    James|          |   Smith|1991-04-01|     M| 200000|Eswatini|
|  Micheal|          |    Rose|2000-05-19|     M|4300000|Eswatini|
|    Maria| Happiness|Williams|2002-01-24|     F|2500000|Eswatini|
|      Jen|   Promise|    Mary|2012-02-06|     F|  -3000|Eswatini|
|   Robert|     Ncedo|   Brown|2007-12-13|     M|      0|Eswatini|
+---------+----------+--------+----------+------+-------+--------+



                                                                                

## Day Two Practical

In [15]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, lit, when, avg, count

# A session already exists at this point, so getOrCreate() returns that one
# (Spark logs a warning that the existing session is being reused).
spark = (
    SparkSession.builder
    .appName("Operations")
    .getOrCreate()
)

# Godwin's Age is None, which Spark stores as NULL.
df = spark.createDataFrame(
    [
        Row("Simphiwe", 25, "Siteki", 2000),
        Row("Bandile", 26, "Manzini", 3000),
        Row("Nombuso", 35, "Matsapha", 5000),
        Row("Faith", 17, "Kwaluseni", 1000),
        Row("Bunmi", 37, "Tubungu", 3500),
        Row("Godwin", None, "Mbabane", 5000),
    ],
    ['Firstname', 'Age', 'City', 'Salary'],
)

df.show(truncate=False)

25/09/17 15:20:12 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+---------+----+---------+------+
|Firstname|Age |City     |Salary|
+---------+----+---------+------+
|Simphiwe |25  |Siteki   |2000  |
|Bandile  |26  |Manzini  |3000  |
|Nombuso  |35  |Matsapha |5000  |
|Faith    |17  |Kwaluseni|1000  |
|Bunmi    |37  |Tubungu  |3500  |
|Godwin   |NULL|Mbabane  |5000  |
+---------+----+---------+------+



In [16]:
# Inspect the inferred types: Age and Salary come out as long, the rest as string.
df.printSchema()

root
 |-- Firstname: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- City: string (nullable = true)
 |-- Salary: long (nullable = true)



In [17]:
# Project only the listed columns; City is left out of the result.
df.select("Firstname", "Age", "Salary").show() # select only specific columns

+---------+----+------+
|Firstname| Age|Salary|
+---------+----+------+
| Simphiwe|  25|  2000|
|  Bandile|  26|  3000|
|  Nombuso|  35|  5000|
|    Faith|  17|  1000|
|    Bunmi|  37|  3500|
|   Godwin|NULL|  5000|
+---------+----+------+



                                                                                

In [18]:
# Keep people younger than 30; the row with a NULL Age does not match and is left out.
df.where(df["Age"] < 30).show()

                                                                                

+---------+---+---------+------+
|Firstname|Age|     City|Salary|
+---------+---+---------+------+
| Simphiwe| 25|   Siteki|  2000|
|  Bandile| 26|  Manzini|  3000|
|    Faith| 17|Kwaluseni|  1000|
+---------+---+---------+------+



In [19]:
# Keep rows whose Age is above 20. The column is referenced with the exact
# casing used in the schema ("Age") so the filter does not depend on Spark's
# default case-insensitive column resolution. Rows with a NULL Age are not returned.
df.filter(col("Age") > 20).show()

+---------+---+--------+------+
|Firstname|Age|    City|Salary|
+---------+---+--------+------+
| Simphiwe| 25|  Siteki|  2000|
|  Bandile| 26| Manzini|  3000|
|  Nombuso| 35|Matsapha|  5000|
|    Bunmi| 37| Tubungu|  3500|
+---------+---+--------+------+



In [20]:
# `&` binds tighter than `|`, so the condition is:
#   (Age > 20 AND City == "Tubungu") OR Salary < 1000
# The extra parentheses make that grouping explicit.
df.filter(
    ((df.Age > 20) & (df.City == "Tubungu")) | (df.Salary < 1000)
).show()

+---------+---+-------+------+
|Firstname|Age|   City|Salary|
+---------+---+-------+------+
|    Bunmi| 37|Tubungu|  3500|
+---------+---+-------+------+



Conditional Statements

- use the filter() method
- & for AND
- | for OR
- ~ for NOT (Python's `!` is not a valid operator on Column expressions)

In [21]:
# Bonus is 20% of Salary; multiplying by a float yields a double column.
df = df.withColumn("Bonus", df["Salary"] * 0.2)
df.show()

+---------+----+---------+------+------+
|Firstname| Age|     City|Salary| Bonus|
+---------+----+---------+------+------+
| Simphiwe|  25|   Siteki|  2000| 400.0|
|  Bandile|  26|  Manzini|  3000| 600.0|
|  Nombuso|  35| Matsapha|  5000|1000.0|
|    Faith|  17|Kwaluseni|  1000| 200.0|
|    Bunmi|  37|  Tubungu|  3500| 700.0|
|   Godwin|NULL|  Mbabane|  5000|1000.0|
+---------+----+---------+------+------+



                                                                                

In [22]:
# Preview Salary + Bonus as NewSalary; the result is displayed only, df is not reassigned.
df.withColumn(
    "NewSalary",
    col("Salary") + col("Bonus"),
).show()

+---------+----+---------+------+------+---------+
|Firstname| Age|     City|Salary| Bonus|NewSalary|
+---------+----+---------+------+------+---------+
| Simphiwe|  25|   Siteki|  2000| 400.0|   2400.0|
|  Bandile|  26|  Manzini|  3000| 600.0|   3600.0|
|  Nombuso|  35| Matsapha|  5000|1000.0|   6000.0|
|    Faith|  17|Kwaluseni|  1000| 200.0|   1200.0|
|    Bunmi|  37|  Tubungu|  3500| 700.0|   4200.0|
|   Godwin|NULL|  Mbabane|  5000|1000.0|   6000.0|
+---------+----+---------+------+------+---------+



In [23]:
# Bonus is part of df (type double); NewSalary is absent because that cell never reassigned df.
df.printSchema()

root
 |-- Firstname: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- City: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Bonus: double (nullable = true)



In [24]:
# Rename the Firstname column to Name and keep the result by reassigning df.
df = df.withColumnRenamed("Firstname", "Name")
df.show()

+--------+----+---------+------+------+
|    Name| Age|     City|Salary| Bonus|
+--------+----+---------+------+------+
|Simphiwe|  25|   Siteki|  2000| 400.0|
| Bandile|  26|  Manzini|  3000| 600.0|
| Nombuso|  35| Matsapha|  5000|1000.0|
|   Faith|  17|Kwaluseni|  1000| 200.0|
|   Bunmi|  37|  Tubungu|  3500| 700.0|
|  Godwin|NULL|  Mbabane|  5000|1000.0|
+--------+----+---------+------+------+



In [25]:
# Replace the city "Manzini" with "Ludzeludze" and leave every other city as it is.
# The result is displayed only; df is not reassigned.
df.withColumn(
    "City",
    when(col("City") == "Manzini", lit("Ludzeludze")).otherwise(col("City")),
).show()
            

+--------+----+----------+------+------+
|    Name| Age|      City|Salary| Bonus|
+--------+----+----------+------+------+
|Simphiwe|  25|    Siteki|  2000| 400.0|
| Bandile|  26|Ludzeludze|  3000| 600.0|
| Nombuso|  35|  Matsapha|  5000|1000.0|
|   Faith|  17| Kwaluseni|  1000| 200.0|
|   Bunmi|  37|   Tubungu|  3500| 700.0|
|  Godwin|NULL|   Mbabane|  5000|1000.0|
+--------+----+----------+------+------+



In [26]:
# Average salary and number of employees for each city.
df.groupBy('City').agg(
    avg("Salary").alias("Avg Salary"),
    count("*").alias("Num employees"),
).show()

[Stage 36:>                                                         (0 + 4) / 4]

+---------+----------+-------------+
|     City|Avg Salary|Num employees|
+---------+----------+-------------+
|   Siteki|    2000.0|            1|
| Matsapha|    5000.0|            1|
|  Manzini|    3000.0|            1|
|Kwaluseni|    1000.0|            1|
|  Mbabane|    5000.0|            1|
|  Tubungu|    3500.0|            1|
+---------+----------+-------------+



                                                                                

In [27]:
# Sort by Age from highest to lowest using the Column's desc() method;
# the row with a NULL Age ends up last.
df.orderBy(col("Age").desc()).show()

[Stage 39:>                                                         (0 + 4) / 4]

+--------+----+---------+------+------+
|    Name| Age|     City|Salary| Bonus|
+--------+----+---------+------+------+
|   Bunmi|  37|  Tubungu|  3500| 700.0|
| Nombuso|  35| Matsapha|  5000|1000.0|
| Bandile|  26|  Manzini|  3000| 600.0|
|Simphiwe|  25|   Siteki|  2000| 400.0|
|   Faith|  17|Kwaluseni|  1000| 200.0|
|  Godwin|NULL|  Mbabane|  5000|1000.0|
+--------+----+---------+------+------+



                                                                                

In [28]:
from pyspark.sql.functions import desc
# Same descending sort, this time built with the desc() function and the column name.
df.orderBy(desc("Age")).show()

+--------+----+---------+------+------+
|    Name| Age|     City|Salary| Bonus|
+--------+----+---------+------+------+
|   Bunmi|  37|  Tubungu|  3500| 700.0|
| Nombuso|  35| Matsapha|  5000|1000.0|
| Bandile|  26|  Manzini|  3000| 600.0|
|Simphiwe|  25|   Siteki|  2000| 400.0|
|   Faith|  17|Kwaluseni|  1000| 200.0|
|  Godwin|NULL|  Mbabane|  5000|1000.0|
+--------+----+---------+------+------+



The two approaches differ only in syntax. In the first, `col("Age")` returns a `Column` and we call its `desc()` method to sort in descending order; in the second, we call the `desc` function from `pyspark.sql.functions` and pass the column name as an argument. Both produce the same result.

In [29]:
# Show the data with missing Age values replaced by 0 (df itself is not modified).
df.fillna(0, subset=["Age"]).show()




+--------+---+---------+------+------+
|    Name|Age|     City|Salary| Bonus|
+--------+---+---------+------+------+
|Simphiwe| 25|   Siteki|  2000| 400.0|
| Bandile| 26|  Manzini|  3000| 600.0|
| Nombuso| 35| Matsapha|  5000|1000.0|
|   Faith| 17|Kwaluseni|  1000| 200.0|
|   Bunmi| 37|  Tubungu|  3500| 700.0|
|  Godwin|  0|  Mbabane|  5000|1000.0|
+--------+---+---------+------+------+



                                                                                

In [30]:
from pyspark.ml.feature import Imputer

# Fill missing Age values with the mean of the known ages, writing the
# result to a new Age1 column so the original Age column stays untouched.
imputer = Imputer(
    strategy='mean',
    inputCols=['Age'],
    outputCols=['Age1'],
)

# fit() computes the replacement value; transform() applies it.
model = imputer.fit(df)
df_imputed = model.transform(df)
df_imputed.show()

                                                                                

+--------+----+---------+------+------+----+
|    Name| Age|     City|Salary| Bonus|Age1|
+--------+----+---------+------+------+----+
|Simphiwe|  25|   Siteki|  2000| 400.0|  25|
| Bandile|  26|  Manzini|  3000| 600.0|  26|
| Nombuso|  35| Matsapha|  5000|1000.0|  35|
|   Faith|  17|Kwaluseni|  1000| 200.0|  17|
|   Bunmi|  37|  Tubungu|  3500| 700.0|  37|
|  Godwin|NULL|  Mbabane|  5000|1000.0|  28|
+--------+----+---------+------+------+----+

