In [4]:
from pyspark.sql import SparkSession
# Configure the application name first, then obtain the session: getOrCreate()
# reuses a session that is already active instead of starting a second one.
session_builder = SparkSession.builder.appName("MyFirstPySpark")
spark = session_builder.getOrCreate()

In [5]:
# Small in-memory sample: one (fruit name, count) tuple per row.
data = [
    ("Apple", 5),
    ("Banana", 343),
    ("Orange", 9),
]
# Column names, in the same order as the tuple fields above.
columns = ["Fruit", "Count"]

# Turn the Python rows into a Spark DataFrame and print it.
fruit_box = spark.createDataFrame(data, schema=columns)
fruit_box.show()

+------+-----+
| Fruit|Count|
+------+-----+
| Apple|    5|
|Banana|  343|
|Orange|    9|
+------+-----+



In [6]:
# Project only the "Fruit" column and print it.
fruit_box.select(fruit_box["Fruit"]).show()

+------+
| Fruit|
+------+
| Apple|
|Banana|
|Orange|
+------+



In [7]:
# Project only the "Count" column and print it.
fruit_box.select(fruit_box["Count"]).show()

+-----+
|Count|
+-----+
|    5|
|  343|
|    9|
+-----+



In [8]:
# Keep only the rows whose Count is strictly greater than 234.
fruit_box.where(fruit_box["Count"] > 234).show()

+------+-----+
| Fruit|Count|
+------+-----+
|Banana|  343|
+------+-----+



In [9]:
from pyspark.sql.functions import when

# "Yes" when more than 5 of the fruit are in the box, otherwise "No".
popularity_flag = when(fruit_box["Count"] > 5, "Yes").otherwise("No")

# Per-fruit price label; any fruit not listed falls through to "No".
unit_cost = (
    when(fruit_box["Fruit"] == "Apple", "20$")
    .when(fruit_box["Fruit"] == "Banana", "30$")
    .when(fruit_box["Fruit"] == "Orange", "10$")
    .otherwise("No")
)

# Attach both derived columns, then print the widened DataFrame.
fruit_box = (
    fruit_box
    .withColumn("IsPopular", popularity_flag)
    .withColumn("Cost per fruit", unit_cost)
)
fruit_box.show()

+------+-----+---------+--------------+
| Fruit|Count|IsPopular|Cost per fruit|
+------+-----+---------+--------------+
| Apple|    5|       No|           20$|
|Banana|  343|      Yes|           30$|
|Orange|    9|      Yes|           10$|
+------+-----+---------+--------------+



In [10]:
# Now let's practise on the provided data set.
# header=True uses the first CSV row as column names; inferSchema=True asks
# Spark to infer column types instead of reading everything as strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("fruits.csv")
df.show()

+----+-------------+-----+
|S.no|        Fruit|Count|
+----+-------------+-----+
|   1|        Apple|    5|
|   2|        Mango|    6|
|   3|       Banana|    3|
|   4|       Cherry|    7|
|   5|Straw berries|    5|
|   6|       Orange|    9|
|   7|       Grapes|    7|
|   8|  BlueBerries|    8|
|   9|         Kiwi|   23|
|  10|       Leechi|    5|
+----+-------------+-----+



In [11]:
# Print each column on its own: first "Fruit", then "Count".
for column_name in ("Fruit", "Count"):
    df.select(column_name).show()


+-------------+
|        Fruit|
+-------------+
|        Apple|
|        Mango|
|       Banana|
|       Cherry|
|Straw berries|
|       Orange|
|       Grapes|
|  BlueBerries|
|         Kiwi|
|       Leechi|
+-------------+

+-----+
|Count|
+-----+
|    5|
|    6|
|    3|
|    7|
|    5|
|    9|
|    7|
|    8|
|   23|
|    5|
+-----+



In [12]:
# Show the matching rows for three separate Count thresholds, in order:
# more than 10, more than 4, and fewer than 5.
count_conditions = (
    df["Count"] > 10,
    df["Count"] > 4,
    df["Count"] < 5,
)
for condition in count_conditions:
    df.filter(condition).show()

+----+-----+-----+
|S.no|Fruit|Count|
+----+-----+-----+
|   9| Kiwi|   23|
+----+-----+-----+

+----+-------------+-----+
|S.no|        Fruit|Count|
+----+-------------+-----+
|   1|        Apple|    5|
|   2|        Mango|    6|
|   4|       Cherry|    7|
|   5|Straw berries|    5|
|   6|       Orange|    9|
|   7|       Grapes|    7|
|   8|  BlueBerries|    8|
|   9|         Kiwi|   23|
|  10|       Leechi|    5|
+----+-------------+-----+

+----+------+-----+
|S.no| Fruit|Count|
+----+------+-----+
|   3|Banana|    3|
+----+------+-----+



In [13]:
from pyspark.sql.functions import when

# Label a fruit as popular when its Count is greater than 5.
df = df.withColumn(
    "IsPopular",
    when(df["Count"] > 5, "famous fruit for people").otherwise("Not Famous"),
)

# Price label for each known fruit. Insertion order is the order in which the
# when-branches are chained below.
price_by_fruit = {
    "Apple": "2$",
    "Mango": "1$",
    "Banana": "0.9$",
    "Cherry": "5$",
    "Straw berries": "20$",
    "Orange": "2$",
    "Grapes": "5$",
    "BlueBerries": "10$",
    "Kiwi": "9$",
    "Leechi": "40$",
}

# Build one chained when(...).when(...) expression from the table. There is
# deliberately no otherwise(): a fruit missing from the table gets a null cost.
cost_expr = None
for fruit_name, price in price_by_fruit.items():
    is_this_fruit = df["Fruit"] == fruit_name
    if cost_expr is None:
        cost_expr = when(is_this_fruit, price)
    else:
        cost_expr = cost_expr.when(is_this_fruit, price)

df = df.withColumn("Cost of one fruit", cost_expr)
df.show()

+----+-------------+-----+--------------------+-----------------+
|S.no|        Fruit|Count|           IsPopular|Cost of one fruit|
+----+-------------+-----+--------------------+-----------------+
|   1|        Apple|    5|          Not Famous|               2$|
|   2|        Mango|    6|famous fruit for ...|               1$|
|   3|       Banana|    3|          Not Famous|             0.9$|
|   4|       Cherry|    7|famous fruit for ...|               5$|
|   5|Straw berries|    5|          Not Famous|              20$|
|   6|       Orange|    9|famous fruit for ...|               2$|
|   7|       Grapes|    7|famous fruit for ...|               5$|
|   8|  BlueBerries|    8|famous fruit for ...|              10$|
|   9|         Kiwi|   23|famous fruit for ...|               9$|
|  10|       Leechi|    5|          Not Famous|              40$|
+----+-------------+-----+--------------------+-----------------+



In [14]:
# Practice set: a DataFrame of animals and their ages.
animals = [
    ("Dog", 5),
    ("Cat", 3),
    ("Elephant", 20),
    ("Mouse", 1),
]
animal_df = spark.createDataFrame(animals, schema=["Animal", "Age"])
animal_df.show()

+--------+---+
|  Animal|Age|
+--------+---+
|     Dog|  5|
|     Cat|  3|
|Elephant| 20|
|   Mouse|  1|
+--------+---+



In [15]:
# Exercises on animal_df:
# 1. Show the animals matching each age condition in turn: older than 4,
#    younger than 5, exactly 20, at most 5, and at least 4.
# 2. Add a column "IsPet" with "Yes" for Dog and Cat, "No" for others.
#    TODO: not implemented in this cell.
# 3. Show the average age of all animals.
#    TODO: not implemented in this cell.
age_conditions = (
    animal_df["Age"] > 4,
    animal_df["Age"] < 5,
    animal_df["Age"] == 20,
    animal_df["Age"] <= 5,
    animal_df["Age"] >= 4,
)
for age_condition in age_conditions:
    animal_df.filter(age_condition).show()

+--------+---+
|  Animal|Age|
+--------+---+
|     Dog|  5|
|Elephant| 20|
+--------+---+

+------+---+
|Animal|Age|
+------+---+
|   Cat|  3|
| Mouse|  1|
+------+---+

+--------+---+
|  Animal|Age|
+--------+---+
|Elephant| 20|
+--------+---+

+------+---+
|Animal|Age|
+------+---+
|   Dog|  5|
|   Cat|  3|
| Mouse|  1|
+------+---+

+--------+---+
|  Animal|Age|
+--------+---+
|     Dog|  5|
|Elephant| 20|
+--------+---+



In [17]:
from pyspark.sql.functions import when

# Dog and Elephant are labelled "Dangerous"; every other animal gets
# "Not Dangerous".
is_dangerous_animal = animal_df["Animal"].isin("Dog", "Elephant")
danger_label = when(is_dangerous_animal, "Dangerous").otherwise("Not Dangerous")

animal_df = animal_df.withColumn("Is Dangerous", danger_label)
animal_df.show()

+--------+---+-------------+
|  Animal|Age| Is Dangerous|
+--------+---+-------------+
|     Dog|  5|    Dangerous|
|     Cat|  3|Not Dangerous|
|Elephant| 20|    Dangerous|
|   Mouse|  1|Not Dangerous|
+--------+---+-------------+



In [18]:
# Shut down the Spark session created at the top of the notebook. Cells that
# use `spark` need the session to be created again before they are re-run.
spark.stop()