In [None]:
!apt-get update # Update apt-get repository.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
!pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,351 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,847 kB]
Get:13 http://archive.ubuntu.com/ubuntu jamm

In [None]:
import os

# Tell the toolchain where the Java runtime and the unpacked Spark
# distribution live. Both variables are set in one step.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

# Initialise findspark only after the variables above are in place.
# NOTE(review): presumably init() locates Spark through SPARK_HOME - confirm.
import findspark
findspark.init()

In [None]:
# Optional step (Google Colab only): mount Google Drive under /content/drive.
# Left disabled on purpose; uncomment the two lines below to enable it.
# These are real comments rather than a bare triple-quoted string, because a
# string literal on its own is echoed back by the notebook as cell output.
# from google.colab import drive
# drive.mount('/content/drive')

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n"

In [None]:
from pyspark.sql import SparkSession

# Obtain a session for the application name 'sparkdf'
# (getOrCreate is called on the configured builder).
session_builder = SparkSession.builder.appName('sparkdf')
spark = session_builder.getOrCreate()

# Column order for every student record below.
student_header = ['ID', 'NAME', 'DEPT', 'FEE']

# One row per student; ID is kept as a string, FEE as an integer.
student_rows = [
    ["1", "sravan", "IT", 45000],
    ["2", "ojaswi", "CS", 85000],
    ["3", "rohith", "CS", 41000],
    ["4", "sridevi", "IT", 56000],
    ["5", "bobby", "ECE", 45000],
    ["6", "gayatri", "ECE", 49000],
    ["7", "gnanesh", "CS", 45000],
    ["8", "bhanu", "Mech", 21000],
]

# Build the students DataFrame and print it.
df = spark.createDataFrame(student_rows, student_header)
df.show()

+---+-------+----+-----+
| ID|   NAME|DEPT|  FEE|
+---+-------+----+-----+
|  1| sravan|  IT|45000|
|  2| ojaswi|  CS|85000|
|  3| rohith|  CS|41000|
|  4|sridevi|  IT|56000|
|  5|  bobby| ECE|45000|
|  6|gayatri| ECE|49000|
|  7|gnanesh|  CS|45000|
|  8|  bhanu|Mech|21000|
+---+-------+----+-----+



In [None]:
from pyspark.sql import functions as f

# Grouping key(s) for the per-department summary.
gcols = ['DEPT']

# Aggregate expressions, named up front so the agg() call reads as a list.
# NOTE(review): 'fee' is lower-case while the column was declared as 'FEE';
# this appears to rely on case-insensitive column resolution - confirm.
fee_total = f.sum('fee').alias("Dept wise total fees")
fee_mean = f.avg('fee').alias("Average fees per dept")

# Total and average fee per department, printed without truncating columns.
dept_summary = df.groupBy(gcols).agg(fee_total, fee_mean)
dept_summary.show(truncate=False)

+----+--------------------+---------------------+
|DEPT|Dept wise total fees|Average fees per dept|
+----+--------------------+---------------------+
|ECE |94000               |47000.0              |
|IT  |101000              |50500.0              |
|CS  |171000              |57000.0              |
|Mech|21000               |21000.0              |
+----+--------------------+---------------------+



In [None]:
# Register the students DataFrame so it can be queried by name from SQL.
df.createOrReplaceTempView("students")

# Same per-department total/average as a SQL query, ordered by department.
# Adjacent string literals inside the parentheses are concatenated, so no
# backslash line continuations are needed.
sql_str = (
    "select dept, sum(fee) as Dept_wise_total_fees, avg(fee) as Average_fees_per_dept"
    " from students"
    " group by dept"
    " order by dept"
)

dept_totals = spark.sql(sql_str)
dept_totals.show()

+----+--------------------+---------------------+
|dept|Dept_wise_total_fees|Average_fees_per_dept|
+----+--------------------+---------------------+
|  CS|              171000|              57000.0|
| ECE|               94000|              47000.0|
|  IT|              101000|              50500.0|
|Mech|               21000|              21000.0|
+----+--------------------+---------------------+



Inner join using the DataFrame `join` function

In [None]:
# Left side of the join: who works where.
employee_header = ['ID', 'NAME', 'Company']
employee_rows = [
    ["1", "sravan", "company 1"],
    ["2", "ojaswi", "company 1"],
    ["3", "rohith", "company 2"],
    ["4", "sridevi", "company 1"],
    ["5", "bobby", "company 1"],
]

df1 = spark.createDataFrame(employee_rows, employee_header)
print("df1 show")
df1.show()

# Right side of the join: salary and department per ID. IDs 3 and 4 have no
# row here, and ID 6 has no counterpart in df1.
salary_header = ['ID', 'salary', 'department']
salary_rows = [
    ["1", "45000", "IT"],
    ["2", "145000", "Manager"],
    ["6", "45000", "HR"],
    ["5", "34000", "Sales"],
]

df2 = spark.createDataFrame(salary_rows, salary_header)
print("df2 show")
df2.show()

df1 show
+---+-------+---------+
| ID|   NAME|  Company|
+---+-------+---------+
|  1| sravan|company 1|
|  2| ojaswi|company 1|
|  3| rohith|company 2|
|  4|sridevi|company 1|
|  5|  bobby|company 1|
+---+-------+---------+

df2 show
+---+------+----------+
| ID|salary|department|
+---+------+----------+
|  1| 45000|        IT|
|  2|145000|   Manager|
|  6| 45000|        HR|
|  5| 34000|     Sales|
+---+------+----------+



In [None]:
#--join functions
# Inner join on equal IDs. Because the condition is a column expression,
# the result keeps the ID column from both frames.
same_id = df1.ID == df2.ID
joined = df1.join(df2, on=same_id, how="inner")
joined.show()

+---+------+---------+---+------+----------+
| ID|  NAME|  Company| ID|salary|department|
+---+------+---------+---+------+----------+
|  5| bobby|company 1|  5| 34000|     Sales|
|  1|sravan|company 1|  1| 45000|        IT|
|  2|ojaswi|company 1|  2|145000|   Manager|
+---+------+---------+---+------+----------+



Inner join using SQL

In [None]:
# Register both frames as temporary views under the names the query uses.
for view_name, frame in (("emp", df1), ("dept", df2)):
    frame.createOrReplaceTempView(view_name)

# Inner join written as a comma join with the match condition in WHERE.
sql_str = "select * from emp e, dept d where e.ID = d.ID"
emp_with_dept = spark.sql(sql_str)
emp_with_dept.show()

+---+------+---------+---+------+----------+
| ID|  NAME|  Company| ID|salary|department|
+---+------+---------+---+------+----------+
|  5| bobby|company 1|  5| 34000|     Sales|
|  1|sravan|company 1|  1| 45000|        IT|
|  2|ojaswi|company 1|  2|145000|   Manager|
+---+------+---------+---+------+----------+



Union using the DataFrame `union` function and a SQL query

In [None]:
#--create dataframes
# Both frames share one schema, so it is declared once.
score_header = ["Student Name", "Overall Percentage"]

df1 = spark.createDataFrame(
    [("Bhuwanesh", 82.98), ("Harshit", 80.31)],
    score_header
)

df2 = spark.createDataFrame(
    [("Naveen", 91.123), ("Piyush", 90.51)],
    score_header
)

# DataFrame.union appends the rows of df2 to df1 and keeps duplicates
# (UNION ALL semantics).
print ("Union using union function")
df1.union(df2).show()

print ("Union using SQL query")
df1.createOrReplaceTempView("set1")
df2.createOrReplaceTempView("set2")

# "union all" is the SQL counterpart of DataFrame.union. A bare "union"
# would also remove duplicate rows, so the two approaches would diverge as
# soon as the inputs contain repeated rows.
sql_qry = (
    "select * from set1"
    " union all"
    " select * from set2"
)
spark.sql(sql_qry).show()


Union using union function
+------------+------------------+
|Student Name|Overall Percentage|
+------------+------------------+
|   Bhuwanesh|             82.98|
|     Harshit|             80.31|
|      Naveen|            91.123|
|      Piyush|             90.51|
+------------+------------------+

Union using SQL query
+------------+------------------+
|Student Name|Overall Percentage|
+------------+------------------+
|      Naveen|            91.123|
|   Bhuwanesh|             82.98|
|      Piyush|             90.51|
|     Harshit|             80.31|
+------------+------------------+

