In [9]:
#Importing the packages
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as func

In [10]:
# Obtain the SparkSession for this notebook: getOrCreate() builds a session
# named "FirstApp" or hands back the one that is already active.
spark = (
    SparkSession.builder
    .appName("FirstApp")
    .getOrCreate()
)

In [11]:
# Explicit schema handed to the CSV reader (no type inference):
# four nullable fields -- userID, name, age, friends.
myschema = (
    StructType()
    .add("userID", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("friends", IntegerType(), True)
)

In [12]:
# Read fakefriends.csv into a DataFrame, applying myschema to its columns.
people = (
    spark.read
    .schema(myschema)
    .format("csv")
    .option("path", "fakefriends.csv")
    .load()
)

In [13]:
# Apply all the transformations in one chain:
#   1. keep the four source columns,
#   2. retain only rows where age is below 30,
#   3. append an insert_ts column holding the current timestamp,
#   4. sort the result by userID.
output = (
    people
    .select("userID", "name", "age", "friends")
    .filter(func.col("age") < 30)
    .withColumn("insert_ts", func.current_timestamp())
    .sort("userID")
)

In [14]:
# Count the rows of the output DataFrame (those with age < 30);
# as the cell's last expression, the number is displayed as the cell result.
output.count()

112

In [15]:
# Register the output DataFrame as the temporary view "peoples" (replacing any
# existing view with that name) so it can be referenced from Spark SQL.
output.createOrReplaceTempView("peoples")

In [16]:
# Query the "peoples" view with Spark SQL, projecting name, age, friends and
# insert_ts; show(2) prints only the first two rows of the result.
spark.sql("select name,age,friends,insert_ts from peoples").show(2)

+--------+---+-------+--------------------+
|    name|age|friends|           insert_ts|
+--------+---+-------+--------------------+
|Jean-Luc| 26|      2|2024-03-15 14:41:...|
|    Hugh| 27|    181|2024-03-15 14:41:...|
+--------+---+-------+--------------------+
only showing top 2 rows

