In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
# Build a one-column DataFrame holding the integers 1 through 5.
columns = ["Number"]
data = [(value,) for value in range(1, 6)]
spark = (
    SparkSession.builder
    .appName("SquareIntegers")
    .getOrCreate()
)
df = spark.createDataFrame(data, schema=columns)
def square_function(number):
    """Return ``number`` squared, propagating missing values.

    Args:
        number: A numeric value, or ``None`` (which is how a SQL NULL
            reaches a Python UDF).

    Returns:
        ``number ** 2``, or ``None`` when ``number`` is ``None`` so that a
        NULL input yields a NULL output instead of raising ``TypeError``.
    """
    if number is None:
        return None
    return number ** 2
# Expose the Python function to Spark SQL. The return type is declared
# explicitly: a Python function registered without one produces a string
# column, which would turn the squares into text ("1.0", "4.0", ...).
spark.udf.register("square_udf", square_function, "double")

# Cast inside the expression so the UDF receives floats and its result
# matches the declared double return type.
result_df = df.withColumn("Squared", expr("square_udf(CAST(Number AS DOUBLE))"))
result_df.show()

# The leftover `spark.sql("select ")` call was dropped: it is an incomplete
# query that cannot be parsed, so it raised before the session was stopped.
spark.stop()



+------+-------+
|Number|Squared|
+------+-------+
|     1|    1.0|
|     2|    4.0|
|     3|    9.0|
|     4|   16.0|
|     5|   25.0|
+------+-------+



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max
# Find the largest value of a single-column DataFrame with an aggregate.
# NOTE: `max` below is pyspark.sql.functions.max (imported above), not the
# Python builtin.
columns = ["Number"]
data = [(10,), (25,), (15,), (30,), (20,)]
spark = (
    SparkSession.builder
    .appName("MaxNumber")
    .getOrCreate()
)
df = spark.createDataFrame(data, schema=columns)

# Take the first collected row and read the aliased column out of it.
max_row = df.agg(max("Number").alias("MaxNumber")).collect()[0]
max_value = max_row["MaxNumber"]
print("Maximum number:", max_value)
spark.stop()



Maximum number: 30


In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
# Compute the mean of a single-column DataFrame with an aggregate.
columns = ["Number"]
data = [(10,), (25,), (15,), (30,), (20,)]
spark = (
    SparkSession.builder
    .appName("AverageNumbers")
    .getOrCreate()
)
df = spark.createDataFrame(data, schema=columns)

# Take the first collected row and read the aliased column out of it.
average_row = df.agg(avg("Number").alias("AverageNumber")).collect()[0]
average_value = average_row["AverageNumber"]
print("Average number:", average_value)
spark.stop()



Average number: 20.0


In [7]:
from pyspark.sql import SparkSession
# Load the CSV (first row is the header, column types are inferred) and
# print its contents.
csv_file_path = "./data.csv"
spark = (
    SparkSession.builder
    .appName("ReadCSV")
    .getOrCreate()
)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_file_path)
)
df.show()
spark.stop()



+-----+------+
|Month|Volume|
+-----+------+
|  Jan|     5|
|  Feb|     6|
|  Mar|     7|
|  Apr|     5|
|  May|     6|
|  Jun|     9|
|  Jul|     0|
|  Aug|     7|
|  Sep|     4|
|  Oct|     3|
|  Nov|     1|
|  Dec|     2|
+-----+------+



In [8]:
from pyspark.sql import SparkSession
# Load ./data.csv, then print its rows followed by the inferred schema.
spark = (
    SparkSession.builder
    .appName("DisplayDataFrame")
    .getOrCreate()
)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./data.csv")
)

print("First few rows:")
df.show()

print("Schema:")
df.printSchema()

spark.stop()



First few rows:
+-----+------+
|Month|Volume|
+-----+------+
|  Jan|     5|
|  Feb|     6|
|  Mar|     7|
|  Apr|     5|
|  May|     6|
|  Jun|     9|
|  Jul|     0|
|  Aug|     7|
|  Sep|     4|
|  Oct|     3|
|  Nov|     1|
|  Dec|     2|
+-----+------+

Schema:
root
 |-- Month: string (nullable = true)
 |-- Volume: integer (nullable = true)



In [10]:
from pyspark.sql import SparkSession
# Print the describe() summary table for every column of ./data.csv.
spark = (
    SparkSession.builder
    .appName("ColumnSummaryStats")
    .getOrCreate()
)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./data.csv")
)

# describe() returns a new DataFrame of statistics; show() prints it.
column_summary = df.describe()
column_summary.show()
spark.stop()



+-------+-----+-----------------+
|summary|Month|           Volume|
+-------+-----+-----------------+
|  count|   12|               12|
|   mean| null|4.583333333333333|
| stddev| null|2.678477631835372|
|    min|  Apr|                0|
|    max|  Sep|                9|
+-------+-----+-----------------+



In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
# Square each number with a plain Spark SQL query over a temporary view.
columns = ["Number"]
data = [(value,) for value in range(1, 6)]
spark = (
    SparkSession.builder
    .appName("SquareIntegers")
    .getOrCreate()
)
df = spark.createDataFrame(data, schema=columns)

# Register the DataFrame under the table name the query refers to.
df.createOrReplaceTempView("NUMS")
df2 = spark.sql(
    "SELECT Number,Number*Number as squared from NUMS"
)
df2.show()

+------+-------+
|Number|squared|
+------+-------+
|     1|      1|
|     2|      4|
|     3|      9|
|     4|     16|
|     5|     25|
+------+-------+



In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
# Compute the maximum of the numbers 1..5 with a Spark SQL aggregate.
# The app name used to be "SquareIntegers", copy-pasted from the squaring
# cell; it now says what this job actually does.
spark = SparkSession.builder.appName("MaxNumberSQL").getOrCreate()
data = [(1,), (2,), (3,), (4,), (5,)]
columns = ["Number"]
df = spark.createDataFrame(data, columns)

# Register the DataFrame under the table name the query refers to.
df.createOrReplaceTempView("NUMS")
df2 = spark.sql("SELECT  MAX(Number) as maximum from NUMS")

# Print the input first, then the single-row aggregate result.
df.show()
df2.show()

+------+
|Number|
+------+
|     1|
|     2|
|     3|
|     4|
|     5|
+------+

+-------+
|maximum|
+-------+
|      5|
+-------+



In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
# Compute the average of the numbers 1..5 with a Spark SQL aggregate.
# The app name used to be "SquareIntegers", copy-pasted from the squaring
# cell; it now says what this job actually does.
spark = SparkSession.builder.appName("AverageNumbersSQL").getOrCreate()
data = [(1,), (2,), (3,), (4,), (5,)]
columns = ["Number"]
df = spark.createDataFrame(data, columns)

# Register the DataFrame under the table name the query refers to.
df.createOrReplaceTempView("NUMS")
df2 = spark.sql("SELECT  AVG(Number) as average from NUMS")

# Print the input first, then the single-row aggregate result.
df.show()
df2.show()

+------+
|Number|
+------+
|     1|
|     2|
|     3|
|     4|
|     5|
+------+

+-------+
|average|
+-------+
|    3.0|
+-------+



In [23]:
from pyspark.sql import SparkSession
# Load the CSV (first row is the header, column types are inferred) and
# print its contents.
csv_file_path = "./data.csv"
spark = (
    SparkSession.builder
    .appName("ReadCSV")
    .getOrCreate()
)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_file_path)
)
df.show()
spark.stop()

+-----+------+
|Month|Volume|
+-----+------+
|  Jan|     5|
|  Feb|     6|
|  Mar|     7|
|  Apr|     5|
|  May|     6|
|  Jun|     9|
|  Jul|     0|
|  Aug|     7|
|  Sep|     4|
|  Oct|     3|
|  Nov|     1|
|  Dec|     2|
+-----+------+



In [24]:
from pyspark.sql import SparkSession
# Load ./data.csv, then print its rows followed by the inferred schema.
spark = (
    SparkSession.builder
    .appName("DisplayDataFrame")
    .getOrCreate()
)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./data.csv")
)

print("First few rows:")
df.show()

print("Schema:")
df.printSchema()

spark.stop()



First few rows:
+-----+------+
|Month|Volume|
+-----+------+
|  Jan|     5|
|  Feb|     6|
|  Mar|     7|
|  Apr|     5|
|  May|     6|
|  Jun|     9|
|  Jul|     0|
|  Aug|     7|
|  Sep|     4|
|  Oct|     3|
|  Nov|     1|
|  Dec|     2|
+-----+------+

Schema:
root
 |-- Month: string (nullable = true)
 |-- Volume: integer (nullable = true)



In [25]:
from pyspark.sql import SparkSession
# Print the describe() summary table for every column of ./data.csv.
spark = (
    SparkSession.builder
    .appName("ColumnSummaryStats")
    .getOrCreate()
)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./data.csv")
)

# describe() returns a new DataFrame of statistics; show() prints it.
column_summary = df.describe()
column_summary.show()
spark.stop()



+-------+-----+-----------------+
|summary|Month|           Volume|
+-------+-----+-----------------+
|  count|   12|               12|
|   mean| null|4.583333333333333|
| stddev| null|2.678477631835372|
|    min|  Apr|                0|
|    max|  Sep|                9|
+-------+-----+-----------------+

