In [1]:
"""
In Apache Spark, DataFrames are a higher-level abstraction built on top of RDDs, providing a more structured and efficient way to work with distributed data. DataFrame operations can be categorized into two types: Transformations and Actions.

Transformations:

Select/Project: Choose specific columns from the DataFrame.
Filter/Where: Apply a condition to filter rows.
GroupBy: Group the DataFrame based on one or more columns.
Sort/OrderBy: Sort the DataFrame based on one or more columns.
Join: Perform various types of joins between DataFrames.
Union: Combine two DataFrames with the same schema.
Drop: Remove specified columns from the DataFrame.
WithColumn: Add or replace a column with a new one.
Aggregations: Perform various aggregations like sum, avg, min, max, etc.
Distinct: Return distinct rows from the DataFrame.
Limit: Limit the number of rows in the DataFrame.
Actions:

Show: Display the content of the DataFrame in a tabular form.
Count: Return the number of rows in the DataFrame.
First: Return the first row of the DataFrame.
Collect: Return all rows of the DataFrame as an array to the driver program.
Write: Save the DataFrame to an external storage system (e.g., Parquet, CSV, etc.).
Describe: Compute summary statistics for numerical columns.
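PrintSchema: Display the schema of the DataFrame (listed here for convenience; it only prints already-known schema metadata and does not trigger a Spark job).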
Head: Return the first n rows of the DataFrame.
"""

'\nIn Apache Spark, DataFrames are a higher-level abstraction built on top of RDDs, providing a more structured and efficient way to work with distributed data. DataFrame operations can be categorized into two types: Transformations and Actions.\n\nTransformations:\n\nSelect/Project: Choose specific columns from the DataFrame.\nFilter/Where: Apply a condition to filter rows.\nGroupBy: Group the DataFrame based on one or more columns.\nSort/OrderBy: Sort the DataFrame based on one or more columns.\nJoin: Perform various types of joins between DataFrames.\nUnion: Combine two DataFrames with the same schema.\nDrop: Remove specified columns from the DataFrame.\nWithColumn: Add or replace a column with a new one.\nAggregations: Perform various aggregations like sum, avg, min, max, etc.\nDistinct: Return distinct rows from the DataFrame.\nLimit: Limit the number of rows in the DataFrame.\nActions:\n\nShow: Display the content of the DataFrame in a tabular form.\nCount: Return the number of r

In [5]:
from pyspark import *
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [16]:
# Sample product catalogue. Each record is (product_id, category, price),
# in the same field order that `product_schema` declares.
data = [
    (1,  "Electronics", 500.0),
    (2,  "Clothing",     30.9),
    (3,  "Electronics",  80.0),
    (4,  "Clothing",     25.0),
    (5,  "Books",        20.0),
    (6,  "Electronics", 700.0),
    (7,  "Books",        15.0),
    (8,  "Clothing",     35.0),
    (9,  "Electronics", 600.0),
    (10, "Books",        25.0),
]

In [17]:
# Explicit schema for the product records: an integer id, a string category
# and a 32-bit float price. Every field is declared nullable.
product_schema = (
    StructType()
    .add("product_id", IntegerType(), True)
    .add("category", StringType(), True)
    .add("price", FloatType(), True)
)

In [18]:
# Obtain the SparkSession used by every cell below (application name "example").
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)



In [19]:
# Build the products DataFrame from the in-memory records, applying the
# explicit schema instead of letting Spark infer column types.
products_df = spark.createDataFrame(
    data=data,
    schema=product_schema,
)

In [23]:
# Print the freshly created DataFrame as a text table (show()'s defaults,
# spelled out: at most 20 rows, long cell values truncated).
products_df.show(n=20, truncate=True)

+----------+-----------+-----+
|product_id|   category|price|
+----------+-----------+-----+
|         1|Electronics|500.0|
|         2|   Clothing| 30.9|
|         3|Electronics| 80.0|
|         4|   Clothing| 25.0|
|         5|      Books| 20.0|
|         6|Electronics|700.0|
|         7|      Books| 15.0|
|         8|   Clothing| 35.0|
|         9|Electronics|600.0|
|        10|      Books| 25.0|
+----------+-----------+-----+



In [20]:
# Number of products per category, largest categories first, keeping at
# most the top three.
result_df = (
    products_df
    .groupBy("category")
    .count()
    .withColumnRenamed("count", "product_count")
    .orderBy("product_count", ascending=False)
    .limit(3)
)

In [21]:
# Print the per-category product counts (show()'s defaults, spelled out).
result_df.show(n=20, truncate=True)

+-----------+-------------+
|   category|product_count|
+-----------+-------------+
|Electronics|            4|
|   Clothing|            3|
|      Books|            3|
+-----------+-------------+



In [24]:
# Preview only: add a constant `sales` column of 1 to every row and print it.
# The result is not assigned, so `products_df` itself is left unchanged.
(
    products_df
    .withColumn("sales", lit(1))
    .show()
)

+----------+-----------+-----+-----+
|product_id|   category|price|sales|
+----------+-----------+-----+-----+
|         1|Electronics|500.0|    1|
|         2|   Clothing| 30.9|    1|
|         3|Electronics| 80.0|    1|
|         4|   Clothing| 25.0|    1|
|         5|      Books| 20.0|    1|
|         6|Electronics|700.0|    1|
|         7|      Books| 15.0|    1|
|         8|   Clothing| 35.0|    1|
|         9|Electronics|600.0|    1|
|        10|      Books| 25.0|    1|
+----------+-----------+-----+-----+



In [41]:
# Attach a `sales` column: 24 for product 6, 8 for product 8, and 18 for
# every other product. Rebinds `products_df` to the widened DataFrame.
products_df = products_df.withColumn(
    "sales",
    when(products_df["product_id"] == 6, 24)
    .when(products_df["product_id"] == 8, 8)
    .otherwise(18),
)

In [42]:
# Print the DataFrame again to check the conditional `sales` values.
products_df.show(n=20, truncate=True)

+----------+-----------+-----+-----+
|product_id|   category|price|sales|
+----------+-----------+-----+-----+
|         1|Electronics|500.0|   18|
|         2|   Clothing| 30.9|   18|
|         3|Electronics| 80.0|   18|
|         4|   Clothing| 25.0|   18|
|         5|      Books| 20.0|   18|
|         6|Electronics|700.0|   24|
|         7|      Books| 15.0|   18|
|         8|   Clothing| 35.0|    8|
|         9|Electronics|600.0|   18|
|        10|      Books| 25.0|   18|
+----------+-----------+-----+-----+



In [68]:
def ratiofun(x, y):
    """Return the floor quotient of ``x`` divided by ``y``.

    This uses floor division (``//``), so the result is rounded down:
    ``ratiofun(7, 2)`` is ``3`` and ``ratiofun(18, 500)`` is ``0``.

    Args:
        x: Dividend.
        y: Divisor.

    Returns:
        ``x // y``.

    Raises:
        ZeroDivisionError: If ``y`` is zero (for built-in numeric types).
    """
    floor_quotient = x // y
    return floor_quotient

In [80]:
# Convert `price` from float to integer, replacing the column in place.
# The fractional part is dropped (e.g. 30.9 becomes 30).
products_df = products_df.withColumn(
    "price",
    products_df["price"].cast("int"),
)

In [81]:
# Print the DataFrame to confirm that `price` now holds whole numbers.
products_df.show(n=20, truncate=True)

+----------+-----------+-----+-----+
|product_id|   category|price|sales|
+----------+-----------+-----+-----+
|         1|Electronics|  500|   18|
|         2|   Clothing|   30|   18|
|         3|Electronics|   80|   18|
|         4|   Clothing|   25|   18|
|         5|      Books|   20|   18|
|         6|Electronics|  700|   24|
|         7|      Books|   15|   18|
|         8|   Clothing|   35|    8|
|         9|Electronics|  600|   18|
|        10|      Books|   25|   18|
+----------+-----------+-----+-----+



In [92]:
# Preview only: derive `ratio` as sales divided by price (true division) and
# print it. The result is not assigned, so `products_df` keeps its columns.
(
    products_df
    .withColumn("ratio", products_df["sales"] / products_df["price"])
    .show()
)

+----------+-----------+-----+-----+-------------------+
|product_id|   category|price|sales|              ratio|
+----------+-----------+-----+-----+-------------------+
|         1|Electronics|  500|   18|              0.036|
|         2|   Clothing|   30|   18|                0.6|
|         3|Electronics|   80|   18|              0.225|
|         4|   Clothing|   25|   18|               0.72|
|         5|      Books|   20|   18|                0.9|
|         6|Electronics|  700|   24|0.03428571428571429|
|         7|      Books|   15|   18|                1.2|
|         8|   Clothing|   35|    8|0.22857142857142856|
|         9|Electronics|  600|   18|               0.03|
|        10|      Books|   25|   18|               0.72|
+----------+-----------+-----+-----+-------------------+



In [76]:
def get_spark_session(master='local', app_name='job posting Analysis'):
    """Return the SparkSession for the job-posting analysis.

    The original cell was a method body pasted at top level: it assigned to an
    undefined ``self`` and used ``return`` outside a function, which is a
    SyntaxError. Wrapping the same builder call in a function makes the cell
    valid while keeping the original master and application name as defaults.

    Args:
        master: Spark master URL handed to the session builder.
        app_name: Application name handed to the session builder.

    Returns:
        The session produced by ``getOrCreate()``. That call hands back an
        already-active session when one exists instead of starting a new one.
    """
    return (
        SparkSession.builder
        .master(master)
        .appName(app_name)
        .getOrCreate()
    )

In [None]:
def jobs_in_department(jobpostings, department):
    """Return the rows of ``jobpostings`` whose ``department`` equals ``department``.

    The original cell was a method body pasted at top level (undefined
    ``self``, ``return`` outside a function) and spliced ``department`` into
    the SQL text unquoted. A string value would then be parsed as SQL rather
    than compared as data. The comparison is now a Column expression, so the
    value is always treated as a literal.

    Args:
        jobpostings: DataFrame of job postings with a ``department`` column.
        department: Value to match against the ``department`` column.

    Returns:
        A DataFrame with all columns of ``jobpostings``, restricted to the
        matching rows.
    """
    # Still register the view, as the original cell did, so SQL cells that
    # query `jobs` keep working.
    jobpostings.createOrReplaceTempView('jobs')
    return jobpostings.where(jobpostings['department'] == department)

In [None]:
def total_positions(spark, jobpostings):
    """Return the sum of the ``numpositions`` column over all job postings.

    The original cell was a method body pasted at top level (undefined
    ``self``, ``return`` outside a function, and an ``f`` prefix on a string
    with no placeholders). The session is now passed in explicitly.

    Args:
        spark: SparkSession used to run the SQL query.
        jobpostings: DataFrame of job postings with a ``numpositions`` column.

    Returns:
        The first column of the first result row, i.e. the aggregated sum.
        This may be ``None`` when the SQL aggregate is NULL (for example when
        there are no rows).
    """
    # The query reads from the `jobs` view, so (re)register it first.
    jobpostings.createOrReplaceTempView('jobs')
    return spark.sql('select sum(numpositions) from jobs').first()[0]