In [None]:
git add your_file.sql
git commit -m "Add SQL file"
git push

In [None]:
# Input path and the reader format used to parse it
file_location = "/FileStore/tables/iot_devices.json"
file_type = "json"

# Reader settings that only matter for CSV input
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# Build the reader as one parenthesised chain. The three options below are
# CSV-specific; for other file types they will be ignored.
df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

display(df)

In [None]:
# Register the DataFrame under a name that SQL queries can reference
temp_table_name = "iot_devices_json"
df.createOrReplaceTempView(name=temp_table_name)

# Row count of the loaded data; as the cell's last expression it is displayed
df.count()

In [None]:
# A temp view is only available to this particular notebook. If other users
# should be able to query the data, create a table from the DataFrame instead:
# once saved, the table persists across cluster restarts and can be queried
# by various users from different notebooks.
# To do so, choose a table name below and uncomment the last line of the cell.

permanent_table_name = "iot_devices_json"

# df.write.format("parquet").saveAsTable(permanent_table_name)

In [None]:
# PySpark's filter() creates a new DataFrame by keeping only the rows of an
# existing DataFrame that satisfy the given condition or SQL expression. It is
# similar to Python's built-in filter() but operates on distributed datasets,
# and is analogous to the SQL WHERE clause.
#
# If you have a SQL background you can use where() instead of filter(): both
# functions work identically and return a new DataFrame containing only the
# rows that satisfy the condition.

# NOTE: `col` and `count` are also used by later cells, so this import has to
# run before them.
from pyspark.sql.functions import col, asc, count

# Variant 1: two chained filter() calls; a row must pass both to be kept.
TempFilter1 = df.filter(col("cn") == "Poland").filter(col("device_name").like("sensor-pad%"))
# Only a cell's last expression is displayed, so print this count explicitly;
# as a bare expression it ran a Spark job whose result was thrown away.
print(f"filter() row count: {TempFilter1.count()}")

# Variant 2: a single where() with both conditions combined by `&`.
# Shown as the cell output; it should equal the count printed above.
df.where((col('cn') == "Poland") & (col('device_name').like("sensor-pad%"))).count()

In [None]:
# Distinct values of the "lcd" column, defined once and reused by both actions
lcd_values = df.select("lcd").distinct()

lcd_values.show()
distinct_color_count = lcd_values.count()
print(f"Number of distinct LCD colors: {distinct_color_count}")

In [None]:
# Keep only rows whose device name starts with "device-mac"
# (relies on `col` and `count` imported in an earlier cell)
mac_devices_df = df.where(col("device_name").like("device-mac%"))

# Per-country ("cn") tally of the matching devices
country_mac_counts = (
    mac_devices_df
    .groupBy("cn")
    .agg(count("device_name").alias("mac_device_count"))
)

# The five countries with the largest tallies, biggest first
top_countries = (
    country_mac_counts
    .orderBy(col("mac_device_count").desc())
    .limit(5)
)

# Print the resulting table
top_countries.show()