In [None]:
from pydataset import data
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

The `mtcars` dataset was extracted from the 1974 Motor Trend US magazine and comprises fuel consumption and ten aspects of automobile design and performance for 32 automobiles (1973–74 models).

Here are the variables included in the dataset:

1. **mpg**: Miles/(US) gallon
2. **cyl**: Number of cylinders
3. **disp**: Displacement (cu.in.)
4. **hp**: Gross horsepower
5. **drat**: Rear axle ratio
6. **wt**: Weight (1000 lbs)
7. **qsec**: 1/4 mile time (seconds)
8. **vs**: Engine shape (0 = V-shaped, 1 = straight)
9. **am**: Transmission (0 = automatic, 1 = manual)
10. **gear**: Number of forward gears
11. **carb**: Number of carburetors

Each row corresponds to a different car model.

In [None]:
# Fetch the mtcars dataset by name; the result is a pandas DataFrame.
mtcars_pd = data("mtcars")
# Trailing bare expression so the notebook displays the frame.
mtcars_pd

In [None]:
# Build the Spark session used by the later cells. getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("pyspark_demo")
    .getOrCreate()
)

In [None]:
# Convert pandas DataFrame to Spark DataFrame.
# spark.createDataFrame() does not carry the pandas index over, and for mtcars
# the index is presumably where the car model names live (verify with
# mtcars_pd.index). Promote it to a regular 'model' column first so each Spark
# row can still be identified.
mtcars_df = spark.createDataFrame(mtcars_pd.rename_axis('model').reset_index())

mtcars_df.show()

In [None]:
# Limit the preview to five rows (show() prints 20 when n is omitted).
mtcars_df.show(n=5)

In [None]:
# Column projection: keep only the mpg and hp columns.
mtcars_df.select(["mpg", "hp"]).show()

In [None]:
# Row filter: keep only the rows whose mpg is strictly greater than 20.
# where() is an alias of filter().
mtcars_df.where(mtcars_df["mpg"] > 20).show()

In [None]:
# Grouping and aggregation: mean mpg per cylinder count.
# Spark does not guarantee the row order of a groupBy result, so sort by the
# grouping key to keep the displayed table stable from run to run.
mtcars_df.groupBy('cyl').avg('mpg').orderBy('cyl').show()

In [None]:
# Sorting: order the rows by mpg, highest first.
# orderBy() is an alias of sort(); ascending=False requests descending order.
mtcars_df.orderBy("mpg", ascending=False).show()

In [None]:
# Create a boolean column indicating whether mpg > 20.
# withColumn() returns a new DataFrame; rebinding mtcars_df keeps the flag
# available to later cells. Re-running this cell simply replaces the column.
# `col` is imported with the other imports at the top of the notebook.
mtcars_df = mtcars_df.withColumn('is_high_mpg', col('mpg') > 20)
mtcars_df.show()


In [None]:
# Run SQL queries directly on the Spark DataFrame by registering it as a
# temporary view that spark.sql() can reference by name.
mtcars_df.createOrReplaceTempView("mtcars_view")

# ORDER BY keeps the displayed rows in a stable order; GROUP BY alone leaves
# the row order unspecified.
result_df = spark.sql(
    "SELECT cyl, AVG(mpg) as avg_mpg FROM mtcars_view GROUP BY cyl ORDER BY cyl"
)
result_df.show()

In [None]:
# Shut down the Spark session created earlier in the notebook.
spark.stop()