In [1]:
import os

from pyspark.sql import SparkSession

# Build (or reuse) a Spark session running in local mode with one worker
# thread, with 2 GB configured for both executor and driver memory.
spark = (
    SparkSession.builder
    .appName("sparkpractice")
    .master("local[1]")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Report which Spark version this session is running.
print(f"Spark Version: {spark.version}")


Spark Version: 3.5.4


In [2]:
# Display the SparkSession object as the cell output.
spark

In [5]:
# Resolve the dataset location: the WINE_CSV_PATH environment variable takes
# precedence, falling back to the original local path so existing setups
# keep working unchanged.
wine_csv_path = os.environ.get(
    "WINE_CSV_PATH",
    r"C:\Users\sayed\Desktop\DDP\practice\dataset\WineQT.csv",
)

# header=True takes the column names from the first row of the file;
# inferSchema=True asks Spark to infer each column's type from the data.
wine_df = spark.read.csv(wine_csv_path, header=True, inferSchema=True)

In [6]:
# Preview the first five rows of the dataset.
wine_df.show(n=5)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+---+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality| Id|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+---+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|  0|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|  1|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|  2|
|         11.2|            0.28|       0.56|           1.9|    0.075|       

In [7]:
# Print the column names and types of the loaded DataFrame.
wine_df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
 |-- Id: integer (nullable = true)



In [8]:
# Count the total number of rows in the dataset.
wine_df.count()

1143

In [9]:
# Show only the wines whose quality score is above 7.
(
    wine_df
    .where(wine_df["quality"] > 7)
    .show()
)

+-------------+----------------+-----------+--------------+--------------------+-------------------+--------------------+------------------+----+---------+-------+-------+----+
|fixed acidity|volatile acidity|citric acid|residual sugar|           chlorides|free sulfur dioxide|total sulfur dioxide|           density|  pH|sulphates|alcohol|quality|  Id|
+-------------+----------------+-----------+--------------+--------------------+-------------------+--------------------+------------------+----+---------+-------+-------+----+
|          7.9|            0.35|       0.46|           3.6|               0.078|               15.0|                37.0|            0.9973|3.35|     0.86|   12.8|      8| 267|
|         10.3|            0.32|       0.45|           6.4|               0.073|                5.0|                13.0|            0.9976|3.23|     0.82|   12.6|      8| 278|
|          5.6|            0.85|       0.05|           1.4|               0.045|               12.0|               

In [10]:
# Compute the mean of the alcohol column over the whole DataFrame,
# using the dict form of agg() (column name -> aggregate function name).
wine_df.agg(
    {"alcohol": "mean"}
).show()

+-----------------+
|     avg(alcohol)|
+-----------------+
|10.44211140274131|
+-----------------+



In [11]:
# Expose the DataFrame's content as an RDD of Row objects for the RDD examples.
wine_rdd = wine_df.rdd

# Preview a handful of rows with take() rather than collect(): collect()
# pulls every row to the driver and floods the cell output with the
# entire dataset.
print(f"RDD :{wine_rdd.take(5)}")

RDD :[Row(fixed acidity=7.4, volatile acidity=0.7, citric acid=0.0, residual sugar=1.9, chlorides=0.076, free sulfur dioxide=11.0, total sulfur dioxide=34.0, density=0.9978, pH=3.51, sulphates=0.56, alcohol=9.4, quality=5, Id=0), Row(fixed acidity=7.8, volatile acidity=0.88, citric acid=0.0, residual sugar=2.6, chlorides=0.098, free sulfur dioxide=25.0, total sulfur dioxide=67.0, density=0.9968, pH=3.2, sulphates=0.68, alcohol=9.8, quality=5, Id=1), Row(fixed acidity=7.8, volatile acidity=0.76, citric acid=0.04, residual sugar=2.3, chlorides=0.092, free sulfur dioxide=15.0, total sulfur dioxide=54.0, density=0.997, pH=3.26, sulphates=0.65, alcohol=9.8, quality=5, Id=2), Row(fixed acidity=11.2, volatile acidity=0.28, citric acid=0.56, residual sugar=1.9, chlorides=0.075, free sulfur dioxide=17.0, total sulfur dioxide=60.0, density=0.998, pH=3.16, sulphates=0.58, alcohol=9.8, quality=6, Id=3), Row(fixed acidity=7.4, volatile acidity=0.7, citric acid=0.0, residual sugar=1.9, chlorides=0.0

In [14]:
# Average quality score across all wines, computed with the DataFrame API.
wine_df.agg(
    {"quality": "avg"}
).show()

+-----------------+
|     avg(quality)|
+-----------------+
|5.657042869641295|
+-----------------+



In [18]:
# RDD counterpart of the DataFrame average: project each Row to its
# quality value, then take the mean of those values.
avg_quality_rdd = (
    wine_rdd
    .map(lambda record: record["quality"])
    .mean()
)
print(f"Average Quality (RDD):{avg_quality_rdd}")

Average Quality (RDD):5.657042869641292


In [20]:
# Express the same "alcohol > 10" predicate through both APIs.
# Both definitions are lazy; nothing is computed until count() is called.
filtered_rdd = wine_df.rdd.filter(lambda record: record["alcohol"] > 10)
filtered_df = wine_df.where(wine_df["alcohol"] > 10)

# Report how many rows each variant keeps.
print(f"Filtered Rows (Dataframe):{filtered_df.count()}")
print(f"Filter Rows (RDD):{filtered_rdd.count()}")


Filtered Rows (Dataframe):609
Filter Rows (RDD):609
