In [1]:
import os

from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session: local master with a single
# worker thread, 2g configured for both executor and driver memory.
spark = (
    SparkSession.builder
    .appName("sparkpractice")
    .master("local[1]")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")


Spark Version: 3.5.4


In [2]:
# Display the active SparkSession (a bare last expression renders as the cell output).
spark

In [5]:
# Location of the WineQT CSV. Set the WINE_CSV_PATH environment variable to run
# this notebook on another machine; when it is unset, the original absolute
# path is used, so existing behaviour is unchanged.
WINE_CSV_PATH = os.environ.get(
    "WINE_CSV_PATH",
    r"C:\Users\sayed\Desktop\DDP\practice\dataset\WineQT.csv",
)

# header=True: the first line holds the column names.
# inferSchema=True: let Spark infer each column's type from the data.
wine_df = spark.read.csv(WINE_CSV_PATH, header=True, inferSchema=True)

In [6]:
# Preview the first 5 rows to sanity-check the load.
wine_df.show(5)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+---+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality| Id|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+---+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|  0|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|  1|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|  2|
|         11.2|            0.28|       0.56|           1.9|    0.075|       

In [7]:
# Print each column's name, type, and nullability.
wine_df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
 |-- Id: integer (nullable = true)



In [8]:
# Total number of rows in the dataset.
wine_df.count()

1143

In [9]:
# Show the wines whose quality rating is above 7.
wine_df.where(wine_df.quality > 7).show()

+-------------+----------------+-----------+--------------+--------------------+-------------------+--------------------+------------------+----+---------+-------+-------+----+
|fixed acidity|volatile acidity|citric acid|residual sugar|           chlorides|free sulfur dioxide|total sulfur dioxide|           density|  pH|sulphates|alcohol|quality|  Id|
+-------------+----------------+-----------+--------------+--------------------+-------------------+--------------------+------------------+----+---------+-------+-------+----+
|          7.9|            0.35|       0.46|           3.6|               0.078|               15.0|                37.0|            0.9973|3.35|     0.86|   12.8|      8| 267|
|         10.3|            0.32|       0.45|           6.4|               0.073|                5.0|                13.0|            0.9976|3.23|     0.82|   12.6|      8| 278|
|          5.6|            0.85|       0.05|           1.4|               0.045|               12.0|               

In [10]:
# Mean alcohol content over all rows, using the {column: function} form of agg().
wine_df.agg({"alcohol":"mean"}).show()

+-----------------+
|     avg(alcohol)|
+-----------------+
|10.44211140274131|
+-----------------+



In [26]:
# RDD view of the DataFrame's rows, used by the RDD-API cells in this notebook.
wine_rdd = wine_df.rdd
# Avoid printing wine_rdd.collect(): it pulls every row back to the driver.

In [14]:
# Mean quality rating over all rows, computed with the DataFrame API.
wine_df.agg({"quality":"avg"}).show()

+-----------------+
|     avg(quality)|
+-----------------+
|5.657042869641295|
+-----------------+



In [18]:
# Mean quality rating again, this time through the RDD API.
# Note: despite its name, avg_quality_rdd holds the resulting number, not an RDD.
avg_quality_rdd = (
    wine_rdd
    .map(lambda record: record["quality"])  # keep only each row's quality value
    .mean()
)
print(f"Average Quality (RDD):{avg_quality_rdd}")

Average Quality (RDD):5.657042869641292


In [20]:
# Count the wines with alcohol above 10 twice -- once with the DataFrame API and
# once with the RDD API -- to confirm that both give the same answer.
filtered_df = wine_df.filter(wine_df["alcohol"] > 10)

# The "alcohol" column is nullable. The DataFrame filter simply drops rows where
# the comparison is null, whereas `None > 10` raises TypeError in Python 3, so
# the RDD predicate skips None explicitly to stay equivalent.
filtered_rdd = wine_df.rdd.filter(
    lambda row: row["alcohol"] is not None and row["alcohol"] > 10
)

print(f"Filtered Rows (Dataframe):{filtered_df.count()}")
print(f"Filter Rows (RDD):{filtered_rdd.count()}")


Filtered Rows (Dataframe):609
Filter Rows (RDD):609


In [21]:
# Summary statistics for every column.
wine_df.describe().show()

+-------+------------------+-------------------+-------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+
|summary|     fixed acidity|   volatile acidity|        citric acid|    residual sugar|          chlorides|free sulfur dioxide|total sulfur dioxide|             density|                pH|         sulphates|           alcohol|           quality|               Id|
+-------+------------------+-------------------+-------------------+------------------+-------------------+-------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+
|  count|              1143|               1143|               1143|              1143|               1143|               1143|                1143|                1143|              1143|              1143| 

In [23]:
# Model implementation: linear regression of quality on two features.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Combine the two predictor columns into a single "features" vector column.
assembler = VectorAssembler(
    inputCols=["alcohol", "volatile acidity"],
    outputCol="features",
)
data = assembler.transform(wine_df)

# Fit the regression with "quality" as the label.
# NOTE(review): the model is fitted on every row of wine_df -- there is no
# train/test split, so no held-out data is available for evaluation.
lr = LinearRegression(
    featuresCol="features",
    labelCol="quality",
)
model = lr.fit(data)


In [25]:
# Score the same rows the model was fitted on, so this compares in-sample
# predictions with the actual quality ratings.
predictions = model.transform(data)
# Show actual vs. predicted quality side by side.
predictions.select("quality","prediction").show()


+-------+------------------+
|quality|        prediction|
+-------+------------------+
|      5| 5.088174856811273|
|      5| 4.953108261869783|
|      5| 5.126396360877945|
|      6| 5.819548756910594|
|      5| 5.088174856811273|
|      5| 5.145937556480661|
|      5| 5.232581605984741|
|      7| 5.347676561754136|
|      7| 5.292679344212123|
|      5| 5.199030179034059|
|      5| 5.367002535572162|
|      5|5.1244917658893305|
|      7| 6.038063475659412|
|      6| 5.574487726885078|
|      5|  5.62948494442709|
|      5|5.5092894679723265|
|      5| 5.391429030075557|
|      6| 5.615044269509744|
|      5|5.5046193908563374|
|      5|  5.53817081780702|
+-------+------------------+
only showing top 20 rows

