## LOGISTIC REGRESSION IMPLEMTATION

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize Spark Session
spark = SparkSession.builder.appName("WineQualityPrediction").getOrCreate()

# Load training data
data_path = "/workspaces/ml-winequality/dataset/TrainingDataset.csv"

trainingData = spark.read.csv(data_path, header=True, 
                      inferSchema=True,
                      sep=';'
                      ,quote='"')

# Used copilot how to get rid of quotes from colum header
new_column_names = [col_name.strip('"') for col_name in trainingData.columns]
trainingData = trainingData.toDF(*new_column_names)

# Preprocess data: convert features into a feature vector, index labels
featureAssembler = VectorAssembler(inputCols=trainingData.columns[:-1], outputCol="features")
labelIndexer = StringIndexer(inputCol="quality", outputCol="label")

# Initialize classifier: start with Logistic Regression
classifier = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# Create a Pipeline
pipeline = Pipeline(stages=[featureAssembler, labelIndexer, classifier])

# Train model
model = pipeline.fit(trainingData)

# Save model
# model.write().overwrite().save("s3://path/to/Model")




24/03/30 16:38:27 WARN Utils: Your hostname, codespaces-233249 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/03/30 16:38:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/30 16:38:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/30 16:38:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/03/30 16:38:40 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/03/30 16:38:40 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/03/30 16:38:41 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMet

In [3]:
# Save model
model.write().overwrite().save("/workspaces/ml-winequality/model/wine_quality_model_logistic")

## Model Evaluation and Tuning

In [4]:
# Load validation data
validpath = "/workspaces/ml-winequality/dataset/ValidationDataset.csv"
validationData = spark.read.csv(data_path, header=True, 
                      inferSchema=True,
                      sep=';'
                      ,quote='"')


# Used copilot how to get rid of quotes from colum header
new_column_names = [col_name.strip('"') for col_name in validationData.columns]
validationData = validationData.toDF(*new_column_names)

# Load model
from pyspark.ml import PipelineModel
model = PipelineModel.load("/workspaces/ml-winequality/model/wine_quality_model_logistic")

# Make predictions
predictions = model.transform(validationData)

# Evaluate model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(predictions)
print(f"F1 Score: {f1_score}")
# F1 Score: 0.5794684379508829


F1 Score: 0.5794684379508829
