In [1]:
import findspark
findspark.init()

In [2]:
import os
from pyspark.sql import DataFrame
from pyspark.sql import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.tuning import *
from pyspark.ml.feature import Imputer
import numpy as np
import pandas as pd
import scipy as sc
from pandas import DataFrame, read_csv



In [3]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

In [4]:
sc = SparkContext()

In [5]:
spark = SparkSession \
        .builder \
        .master("yarn") \
        .appName("testing") \
        .config("spark.executor.instances", "4") \
        .config("spark.executor.memory","1g") \
        .config("spark.driver.memory","1g") \
        .config("spark.executor.cores",'1') \
        .config("spark.scheduler.mode","FIFO") \
        .getOrCreate()

The below code reads in the file from the hadoop file system. The lines below print out various characteristics of the data.

In [6]:
dat = spark.read.csv("./Downloads/wine.csv",header=True, inferSchema=True, sep=";")

In [7]:
dat.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [8]:
dat.columns

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [9]:
dat.count()

4898

In [10]:
print(dat.take(1))

[Row(fixed acidity=7.0, volatile acidity=0.27, citric acid=0.36, residual sugar=20.7, chlorides=0.045, free sulfur dioxide=45.0, total sulfur dioxide=170.0, density=1.001, pH=3.0, sulphates=0.45, alcohol=8.8, quality=6)]


In [11]:
for col in dat.schema.fields:
    print(col)

StructField(fixed acidity,DoubleType,true)
StructField(volatile acidity,DoubleType,true)
StructField(citric acid,DoubleType,true)
StructField(residual sugar,DoubleType,true)
StructField(chlorides,DoubleType,true)
StructField(free sulfur dioxide,DoubleType,true)
StructField(total sulfur dioxide,DoubleType,true)
StructField(density,DoubleType,true)
StructField(pH,DoubleType,true)
StructField(sulphates,DoubleType,true)
StructField(alcohol,DoubleType,true)
StructField(quality,IntegerType,true)


In [12]:
dat.rdd.getNumPartitions()
dat.printSchema()
dat = dat.withColumnRenamed("pH","ph")

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [13]:
dat.select('quality').show()

+-------+
|quality|
+-------+
|      6|
|      6|
|      6|
|      6|
|      6|
|      6|
|      6|
|      6|
|      6|
|      6|
|      5|
|      5|
|      5|
|      7|
|      5|
|      7|
|      6|
|      8|
|      6|
|      5|
+-------+
only showing top 20 rows



In [14]:
dat.groupby("quality").count().show()
dat = dat.drop("fixed acidity")
#shows average alcohol by volumne
dat.groupby("quality").avg("ph").show()

+-------+-----+
|quality|count|
+-------+-----+
|      6| 2198|
|      3|   20|
|      5| 1457|
|      9|    5|
|      4|  163|
|      8|  175|
|      7|  880|
+-------+-----+

+-------+------------------+
|quality|           avg(ph)|
+-------+------------------+
|      6|3.1885987261146482|
|      3|3.1874999999999996|
|      5| 3.168833218943034|
|      9|             3.308|
|      4|3.1828834355828244|
|      8|3.2186857142857175|
|      7|3.2138977272727316|
+-------+------------------+



In [15]:
dat =  dat.drop("fixed acidity")
dat =  dat.drop("volatile acidity")
dat.printSchema()

root
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- ph: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



The below code is a user defined function that adds a new column to the data, which is the result of subtracting two columns in the data. We made this fucntion as there was a necessity for this. The columns total sulfur dioxide and free sulphur dioxide can be subtracted to created a column called bounded. The 2nd code cell does that.

In [16]:
def subtractColumn(data,column1, column2, newColName):
    newcol = data[column2] - data[column1]
    data = data.withColumn(newColName, newcol)
    return data

In [17]:
dat = subtractColumn(dat,"total sulfur dioxide", "free sulfur dioxide", "bounded")
dat.select('bounded').show()

+-------+
|bounded|
+-------+
| -125.0|
| -118.0|
|  -67.0|
| -139.0|
| -139.0|
|  -67.0|
| -106.0|
| -125.0|
| -118.0|
| -101.0|
|  -52.0|
|  -92.0|
|  -59.0|
|  -95.0|
| -131.0|
|  -84.0|
|  -69.0|
|  -46.0|
| -154.0|
|  -99.0|
+-------+
only showing top 20 rows



The following code implements the ML model. We are predicting the alcohol quality, which is a multiclass problem, based on the other columns. THe dataset is split into 75% training and 25% testing.

In [18]:
dat =dat.withColumnRenamed("quality", "label")
train, test = dat.randomSplit([0.75, 0.25])
y = train.select("label")
y = test.select("label")

from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

The ML model in Pypspark requires a vector of features which is what the next block does.

In [19]:
assembler = VectorAssembler(
    inputCols=[x for x in train.columns],
    outputCol='features')

train_processed = assembler.transform(train)
test_processed = assembler.transform(test)

train_processed.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[0.0,0.8,0.032,12...|
|[0.0,0.8,0.034,46...|
|[0.0,0.8,0.037,30...|
|[0.0,1.3,0.042,24...|
|[0.0,1.3,0.044,72...|
+--------------------+
only showing top 5 rows



Type Markdown and LaTeX:  α2

In [20]:
regression_model = LogisticRegression(maxIter=100, tol=1E-6, fitIntercept=True)
ovr = OneVsRest(classifier=regression_model)

ovrModel = ovr.fit(train_processed)


predictions = ovrModel.transform(test_processed)

In [21]:
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0785546
