In [1]:
# Section must be included at the beginning of each new notebook. Remember to change the app name.
# If you're using VirtualBox, change the below to '/home/user/spark-2.1.1-bin-hadoop2.7'
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import *
spark = SparkSession.builder.appName('logistic_regression_adv').getOrCreate()

# If you're getting an error with numpy, please type 'sudo pip3 install numpy --user' into the console.
# If you're getting an error with another package, type 'sudo pip3 install PACKAGENAME --user'. 
# Replace PACKAGENAME with the relevant package (such as pandas, etc).
from pyspark.ml.classification import LogisticRegression

# Import data and print schema - columns is another way to view the data's features.
df1 = spark.read.csv('winequality-white.csv', header=True, inferSchema=True)
df1.printSchema()
print(df1.columns)

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


In [2]:
# Import pandas.
import pandas as pd

# Take the first five rows of data, and visualise.
pd.DataFrame(df1.take(5), columns=df1.columns)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# To visualise the first five columns, simply add transpose. 
pd.DataFrame(df1.take(5), columns=df1.columns).transpose()

Unnamed: 0,0,1,2,3,4
fixed acidity,7.0,6.3,8.1,7.2,7.2
volatile acidity,0.27,0.3,0.28,0.23,0.23
citric acid,0.36,0.34,0.4,0.32,0.32
residual sugar,20.7,1.6,6.9,8.5,8.5
chlorides,0.045,0.049,0.05,0.058,0.058
free sulfur dioxide,45.0,14.0,30.0,47.0,47.0
total sulfur dioxide,170.0,132.0,97.0,186.0,186.0
density,1.001,0.994,0.9951,0.9956,0.9956
pH,3.0,3.3,3.26,3.19,3.19
sulphates,0.45,0.49,0.44,0.4,0.4


In [5]:
# We can use group by and count to find out how many data points we have for each class in our predictor. 
df1.groupby('quality').count().toPandas()

Unnamed: 0,quality,count
0,6,2198
1,3,20
2,5,1457
3,9,5
4,4,163
5,8,175
6,7,880


In [7]:
# Using a for loop to find all columns that belong to the Double data type. 
numeric_features = [t[0] for t in df1.dtypes if t[1] == 'double']

# Selecting the numeric features, generating summary statistics, and converting to a Pandas DataFrame.
df1.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
fixed acidity,4898,6.854787668436075,0.8438682276875127,3.8,14.2
volatile acidity,4898,0.27824111882401087,0.10079454842486532,0.08,1.1
citric acid,4898,0.33419150673743736,0.12101980420298254,0.0,1.66
residual sugar,4898,6.391414863209486,5.072057784014878,0.6,65.8
chlorides,4898,0.0457723560636995,0.021847968093728805,0.009,0.346
free sulfur dioxide,4898,35.30808493262556,17.00713732523259,2.0,289.0
total sulfur dioxide,4898,138.36065741118824,42.498064554142985,9.0,440.0
density,4898,0.9940273764801896,0.002990906916936997,0.98711,1.03898
pH,4898,3.1882666394446693,0.15100059961506673,2.72,3.82


In [8]:
# Now that we've carried out some basic data exploration, let's select the relevant features. Day/month have been excluded as they're irrelevant.
df1 = df1.select('fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol')
df1.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)



In [9]:
# Import the relevant packages.
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

# First create a string indexer which converts every string into a number, such as male = 0 and female = 1.
# A number will be assigned to every category in the column.
fa_indexer = DoubleIndexer(inputCol='fixed acidity',outputCol='FA')
#job_indexer = StringIndexer(inputCol='job',outputCol='jobIndex')

# Now we can one hot encode these numbers. This converts the various outputs into a single vector.
# Multiple columns are collapsed into one. 
# This makes it easier to process when you have multiple classes.
#job_encoder = OneHotEncoder(inputCol='jobIndex',outputCol='jobVec')

# And finally, using vector assembler to turn all of these columns into one column (named features).
assembler = VectorAssembler(inputCols=[('fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
                                        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
                                        'pH', 'sulphates', 'alcohol')], outputCol="features")

TypeError: Invalid param value given for param "inputCols". Could not convert [('fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol')] to list of strings