# Reference doc
[PySpark을 이용한 머신러닝 튜토리얼 예제](https://www.sqler.com/board_MachineLearning_AI_tip_lecture/1102607)

In [2]:
# Create a SparkContext (or reuse the one already running in this session)
import pyspark
from pyspark import SparkContext
# getOrCreate() keeps this cell safe to re-run: calling SparkContext() again
# while a context is still active raises an error instead of reusing it.
sc = SparkContext.getOrCreate()

21/10/01 01:42:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# RDD, Resilient Distributed Dataset
# Computation in an RDD is automatically parallelized across the cluster
nums = sc.parallelize([1,2,3,4])
nums.take(1)



[1]

In [4]:
squared = nums.map(lambda x: x*x).collect()
for num in squared:
    print('%i ' % (num))

1 
4 
9 
16 


In [5]:
# SQLContext
from pyspark.sql import Row
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

In [6]:
list_p = [('John',19),('Smith',29),('Adam',35),('Henry',50)]
rdd = sc.parallelize(list_p)
ppl = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
DF_ppl = sqlContext.createDataFrame(ppl)
DF_ppl.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [7]:
# Machine Learning Example with PySpark

In [8]:
from pyspark import SparkFiles

url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(url)
sqlContext = SQLContext(sc)

In [9]:
df = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema= True)
df.printSchema()



root
 |-- x: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)





In [10]:
df.show(5, truncate = False)

+---+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|x  |age|workclass|fnlwgt|education   |educational-num|marital-status    |occupation       |relationship|race |gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|1  |25 |Private  |226802|11th        |7              |Never-married     |Machine-op-inspct|Own-child   |Black|Male  |0           |0           |40            |United-States |<=50K |
|2  |38 |Private  |89814 |HS-grad     |9              |Married-civ-spouse|Farming-fishing  |Husband     |White|Male  |0           |0           |50            |United-States |<=50K |
|3  |28 |Local-gov|336951|Assoc-acdm  |12             |Married-civ-spouse|Protective-serv 

In [11]:
df_string = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema=  False)
df_string.printSchema()

root
 |-- x: string (nullable = true)
 |-- age: string (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: string (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: string (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: string (nullable = true)
 |-- capital-loss: string (nullable = true)
 |-- hours-per-week: string (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [12]:
# Import only the type we need rather than everything from `pyspark.sql.types`
# from pyspark.sql.types import *
from pyspark.sql.types import FloatType

# Write a custom function to convert the data type of DataFrame columns
def convertColumn(df, names, newType):
    """Cast the listed columns of a DataFrame to a new type.

    Args:
        df: DataFrame whose columns are converted.
        names: Iterable of column names to cast.
        newType: Target type, passed to ``Column.cast`` for every listed column.

    Returns:
        The DataFrame obtained after replacing each listed column with its
        cast version; ``df`` itself when ``names`` is empty.
    """
    converted = df
    for column_name in names:
        # Replace the column in place (same name) with its cast version.
        cast_column = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, cast_column)
    return converted
# List of continuous features
CONTI_FEATURES  = ['age', 'fnlwgt','capital-gain', 'educational-num', 'capital-loss', 'hours-per-week']
# Convert the type
df_string = convertColumn(df_string, CONTI_FEATURES, FloatType())
# Check the dataset
df_string.printSchema()

root
 |-- x: string (nullable = true)
 |-- age: float (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: float (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: float (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: float (nullable = true)
 |-- capital-loss: float (nullable = true)
 |-- hours-per-week: float (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [13]:
df.select('age','fnlwgt').show(5)

+---+------+
|age|fnlwgt|
+---+------+
| 25|226802|
| 38| 89814|
| 28|336951|
| 44|160323|
| 18|103497|
+---+------+
only showing top 5 rows



In [14]:
df.groupBy("education").count().sort("count",ascending=True).show()



+------------+-----+
|   education|count|
+------------+-----+
|   Preschool|   83|
|     1st-4th|  247|
|     5th-6th|  509|
|   Doctorate|  594|
|        12th|  657|
|         9th|  756|
| Prof-school|  834|
|     7th-8th|  955|
|        10th| 1389|
|  Assoc-acdm| 1601|
|        11th| 1812|
|   Assoc-voc| 2061|
|     Masters| 2657|
|   Bachelors| 8025|
|Some-college|10878|
|     HS-grad|15784|
+------------+-----+





In [15]:
df.describe().show()



+-------+------------------+------------------+-----------+------------------+------------+------------------+--------------+----------------+------------+------------------+------+------------------+-----------------+------------------+--------------+------+
|summary|                 x|               age|  workclass|            fnlwgt|   education|   educational-num|marital-status|      occupation|relationship|              race|gender|      capital-gain|     capital-loss|    hours-per-week|native-country|income|
+-------+------------------+------------------+-----------+------------------+------------+------------------+--------------+----------------+------------+------------------+------+------------------+-----------------+------------------+--------------+------+
|  count|             48842|             48842|      48842|             48842|       48842|             48842|         48842|           48842|       48842|             48842| 48842|             48842|            48842|  



In [16]:
df.describe('capital-gain').show()

+-------+------------------+
|summary|      capital-gain|
+-------+------------------+
|  count|             48842|
|   mean|1079.0676262233324|
| stddev| 7452.019057655413|
|    min|                 0|
|    max|             99999|
+-------+------------------+



In [17]:
df.crosstab('age', 'income').sort("age_income").show()



+----------+-----+----+
|age_income|<=50K|>50K|
+----------+-----+----+
|        17|  595|   0|
|        18|  862|   0|
|        19| 1050|   3|
|        20| 1112|   1|
|        21| 1090|   6|
|        22| 1161|  17|
|        23| 1307|  22|
|        24| 1162|  44|
|        25| 1119|  76|
|        26| 1068|  85|
|        27| 1117| 115|
|        28| 1101| 179|
|        29| 1025| 198|
|        30| 1031| 247|
|        31| 1050| 275|
|        32|  957| 296|
|        33| 1045| 290|
|        34|  949| 354|
|        35|  997| 340|
|        36|  948| 400|
+----------+-----+----+
only showing top 20 rows



In [18]:
# Drop a column by name. The name must match the schema exactly
# ('educational-num', not 'education_num'); drop() silently returns the
# DataFrame unchanged when the name does not match any column.
df.drop('educational-num').columns

['x',
 'age',
 'workclass',
 'fnlwgt',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [19]:
df.filter(df.age > 40).count()

20211

In [20]:
df.groupby('marital-status').agg({'capital-gain': 'mean'}).show()



+--------------------+------------------+
|      marital-status| avg(capital-gain)|
+--------------------+------------------+
|           Separated| 581.8424836601307|
|       Never-married|  384.382639449029|
|Married-spouse-ab...| 629.0047770700637|
|            Divorced| 793.6755615860094|
|             Widowed| 603.6442687747035|
|   Married-AF-spouse|2971.6216216216217|
|  Married-civ-spouse|1739.7006121810625|
+--------------------+------------------+



In [21]:
# Step 2) Data preprocessing
from pyspark.sql.functions import col, asc

# 1 Select the column
age_square = df.select(col("age")**2)

# 2 Apply the transformation and add it to the DataFrame
df = df.withColumn("age_square", col("age")**2)
df.printSchema()

root
 |-- x: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- age_square: double (nullable = true)



In [22]:
df.first()

Row(x=1, age=25, workclass='Private', fnlwgt=226802, education='11th', educational-num=7, marital-status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='Black', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='<=50K', age_square=625.0)

In [23]:
df.filter(df['native-country'] == 'Holand-Netherlands').count()


1

In [24]:
df.groupby('native-country').agg({'native-country': 'count'}).sort(asc("count(native-country)")).show()



+--------------------+---------------------+
|      native-country|count(native-country)|
+--------------------+---------------------+
|  Holand-Netherlands|                    1|
|             Hungary|                   19|
|            Honduras|                   20|
|            Scotland|                   21|
|Outlying-US(Guam-...|                   23|
|          Yugoslavia|                   23|
|                Laos|                   23|
|     Trinadad&Tobago|                   27|
|            Cambodia|                   28|
|                Hong|                   30|
|            Thailand|                   30|
|             Ireland|                   37|
|              France|                   38|
|             Ecuador|                   45|
|                Peru|                   46|
|              Greece|                   49|
|           Nicaragua|                   49|
|                Iran|                   59|
|              Taiwan|                   65|
|         



In [25]:
df_remove = df.filter(df['native-country'] != 'Holand-Netherlands')
df_remove.groupby('native-country').agg({'native-country': 'count'}).sort(asc("count(native-country)")).show()



+--------------------+---------------------+
|      native-country|count(native-country)|
+--------------------+---------------------+
|             Hungary|                   19|
|            Honduras|                   20|
|            Scotland|                   21|
|          Yugoslavia|                   23|
|Outlying-US(Guam-...|                   23|
|                Laos|                   23|
|     Trinadad&Tobago|                   27|
|            Cambodia|                   28|
|                Hong|                   30|
|            Thailand|                   30|
|             Ireland|                   37|
|              France|                   38|
|             Ecuador|                   45|
|                Peru|                   46|
|           Nicaragua|                   49|
|              Greece|                   49|
|                Iran|                   59|
|              Taiwan|                   65|
|            Portugal|                   67|
|         



In [26]:
# Step 3) Build a data processing pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

stringIndexer = StringIndexer(inputCol="workclass", outputCol="workclass_encoded")
model = stringIndexer.fit(df)
indexed = model.transform(df)
encoder = OneHotEncoder(dropLast=False, inputCol="workclass_encoded", outputCol="workclass_vec")
# encoded = encoder.transform(indexed)  # fails with AttributeError: the encoder has to be fitted first
ohe = encoder.fit(indexed)
encoded = ohe.transform(indexed)
encoded.show(2)

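# Note: calling encoder.transform() directly fails with
# AttributeError: 'OneHotEncoder' object has no attribute 'transform'
# so the encoder is fitted first and the fitted model performs the transform. See:
# https://stackoverflow.com/questions/64011674/onehotencoder-object-has-no-attribute-transform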



+---+---+---------+------+---------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+----------+-----------------+-------------+
|  x|age|workclass|fnlwgt|education|educational-num|    marital-status|       occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|age_square|workclass_encoded|workclass_vec|
+---+---+---------+------+---------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+----------+-----------------+-------------+
|  1| 25|  Private|226802|     11th|              7|     Never-married|Machine-op-inspct|   Own-child|Black|  Male|           0|           0|            40| United-States| <=50K|     625.0|              0.0|(9,[0],[1.0])|
|  2| 38|  Private| 89814|  HS-grad|              9|Married-civ-spouse|  Farming-fishing|     Husband|White|  Ma

In [27]:
from pyspark.ml import Pipeline
# from pyspark.ml.feature import OneHotEncoderEstimator  # raises ImportError here; OneHotEncoder is used instead
from pyspark.ml.feature import OneHotEncoder

# Categorical columns that are indexed and then one-hot encoded
CATE_FEATURES = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
stages = []  # stages in our Pipeline
for categoricalCol in CATE_FEATURES:
    # First stage: write the indexed category to "<column>Index"
    stringIndexer = StringIndexer(
        inputCol=categoricalCol,
        outputCol=categoricalCol + "Index",
    )
    # Second stage: encode that index column into "<column>classVec"
    encoder = OneHotEncoder(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[categoricalCol + "classVec"],
    )
    stages.extend([stringIndexer, encoder])

# Note: importing OneHotEncoderEstimator fails with
# ImportError: cannot import name 'OneHotEncoderEstimator' from 'pyspark.ml.feature'
# so OneHotEncoder is used instead. See:
# https://stackoverflow.com/questions/59926511/pyspark-cannot-import-name-onehotencoderestimator

In [28]:
# Convert label into label indices using the StringIndexer
label_stringIdx =  StringIndexer(inputCol="income", outputCol="newlabel")
stages += [label_stringIdx]

In [29]:
assemblerInputs = [c + "classVec" for c in CATE_FEATURES] + CONTI_FEATURES

In [30]:
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [31]:
# Create a Pipeline.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)
model.take(1)

21/10/01 01:43:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Row(x=1, age=25, workclass='Private', fnlwgt=226802, education='11th', educational-num=7, marital-status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='Black', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='<=50K', age_square=625.0, workclassIndex=0.0, workclassclassVec=SparseVector(8, {0: 1.0}), educationIndex=5.0, educationclassVec=SparseVector(15, {5: 1.0}), marital-statusIndex=1.0, marital-statusclassVec=SparseVector(6, {1: 1.0}), occupationIndex=6.0, occupationclassVec=SparseVector(14, {6: 1.0}), relationshipIndex=2.0, relationshipclassVec=SparseVector(5, {2: 1.0}), raceIndex=1.0, raceclassVec=SparseVector(4, {1: 1.0}), genderIndex=0.0, genderclassVec=SparseVector(1, {0: 1.0}), native-countryIndex=0.0, native-countryclassVec=SparseVector(40, {0: 1.0}), newlabel=0.0, features=SparseVector(99, {0: 1.0, 13: 1.0, 24: 1.0, 35: 1.0, 45: 1.0, 49: 1.0, 52: 1.0, 53: 1.0, 93: 25.0, 94: 226802.0, 96

In [32]:
# Step 4) Build the classifier: logistic
from pyspark.ml.linalg import DenseVector
input_data = model.rdd.map(lambda x: (x["newlabel"], DenseVector(x["features"])))

In [33]:
df_train = sqlContext.createDataFrame(input_data, ["income", "features"])
df_train.show(5)



+------+--------------------+
|income|            features|
+------+--------------------+
|   0.0|[1.0,0.0,0.0,0.0,...|
|   0.0|[1.0,0.0,0.0,0.0,...|
|   1.0|[0.0,0.0,1.0,0.0,...|
|   1.0|[1.0,0.0,0.0,0.0,...|
|   0.0|[0.0,0.0,0.0,1.0,...|
+------+--------------------+
only showing top 5 rows





In [34]:
train_data, test_data = df_train.randomSplit([.8,.2],seed=1234)

In [35]:
train_data.groupby('income').agg({'income': 'count'}).show()



+------+-------------+
|income|count(income)|
+------+-------------+
|   0.0|        29701|
|   1.0|         9345|
+------+-------------+





In [36]:
test_data.groupby('income').agg({'income': 'count'}).show()



+------+-------------+
|income|count(income)|
+------+-------------+
|   0.0|         7453|
|   1.0|         2342|
+------+-------------+





In [37]:
# Build the logistic regressor
# Import `LogisticRegression`
from pyspark.ml.classification import LogisticRegression

# Initialize `lr`
lr = LogisticRegression(labelCol="income",
                        featuresCol="features",
                        maxIter=10,
                        regParam=0.3)

# Fit the data to the model
linearModel = lr.fit(train_data)


21/10/01 01:46:32 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/10/01 01:46:32 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [38]:
# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(linearModel.coefficients))
print("Intercept: " + str(linearModel.intercept))

Coefficients: [-0.06911070238349058,-0.15176282250254972,-0.06228327849871225,-0.16679600132480818,-0.14256163531006674,0.15416862908207532,0.19275169756352675,-0.6086178622558706,-0.18872955332810654,-0.061361598416072395,0.21475282122419911,0.3868033624858021,-0.02730529982708454,-0.31961636351662626,0.00146549871524309,-0.3575281584913024,-0.42868430613449093,0.539858404153372,-0.37864765666040257,-0.2516819189849163,0.5839468619276396,-0.3734267569337355,-0.4314136569784949,0.3303092847789976,-0.3498779877691353,-0.21787296584247065,-0.21835918577681038,-0.15748100291306466,-0.15603754865727332,0.19380159116213586,-0.05988859692033402,0.2876226086737504,-0.12163734056846603,0.04109138685294508,-0.30091167684165815,-0.22965984064600228,-0.16808575370275572,-0.1083909389894253,-0.2723817048245395,-0.3198919311281755,0.08421921092582825,0.12506808142532436,-0.2615040861488797,0.272601395368021,-0.20205684719604047,-0.29366090089557123,-0.24698747189522724,0.41381168407481933,-0.070486

In [39]:
# Step 5) Train and evaluate the model
# Make predictions on test data using the transform() method.
predictions = linearModel.transform(test_data)

In [40]:
predictions.printSchema()

root
 |-- income: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [41]:
selected = predictions.select("income", "prediction", "probability")
selected.show(20)



+------+----------+--------------------+
|income|prediction|         probability|
+------+----------+--------------------+
|   0.0|       0.0|[0.93053780544162...|
|   0.0|       0.0|[0.94603248831733...|
|   0.0|       0.0|[0.81182553768179...|
|   0.0|       0.0|[0.91346022215162...|
|   0.0|       0.0|[0.55398815654016...|
|   0.0|       1.0|[0.28877064190242...|
|   0.0|       1.0|[0.35997365835682...|
|   0.0|       0.0|[0.90778664876446...|
|   0.0|       1.0|[0.44580363687594...|
|   0.0|       1.0|[0.34448851921899...|
|   0.0|       0.0|[0.89461993646932...|
|   0.0|       0.0|[0.85109833395399...|
|   0.0|       0.0|[0.84629201450024...|
|   0.0|       0.0|[0.93049325351008...|
|   0.0|       0.0|[0.66600206899805...|
|   0.0|       0.0|[0.75939329677073...|
|   0.0|       0.0|[0.83720480986046...|
|   0.0|       0.0|[0.82666412168637...|
|   0.0|       0.0|[0.80823811902377...|
|   0.0|       0.0|[0.84657848922658...|
+------+----------+--------------------+
only showing top



In [42]:
# Evaluate the model
cm = predictions.select("income", "prediction")
cm.groupby('income').agg({'income': 'count'}).show()



+------+-------------+
|income|count(income)|
+------+-------------+
|   0.0|         7453|
|   1.0|         2342|
+------+-------------+





In [43]:
cm.groupby('prediction').agg({'prediction': 'count'}).show()



+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|             8849|
|       1.0|              946|
+----------+-----------------+





In [44]:
cm.filter(cm.income == cm.prediction).count() / cm.count()



0.8250127616130679

In [45]:
def accuracy_m(model, data=None):
    """Print the accuracy of ``model`` as a percentage.

    Predictions are compared with the ``income`` label column and the share
    of matching rows is printed as ``Model accuracy: NN.NNN%``.

    Args:
        model: Fitted model exposing ``transform(DataFrame)``; the transformed
            output must contain ``income`` and ``prediction`` columns.
        data: DataFrame to evaluate on. Defaults to the module-level
            ``test_data`` split, which keeps existing calls working unchanged.

    Returns:
        None. The result is only printed.
    """
    if data is None:
        data = test_data  # default: evaluate on the held-out test split
    predictions = model.transform(data)
    cm = predictions.select("income", "prediction")
    total = cm.count()
    if total == 0:
        # An empty split has no accuracy; avoid dividing by zero.
        print("Model accuracy: n/a (no rows to evaluate)")
        return
    correct = cm.filter(cm.income == cm.prediction).count()
    print("Model accuracy: %.3f%%" % (correct / total * 100))

accuracy_m(model = linearModel)



Model accuracy: 82.501%




In [46]:
# ROC metrics
### Use ROC 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="income")
print(evaluator.evaluate(predictions))
print(evaluator.getMetricName())

# Note: without `labelCol` the evaluator looks for a column named `label` and fails with
# IllegalArgumentException: label does not exist. Available: income, features, rawPrediction, probability, prediction
# so the label column is set explicitly with labelCol="income". See:
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.BinaryClassificationEvaluator.html



0.8893620345339743
areaUnderROC


In [48]:
# Step 6) Tune the hyperparameter
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5])
             .build())

In [49]:
from time import time
start_time = time()

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(train_data)
# This will likely take a fair amount of time
end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)



Time to train model: 718.609 seconds


In [50]:
accuracy_m(model = cvModel)



Model accuracy: 84.911%




In [51]:
bestModel = cvModel.bestModel
bestModel.extractParamMap()

{Param(parent='LogisticRegression_d38ef5edde44', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LogisticRegression_d38ef5edde44', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LogisticRegression_d38ef5edde44', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto',
 Param(parent='LogisticRegression_d38ef5edde44', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LogisticRegression_d38ef5edde44', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LogisticRegression_d38ef5edde44', name='labelCol', doc='label column name.'): 'income',
 Param(parent='LogisticRegression_d38ef5edde44', name='maxBlockSizeInMB', doc='maximum memory in MB for 