In [None]:
‘’

# OneHotEncoder

```python
class pyspark.ml.feature.OneHotEncoder(
    *, inputCols=None, outputCols=None, handleInvalid='error',
    dropLast=True, inputCol=None, outputCol=None)
```

A one-hot encoder that maps a column of category indices to a column of binary vectors, with at most a single one-value per row that indicates the input category index. For example, with 5 categories, an input value of 2.0 maps to an output vector of [0.0, 0.0, 1.0, 0.0]. The last category is not included by default (configurable via `dropLast`), because including it would make the vector entries always sum to one and hence be linearly dependent. So an input value of 4.0 maps to [0.0, 0.0, 0.0, 0.0].

When `handleInvalid` is set to ‘keep’, an extra “category” indicating invalid values is added as the last category. So when `dropLast` is true, invalid values are encoded as an all-zeros vector.

See also: `StringIndexer`,
for converting categorical values into category indices.

Notes

This is different from scikit-learn’s OneHotEncoder, which keeps all categories. The output vectors are sparse.

When encoding multiple columns using the `inputCols` and `outputCols` params, input and output columns are paired by their position in the two arrays, and each pair is treated independently.

In [23]:
import tempfile

from pyspark.ml.feature import IndexToString, StringIndexerModel
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

In [17]:
# Three rows whose only column already holds the category indices 0.0, 1.0 and 2.0.
df = spark.createDataFrame(
    [(float(index),) for index in range(3)],
    ["input"],
)
# Encoder configured through the multi-column params, with a single input/output pair.
ohe = OneHotEncoder(inputCols=['input'], outputCols=['output'])

In [18]:
# Fit the encoder on df; the resulting model is what the following cells call transform() on.
model = ohe.fit(df)

In [19]:
# Apply the fitted model. Note that `df` is rebound to the transformed frame, so from
# here on `df` carries the "output" vector column in addition to "input".
df = model.transform(df)

In [20]:
# Show the column names and types of the transformed frame.
df.printSchema()

root
 |-- input: double (nullable = true)
 |-- output: vector (nullable = true)



In [21]:
# Render the rows as a table; the encoded vectors are displayed in sparse form.
df.show()

+-----+-------------+
|input|       output|
+-----+-------------+
|  0.0|(2,[0],[1.0])|
|  1.0|(2,[1],[1.0])|
|  2.0|    (2,[],[])|
+-----+-------------+



In [10]:
# Return all rows as a Python list of Row objects (only three rows here).
df.collect()

[Row(input=0.0, output=SparseVector(2, {0: 1.0})),
 Row(input=1.0, output=SparseVector(2, {1: 1.0})),
 Row(input=2.0, output=SparseVector(2, {}))]

In [None]:
# Scratch directory for the save/load round-trips below. mkdtemp() returns a new,
# empty directory on every run, so the save targets start out non-existent.
temp_path = tempfile.mkdtemp()

# An earlier cell rebinds `df` to the transformed frame, so `df` already has an
# "output" column; fitting or transforming it again into "output" would clash with
# that column. Work from the raw "input" column only.
input_df = df.select("input")

model.setOutputCols(["output"])

# Only the last expression of a cell is displayed, so intermediate values are printed.
print(model.getHandleInvalid())
print(model.transform(input_df).head().output)

# Same encoder configured through the single-column params.
single_col_ohe = OneHotEncoder(inputCol="input", outputCol="output")
single_col_model = single_col_ohe.fit(input_df)
print(single_col_model.transform(input_df).head().output)

# Round-trip the unfitted estimator and check its params survive.
ohePath = temp_path + "/ohe"
ohe.save(ohePath)
loadedOHE = OneHotEncoder.load(ohePath)
assert loadedOHE.getInputCols() == ohe.getInputCols()

# Round-trip the fitted model and check the learned category sizes survive.
modelPath = temp_path + "/ohe-model"
model.save(modelPath)
loadedModel = OneHotEncoderModel.load(modelPath)
assert loadedModel.categorySizes == model.categorySizes

# The reloaded model should encode exactly like the in-memory one (cell output).
loadedModel.transform(input_df).take(1) == model.transform(input_df).take(1)

In [26]:
# Toy frame: an id plus a string category column.
df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"), (6, "d")],
    ["id", "value"],
)

# Step 1: turn the string categories into numeric category indices.
stringIndexer = StringIndexer(inputCol="value", outputCol="valueIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)

# Step 2: one-hot encode those indices and display the result.
encoder = (
    OneHotEncoder(inputCol="valueIndex", outputCol="valueIndexVec")
    .fit(indexed)
)
encoded = encoder.transform(indexed)
encoded.show()

+---+-----+----------+-------------+
| id|value|valueIndex|valueIndexVec|
+---+-----+----------+-------------+
|  0|    a|       0.0|(3,[0],[1.0])|
|  1|    b|       2.0|(3,[2],[1.0])|
|  2|    c|       1.0|(3,[1],[1.0])|
|  3|    a|       0.0|(3,[0],[1.0])|
|  4|    a|       0.0|(3,[0],[1.0])|
|  5|    c|       1.0|(3,[1],[1.0])|
|  6|    d|       3.0|    (3,[],[])|
+---+-----+----------+-------------+



# StringIndexer

```python
class pyspark.ml.feature.StringIndexer(
    *, inputCol=None, outputCol=None, inputCols=None, 
    outputCols=None, handleInvalid='error', 
    stringOrderType='frequencyDesc')
```

A label indexer that maps a string column of labels to an ML column of label indices. If the input column is numeric, we cast it to string and index the string values. The indices are in [0, numLabels). By default, this is ordered by label frequencies so the most frequent label gets index 0. The ordering behavior is controlled by setting stringOrderType. Its default value is ‘frequencyDesc’.

In [None]:
# Indexer from the "label" column to "indexed". Both options below are spelled out
# explicitly even though they match the defaults shown in the class signature.
stringIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexed",
    handleInvalid='error',
    stringOrderType="frequencyDesc",
)

handleInvalid = Param(parent='undefined', name='handleInvalid', doc="how to handle invalid data (unseen or NULL values) in features and label column of string type. Options are 'skip' (filter out rows with invalid data), error (throw an error), or 'keep' (put invalid data in a special additional bucket, at index numLabels).")

stringOrderType = Param(parent='undefined', name='stringOrderType', doc='How to order labels of string column. The first label after ordering is assigned an index of 0. Supported options: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc. Default is frequencyDesc. In case of equal frequency when under frequencyDesc/Asc, the strings are further sorted alphabetically')

In [None]:
# Example frame for the StringIndexer cells: "a" appears three times, "c" twice, "b" once.
stringIndDf = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "label"])

# Fit on that frame so `model` is the StringIndexerModel for "label" -> "indexed";
# otherwise `model` would still be whatever an earlier cell left behind.
model = stringIndexer.fit(stringIndDf)

td = model.transform(stringIndDf)
# Distinct (id, index) pairs ordered by id.
sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),
    key=lambda x: x[0])

In [None]:
# Map the numeric indices in "indexed" back to strings, using the fitted model's labels.
inverter = IndexToString(
    inputCol="indexed",
    outputCol="label2",
    labels=model.labels,
)
itd = inverter.transform(td)
# Distinct (id, recovered label) pairs ordered by id.
sorted(
    {(row[0], str(row[1])) for row in itd.select(itd.id, itd.label2).collect()},
    key=lambda pair: pair[0],
)

# Scratch directory for the save/load round-trips below. mkdtemp() returns a new,
# empty directory on every run, so the save targets start out non-existent.
temp_path = tempfile.mkdtemp()

# Round-trip the unfitted estimator.
stringIndexerPath = temp_path + "/string-indexer"
stringIndexer.save(stringIndexerPath)
loadedIndexer = StringIndexer.load(stringIndexerPath)
assert loadedIndexer.getHandleInvalid() == stringIndexer.getHandleInvalid()

# Round-trip the fitted model.
modelPath = temp_path + "/string-indexer-model"
model.save(modelPath)
loadedModel = StringIndexerModel.load(modelPath)
assert loadedModel.labels == model.labels

# Round-trip the IndexToString transformer.
indexToStringPath = temp_path + "/index-to-string"
inverter.save(indexToStringPath)
loadedInverter = IndexToString.load(indexToStringPath)
assert loadedInverter.getLabels() == inverter.getLabels()

# The reloaded model should index exactly like the in-memory one.
assert loadedModel.transform(stringIndDf).take(1) == model.transform(stringIndDf).take(1)

# Printed because a bare expression in the middle of a cell is not displayed.
print(stringIndexer.getStringOrderType())

# Re-index the same column, this time ordering labels with stringOrderType="alphabetDesc".
stringIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexed",
    handleInvalid="error",
    stringOrderType="alphabetDesc",
)
model = stringIndexer.fit(stringIndDf)
td = model.transform(stringIndDf)
sorted(
    {(row[0], row[1]) for row in td.select(td.id, td.indexed).collect()},
    key=lambda pair: pair[0],
)

# Build a model directly from an explicit label list instead of fitting one.
fromlabelsModel = StringIndexerModel.from_labels(
    ["a", "b", "c"],
    inputCol="label",
    outputCol="indexed",
    handleInvalid="error",
)
result = fromlabelsModel.transform(stringIndDf)
sorted(
    {(row[0], row[1]) for row in result.select(result.id, result.indexed).collect()},
    key=lambda pair: pair[0],
)

# Two string columns indexed in one pass via inputCols/outputCols.
# `spark.sparkContext` is used rather than a separate `sc` global, so this code needs
# only the `spark` session that the rest of the notebook already relies on.
testData = spark.sparkContext.parallelize([Row(id=0, label1="a", label2="e"),
                                           Row(id=1, label1="b", label2="f"),
                                           Row(id=2, label1="c", label2="e"),
                                           Row(id=3, label1="a", label2="f"),
                                           Row(id=4, label1="a", label2="f"),
                                           Row(id=5, label1="c", label2="f")], 3)
multiRowDf = spark.createDataFrame(testData)
inputs = ["label1", "label2"]
outputs = ["index1", "index2"]
stringIndexer = StringIndexer(inputCols=inputs, outputCols=outputs)
model = stringIndexer.fit(multiRowDf)
result = model.transform(multiRowDf)
# Printed because a bare expression in the middle of a cell is not displayed.
print(sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,
    result.index2).collect()]), key=lambda x: x[0]))

# Same indexing, but with the per-column label lists supplied explicitly instead of fitted.
fromlabelsModel = StringIndexerModel.from_arrays_of_labels([["a", "b", "c"], ["e", "f"]],
    inputCols=inputs, outputCols=outputs)
result = fromlabelsModel.transform(multiRowDf)
sorted(set([(i[0], i[1], i[2]) for i in result.select(result.id, result.index1,
    result.index2).collect()]), key=lambda x: x[0])
