# Lab 1C - Encoding Categorical Features in Spark

In [1]:
import findspark

# Show the Spark installation that findspark locates on its own.
print(findspark.find())

# Raw string literal: the Windows path contains backslashes, and a normal string
# would treat "\S" and "\s" as (invalid) escape sequences, which newer Python
# versions warn about. The resulting path value is unchanged.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
findspark.init(r'C:\Spark\spark-3.0.3-bin-hadoop2.7')

C:\Spark\spark-3.0.3-bin-hadoop2.7


In [7]:
from pyspark.sql.session import SparkSession

# Create (or reuse) the SparkSession used by every cell below.
# NOTE: despite its name, `sc1` is a SparkSession, not a SparkContext.
sc1 = (
    SparkSession.builder
    .appName("Lab-01_C_Encoding_categorical_features")
    .getOrCreate()
)
sc1

In [10]:
from pyspark.ml.feature import StringIndexer

In [11]:
# Small demo DataFrame: one integer id and one string category per row.
df = sc1.createDataFrame(
    data=[
        (0, "a"),
        (1, "b"),
        (2, "c"),
        (3, "a"),
        (4, "a"),
        (5, "c"),
    ],
    schema=["id", "category"],
)

df.show()

+---+--------+
| id|category|
+---+--------+
|  0|       a|
|  1|       b|
|  2|       c|
|  3|       a|
|  4|       a|
|  5|       c|
+---+--------+



# StringIndexer <br>
StringIndexer encodes a string column of labels to a column of label indices. <br>
Four ordering options are supported (the default is “frequencyDesc”): <br>
1. “frequencyDesc”: descending order by label frequency (most frequent label assigned 0);
2. “frequencyAsc”: ascending order by label frequency (least frequent label assigned 0);
3. “alphabetDesc”: descending alphabetical order;
4. “alphabetAsc”: ascending alphabetical order. <br>

Note that when labels have equal frequency under “frequencyDesc”/“frequencyAsc”, the tied strings are further sorted alphabetically.

In [13]:
# Map each string label in `category` to a numeric index, assigning indices in
# descending alphabetical order of the labels.
indexer = StringIndexer(
    inputCol="category",
    outputCol="categoryIndex",
    stringOrderType="alphabetDesc",
)

# Fit learns the label -> index mapping; transform appends the index column.
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)
indexed.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          2.0|
|  1|       b|          1.0|
|  2|       c|          0.0|
|  3|       a|          2.0|
|  4|       a|          2.0|
|  5|       c|          0.0|
+---+--------+-------------+



# OneHotEncoder <br>

One-hot encoding maps a categorical feature, represented as a label index, to a binary vector with at most a single one-value indicating the presence of a specific feature value from among the set of all feature values.

For string type input data, it is common to encode categorical features using StringIndexer first.

In [14]:
from pyspark.ml.feature import OneHotEncoder

In [16]:
# Two columns of already-indexed categories to feed the OneHotEncoder.
# Note: this rebinds `df`, replacing the id/category DataFrame created earlier.
df = sc1.createDataFrame(
    data=[
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0),
    ],
    schema=["categoryIndex1", "categoryIndex2"],
)

df.show()

+--------------+--------------+
|categoryIndex1|categoryIndex2|
+--------------+--------------+
|           0.0|           1.0|
|           1.0|           0.0|
|           2.0|           1.0|
|           0.0|           2.0|
|           0.0|           1.0|
|           2.0|           0.0|
+--------------+--------------+



In [23]:
# One-hot encode both index columns at once; each input column gets its own
# output vector column.
encoder = OneHotEncoder(
    inputCols=["categoryIndex1", "categoryIndex2"],
    outputCols=["categoryVec1", "categoryVec2"],
)

# Fit the encoder on the data, then append the encoded vector columns.
model = encoder.fit(df)
encoded = model.transform(df)


Each encoded vector is displayed in sparse format as three values. <br>
1. The first value is the length of the vector.
2. The second value is an array of the indices (positions) where non-zero entries are found.
3. The third value is an array of the values stored at the indices listed in 2.

<br>
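Example: (2, [1], [1.0]) denotes a vector of length 2 with the value 1.0 at index 1 (indices start at 0). Therefore, the one-hot vector is '01'. By default, OneHotEncoder drops the last category (dropLast=True), so with three categories the vectors have length 2 and the last category index is encoded as the all-zeros vector (2, [], []), i.e. '00'.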

In [29]:
# Display the input index columns alongside their one-hot encoded vector columns.
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



In [33]:
# One-hot encode the single `categoryIndex` column of the `indexed` DataFrame
# (the StringIndexer output from earlier in the notebook).
encoder1 = OneHotEncoder(
    inputCol="categoryIndex",
    outputCol="One-hot-vector",
)

# Fit on the indexed data, then append the one-hot vector column.
model1 = encoder1.fit(indexed)
encoded1 = model1.transform(indexed)

In [36]:
# Display the string-indexed DataFrame together with its one-hot vector column.
encoded1.show()

+---+--------+-------------+--------------+
| id|category|categoryIndex|One-hot-vector|
+---+--------+-------------+--------------+
|  0|       a|          2.0|     (2,[],[])|
|  1|       b|          1.0| (2,[1],[1.0])|
|  2|       c|          0.0| (2,[0],[1.0])|
|  3|       a|          2.0|     (2,[],[])|
|  4|       a|          2.0|     (2,[],[])|
|  5|       c|          0.0| (2,[0],[1.0])|
+---+--------+-------------+--------------+



In [37]:
# Stop the SparkSession now that the lab is finished.
sc1.stop()