# Spark ML Preprocessing

In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml
import pyspark.sql.functions as spark_f
import pyspark.sql.types as spark_types
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
# Create the Spark session for this notebook (or reuse an existing one) and
# keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .appName('Spark Test App')
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# Read some test data. The CSV is read with a header row and the '_c0' column
# is dropped (presumably the unnamed row-index column of the file -- verify
# against the CSV).
df = (
    spark.read
    .csv('../Stat_Learning/data/Wage.csv', header=True)
    .drop('_c0')
)
df.toPandas().head()

Unnamed: 0,year,age,sex,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Male,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.31806333496276,75.0431540173515
1,2004,24,1. Male,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.25527250510331,70.4760196469445
2,2003,45,1. Male,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.8750612633917,130.982177377461
3,2003,43,1. Male,2. Married,3. Asian,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,1. Yes,5.04139268515823,154.68529299563
4,2005,50,1. Male,4. Divorced,1. White,2. HS Grad,2. Middle Atlantic,2. Information,1. <=Good,1. Yes,4.31806333496276,75.0431540173515


In [5]:
# Print the schema of the raw data. The CSV was read without schema inference,
# so every column is reported as string.
df.printSchema()

root
 |-- year: string (nullable = true)
 |-- age: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- maritl: string (nullable = true)
 |-- race: string (nullable = true)
 |-- education: string (nullable = true)
 |-- region: string (nullable = true)
 |-- jobclass: string (nullable = true)
 |-- health: string (nullable = true)
 |-- health_ins: string (nullable = true)
 |-- logwage: string (nullable = true)
 |-- wage: string (nullable = true)



### SQL Transformers

In [12]:
# SQLTransformer lets custom SQL act as a transformation stage in an ML pipeline;
# __THIS__ is the placeholder for the input DataFrame.
# Simple example: variable selection and type casting.
sql_transf = ml.feature.SQLTransformer(
    statement="""
    select cast(year as int) as year,
            cast(age as int) as age,
            sex,
            race,
            education,
            cast(wage as double) as wage
    from __THIS__
"""
)
df1 = sql_transf.transform(df)
df1.toPandas().head()

Unnamed: 0,year,age,sex,race,education,wage
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154
1,2004,24,1. Male,1. White,4. College Grad,70.47602
2,2003,45,1. Male,1. White,3. Some College,130.982177
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293
4,2005,50,1. Male,1. White,2. HS Grad,75.043154


In [13]:
# Print the schema of the SQL-transformed data to check the casts
# (year/age as integer, wage as double).
df1.printSchema()

root
 |-- year: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- race: string (nullable = true)
 |-- education: string (nullable = true)
 |-- wage: double (nullable = true)



### RFormula Transformer

In [15]:
# RFormula: high-level convenience transformation driven by an R-style formula.
# Note: categorical variables should be strings, measures should be numeric.
r_transf = ml.feature.RFormula(formula='wage ~ age')
(
    r_transf
    .fit(df1)
    .transform(df1)
    .toPandas()
    .head()
)

Unnamed: 0,year,age,sex,race,education,wage,features,label
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,[18.0],75.043154
1,2004,24,1. Male,1. White,4. College Grad,70.47602,[24.0],70.47602
2,2003,45,1. Male,1. White,3. Some College,130.982177,[45.0],130.982177
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,[43.0],154.685293
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,[50.0],75.043154


In [20]:
# Same approach with the string column 'race' added as a predictor; in the
# displayed rows it appears as extra indicator entries inside 'features'.
r_transf = ml.feature.RFormula(formula='wage ~ age + race')
(
    r_transf
    .fit(df1)
    .transform(df1)
    .toPandas()
    .head()
)

Unnamed: 0,year,age,sex,race,education,wage,features,label
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,"[18.0, 1.0, 0.0, 0.0]",75.043154
1,2004,24,1. Male,1. White,4. College Grad,70.47602,"[24.0, 1.0, 0.0, 0.0]",70.47602
2,2003,45,1. Male,1. White,3. Some College,130.982177,"[45.0, 1.0, 0.0, 0.0]",130.982177
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,"[43.0, 0.0, 0.0, 1.0]",154.685293
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,"[50.0, 1.0, 0.0, 0.0]",75.043154


### Vector Assembler

In [29]:
# Gather the 'age' and 'year' columns into a single vector column 'features'.
vec_assembler = ml.feature.VectorAssembler(
    inputCols=['age', 'year'],
    outputCol='features',
)
vec_assembler.transform(df1).toPandas().head()

Unnamed: 0,year,age,sex,race,education,wage,features
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,"[18.0, 2006.0]"
1,2004,24,1. Male,1. White,4. College Grad,70.47602,"[24.0, 2004.0]"
2,2003,45,1. Male,1. White,3. Some College,130.982177,"[45.0, 2003.0]"
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,"[43.0, 2003.0]"
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,"[50.0, 2005.0]"


### Binning

In [30]:
# Bin 'age' using the fixed split points 0, 19, 36, 61 and +inf; the bucket
# index is written to 'age_binned'.
bin_transf = ml.feature.Bucketizer(
    splits=[0, 19, 36, 61, float('inf')],
    inputCol='age',
    outputCol='age_binned',
)
bin_transf.transform(df1).toPandas().head()

Unnamed: 0,year,age,sex,race,education,wage,age_binned
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,0.0
1,2004,24,1. Male,1. White,4. College Grad,70.47602,1.0
2,2003,45,1. Male,1. White,3. Some College,130.982177,2.0
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,2.0
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,2.0


### Quantile Discretization

In [28]:
# Create wage groups by quantiles: the discretizer is fitted on 'wage' with
# 4 buckets and the resulting group index is written to 'wage_group'.
qd_transf = ml.feature.QuantileDiscretizer(
    numBuckets=4,
    inputCol='wage',
    outputCol='wage_group',
)
(
    qd_transf
    .fit(df1)
    .transform(df1)
    .toPandas()
    .head()
)

Unnamed: 0,year,age,sex,race,education,wage,wage_group
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,0.0
1,2004,24,1. Male,1. White,4. College Grad,70.47602,0.0
2,2003,45,1. Male,1. White,3. Some College,130.982177,3.0
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,3.0
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,0.0


### Scaling

In [34]:
# StandardScaler works on a vector column, so first assemble 'age' and 'year'
# into 'features' with the VectorAssembler created in the Vector Assembler section.
# (The name used here before, vec_transf, is never assigned in the cells above,
# so the cell failed with a NameError on a fresh kernel.)
df_in = vec_assembler.transform(df1)
# withMean=True additionally centers each feature before scaling.
scaler = ml.feature.StandardScaler(inputCol='features', outputCol='features_std', withMean=True)
scaler.fit(df_in).transform(df_in).toPandas().head()

Unnamed: 0,year,age,sex,race,education,wage,features,features_std
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,"[18.0, 2006.0]","[-2.115214756065826, 0.10315041366624039]"
1,2004,24,1. Male,1. White,4. College Grad,70.47602,"[24.0, 2004.0]","[-1.5953924414846345, -0.8839348845760848]"
2,2003,45,1. Male,1. White,3. Some College,130.982177,"[45.0, 2003.0]","[0.22398565954953556, -1.3774775336972473]"
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,"[43.0, 2003.0]","[0.05071155468913839, -1.3774775336972473]"
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,"[50.0, 2005.0]","[0.6571709217005285, -0.3903922354549222]"


### Encode Categorical Variables

In [41]:
# Indexing: replace each distinct 'race' string by a numeric index in 'race_index'.
indexer = ml.feature.StringIndexer(
    inputCol='race',
    outputCol='race_index',
)
df_index = (
    indexer
    .fit(df1)
    .transform(df1)
)
df_index.toPandas().head()

Unnamed: 0,year,age,sex,race,education,wage,race_index
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,0.0
1,2004,24,1. Male,1. White,4. College Grad,70.47602,0.0
2,2003,45,1. Male,1. White,3. Some College,130.982177,0.0
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,2.0
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,0.0


In [42]:
# Converting the index back: 'race_index' is mapped to its label text again
# and stored in 'race_text'.
index_to_string = ml.feature.IndexToString(
    inputCol='race_index',
    outputCol='race_text',
)
index_to_string.transform(df_index).toPandas().head()

Unnamed: 0,year,age,sex,race,education,wage,race_index,race_text
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,0.0,1. White
1,2004,24,1. Male,1. White,4. College Grad,70.47602,0.0,1. White
2,2003,45,1. Male,1. White,3. Some College,130.982177,0.0,1. White
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,2.0,3. Asian
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,0.0,1. White


In [45]:
# Indexing alone is adequate for ordinal categorical variables. 'race' has no
# natural order, so a second step is necessary: one-hot encoding of the index.
oh_encode = ml.feature.OneHotEncoder(inputCol='race_index', outputCol='race_enc')
# In Spark 3.x OneHotEncoder is an Estimator that must be fitted before it can
# transform; in Spark 2.x it is a plain Transformer without fit(). Support both
# so the cell runs regardless of the installed Spark version.
oh_model = oh_encode.fit(df_index) if hasattr(oh_encode, 'fit') else oh_encode
oh_model.transform(df_index).toPandas().head()

Unnamed: 0,year,age,sex,race,education,wage,race_index,race_enc
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,0.0,"(1.0, 0.0, 0.0)"
1,2004,24,1. Male,1. White,4. College Grad,70.47602,0.0,"(1.0, 0.0, 0.0)"
2,2003,45,1. Male,1. White,3. Some College,130.982177,0.0,"(1.0, 0.0, 0.0)"
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,2.0,"(0.0, 0.0, 1.0)"
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,0.0,"(1.0, 0.0, 0.0)"


### Polynomial Expansion

In [48]:
# PolynomialExpansion works on a vector column, so first assemble 'age' and
# 'year' into 'features' with the VectorAssembler created in the Vector Assembler section.
# (The name used here before, vec_transf, is never assigned in the cells above,
# so the cell failed with a NameError on a fresh kernel.)
df_in = vec_assembler.transform(df1)
# degree=2 adds the squares and the pairwise product of the input features.
poly2 = ml.feature.PolynomialExpansion(degree=2, inputCol='features', outputCol='features_poly')
poly2.transform(df_in).toPandas().head()

Unnamed: 0,year,age,sex,race,education,wage,features,features_poly
0,2006,18,1. Male,1. White,1. < HS Grad,75.043154,"[18.0, 2006.0]","[18.0, 324.0, 2006.0, 36108.0, 4024036.0]"
1,2004,24,1. Male,1. White,4. College Grad,70.47602,"[24.0, 2004.0]","[24.0, 576.0, 2004.0, 48096.0, 4016016.0]"
2,2003,45,1. Male,1. White,3. Some College,130.982177,"[45.0, 2003.0]","[45.0, 2025.0, 2003.0, 90135.0, 4012009.0]"
3,2003,43,1. Male,3. Asian,4. College Grad,154.685293,"[43.0, 2003.0]","[43.0, 1849.0, 2003.0, 86129.0, 4012009.0]"
4,2005,50,1. Male,1. White,2. HS Grad,75.043154,"[50.0, 2005.0]","[50.0, 2500.0, 2005.0, 100250.0, 4020025.0]"
