# Sklearn and Spark Comparison

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session that runs locally with 5 worker threads
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("MySparkApp")
    .getOrCreate()
)

## Data Processing

Data is sourced from https://archive.ics.uci.edu/dataset/2/adult

Steps:
- Load data into spark dataframe
- Preprocess data (fill na)
- Create embeddings for categorical values
- Assemble into a single feature vector

### Load Data

In [2]:
# Load the raw CSV (the file has no header row) and let Spark infer column types
df = spark.read.csv(path='data_a/adult.data', header=False, inferSchema=True)

# Replace missing values with 0.
# NOTE: a numeric fill value is only applied to numeric columns.
df = df.na.fill(0)

df.show(n=5)

+---+-----------------+--------+----------+----+-------------------+------------------+--------------+------+-------+------+----+----+--------------+------+
|_c0|              _c1|     _c2|       _c3| _c4|                _c5|               _c6|           _c7|   _c8|    _c9|  _c10|_c11|_c12|          _c13|  _c14|
+---+-----------------+--------+----------+----+-------------------+------------------+--------------+------+-------+------+----+----+--------------+------+
| 39|        State-gov| 77516.0| Bachelors|13.0|      Never-married|      Adm-clerical| Not-in-family| White|   Male|2174.0| 0.0|40.0| United-States| <=50K|
| 50| Self-emp-not-inc| 83311.0| Bachelors|13.0| Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|   0.0| 0.0|13.0| United-States| <=50K|
| 38|          Private|215646.0|   HS-grad| 9.0|           Divorced| Handlers-cleaners| Not-in-family| White|   Male|   0.0| 0.0|40.0| United-States| <=50K|
| 53|          Private|234721.0|      11th| 7.0| Married-c

In [3]:
# The last column holds the label; every column before it is a feature
all_columns = df.columns
feature_cols, label_col = all_columns[:-1], all_columns[-1]

print("Feature columns: ", feature_cols)
print("Label column: ", label_col)

Feature columns:  ['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9', '_c10', '_c11', '_c12', '_c13']
Label column:  _c14


### Get embeddings

In [4]:
# Dependencies
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

In [5]:
# String indexers for categorical columns.
# Column types are looked up once from df.dtypes instead of running a
# select() per column, and the loop variable is not called `col` so it
# cannot be confused with pyspark.sql.functions.col imported in this notebook.
dtype_by_name = dict(df.dtypes)
str_cols = [name for name in feature_cols if dtype_by_name[name] == 'string']
feature_indexers = [
    StringIndexer(inputCol=name, outputCol=name + '_index')
    for name in str_cols
]

# Get new feature column names: the indexed string columns first,
# followed by the feature columns that were not strings.
other_cols = [name for name in feature_cols if name not in str_cols]
feature_cols_indexed = [indexer.getOutputCol() for indexer in feature_indexers] + other_cols

In [6]:
# Indexer that maps the label column's values to a numeric "indexedLabel" column
labelIndexer = StringIndexer(
    inputCol=label_col,
    outputCol="indexedLabel",
)

In [7]:
# Fit every indexer (features + label) as one pipeline, then apply it to the data
indexing_stages = feature_indexers + [labelIndexer]
indexing_model = Pipeline(stages=indexing_stages).fit(df)
df_indexed = indexing_model.transform(df)

In [8]:
# Preview the label and the indexed feature columns
indexed_preview_cols = ["indexedLabel"] + feature_cols_indexed
df_indexed.select(indexed_preview_cols).show(n=5, truncate=False)

+------------+---------+---------+---------+---------+---------+---------+---------+----------+---+--------+----+------+----+----+
|indexedLabel|_c1_index|_c3_index|_c5_index|_c6_index|_c7_index|_c8_index|_c9_index|_c13_index|_c0|_c2     |_c4 |_c10  |_c11|_c12|
+------------+---------+---------+---------+---------+---------+---------+---------+----------+---+--------+----+------+----+----+
|0.0         |4.0      |2.0      |1.0      |3.0      |1.0      |0.0      |0.0      |0.0       |39 |77516.0 |13.0|2174.0|0.0 |40.0|
|0.0         |1.0      |2.0      |0.0      |2.0      |0.0      |0.0      |0.0      |0.0       |50 |83311.0 |13.0|0.0   |0.0 |13.0|
|0.0         |0.0      |0.0      |2.0      |9.0      |1.0      |0.0      |0.0      |0.0       |38 |215646.0|9.0 |0.0   |0.0 |40.0|
|0.0         |0.0      |5.0      |0.0      |9.0      |0.0      |1.0      |0.0      |0.0       |53 |234721.0|7.0 |0.0   |0.0 |40.0|
|0.0         |0.0      |2.0      |0.0      |0.0      |4.0      |1.0      |1.0      

In [9]:
# Set all columns to integer type
for column_name in feature_cols_indexed + ["indexedLabel"]:
    df_indexed = df_indexed.withColumn(column_name, col(column_name).cast(IntegerType()))

In [10]:
# Preview the same columns again after the integer cast
df_indexed[["indexedLabel", *feature_cols_indexed]].show(n=5, truncate=False)

+------------+---------+---------+---------+---------+---------+---------+---------+----------+---+------+---+----+----+----+
|indexedLabel|_c1_index|_c3_index|_c5_index|_c6_index|_c7_index|_c8_index|_c9_index|_c13_index|_c0|_c2   |_c4|_c10|_c11|_c12|
+------------+---------+---------+---------+---------+---------+---------+---------+----------+---+------+---+----+----+----+
|0           |4        |2        |1        |3        |1        |0        |0        |0         |39 |77516 |13 |2174|0   |40  |
|0           |1        |2        |0        |2        |0        |0        |0        |0         |50 |83311 |13 |0   |0   |13  |
|0           |0        |0        |2        |9        |1        |0        |0        |0         |38 |215646|9  |0   |0   |40  |
|0           |0        |5        |0        |9        |0        |1        |0        |0         |53 |234721|7  |0   |0   |40  |
|0           |0        |2        |0        |0        |4        |1        |1        |9         |28 |338409|13 |0   |0  

### Assemble

In [11]:
# Assembler that packs every feature column into a single "features" vector column
assembler = VectorAssembler(
    inputCols=feature_cols_indexed,
    outputCol="features",
)

In [12]:
# Add the feature vector, then keep only the two columns used for training
df_with_features = assembler.transform(df_indexed)
df_assembled = df_with_features.select("features", "indexedLabel")

In [13]:
# Preview the assembled feature vectors without truncating them
df_assembled.show(n=5, truncate=False)

+-------------------------------------------------------------------+------------+
|features                                                           |indexedLabel|
+-------------------------------------------------------------------+------------+
|[4.0,2.0,1.0,3.0,1.0,0.0,0.0,0.0,39.0,77516.0,13.0,2174.0,0.0,40.0]|0           |
|(14,[0,1,3,8,9,10,13],[1.0,2.0,2.0,50.0,83311.0,13.0,13.0])        |0           |
|(14,[2,3,4,8,9,10,13],[2.0,9.0,1.0,38.0,215646.0,9.0,40.0])        |0           |
|(14,[1,3,5,8,9,10,13],[5.0,9.0,1.0,53.0,234721.0,7.0,40.0])        |0           |
|[0.0,2.0,0.0,0.0,4.0,1.0,1.0,9.0,28.0,338409.0,13.0,0.0,0.0,40.0]  |0           |
+-------------------------------------------------------------------+------------+
only showing top 5 rows



In [14]:
# Number of extra copies of the dataset appended to inflate the training set
N_EXTRA_COPIES = 250

# Start from an aliased copy of the assembled data
df_train = df_assembled.alias('df_train')

# Bloat the dataframe by unioning the assembled data onto it repeatedly
for _copy in range(N_EXTRA_COPIES):
    df_train = df_train.union(df_assembled)

## Cross validation spark

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import time

# Create a DecisionTree model
tree = DecisionTreeClassifier(labelCol='indexedLabel', featuresCol='features')

# Create a pipeline with the DecisionTree model
pipeline = Pipeline(stages=[tree])

# Define the parameter grid for cross-validation (four candidate tree depths)
paramGrid = (
    ParamGridBuilder()
    .addGrid(tree.maxDepth, [1, 5, 10, 20])
    .build()
)

# Create a CrossValidator with a 5-fold cross-validation.
# The explicit seed makes the fold assignment reproducible between runs.
evaluator = MulticlassClassificationEvaluator(
    labelCol='indexedLabel',
    predictionCol='prediction',
    metricName='accuracy',
)
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

# Time the fit with perf_counter(): it is monotonic and high-resolution,
# unlike time.time(), which can jump if the system clock is adjusted.
dt = time.perf_counter()
# NOTE: after this line `crossval` refers to the fitted model returned by fit(),
# no longer to the CrossValidator estimator.
crossval = crossval.fit(df_train)
dt = time.perf_counter() - dt

print(f'Performance MLlib: {dt} seconds')

## Cross validation sklearn

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import time

import numpy as np

# Create the DecisionTree model (fixed random_state for reproducible results).
# NOTE: this import rebinds `DecisionTreeClassifier` and `tree`, which the
# Spark cross-validation cell also defines.
tree = DecisionTreeClassifier(random_state=42)

# Separate your features (X) and target variable (y).
# df_train is a Spark DataFrame, so it cannot be passed to scikit-learn directly:
# collect the rows to the driver once, then convert the ML vectors and the labels
# into NumPy arrays.
# NOTE: this materializes the whole (bloated) dataset in driver memory.
rows = df_train.select('features', 'indexedLabel').collect()
X = np.array([row['features'].toArray() for row in rows])
y = np.array([row['indexedLabel'] for row in rows])

# Fit the model to the data and calculate performance.
# Only the cross-validation itself is timed, not the conversion above.
dt = time.perf_counter()
scores = cross_val_score(tree, X, y, cv=5, scoring='accuracy')
dt = time.perf_counter() - dt

print(f'Performance scikit-learn: {dt} seconds')