# Sklearn and Spark Comparison

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook on a local master
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("MySparkApp")
    .getOrCreate()
)

## Data Processing

Data is sourced from https://archive.ics.uci.edu/dataset/2/adult

Steps:
- Load data into spark dataframe
- Preprocess data (fill na)
- Create embeddings for categorical values
- Assemble into a single feature vector

### Load Data

In [None]:
# Load the raw CSV (the file has no header row; column types are inferred),
# then replace nulls with 0 in one chained expression.
# NOTE(review): per the PySpark docs a numeric fill value is only applied to
# numeric columns, so nulls in string columns would remain - confirm intended.
df = spark.read.csv(
    'data_a/adult.data',
    header=False,
    inferSchema=True,
).na.fill(0)

# Preview the first five rows
df.show(n=5)

In [None]:
# Split the column list: the last column is the label, all others are features
*feature_cols, label_col = df.columns

print("Feature columns: ", feature_cols)
print("Label column: ", label_col)

### Get embeddings

In [None]:
# Dependencies
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

In [None]:
# Look up every column's type name once, instead of building a one-column
# DataFrame per feature just to read its dtype.
dtype_by_name = dict(df.dtypes)

# String indexers for categorical (string-typed) feature columns.
# The loop variable is deliberately not called `col`, so it cannot be confused
# with the `col` function imported from pyspark.sql.functions.
str_cols = [name for name in feature_cols if dtype_by_name[name] == 'string']
feature_indexers = [
    StringIndexer(inputCol=name, outputCol=name + '_index')
    for name in str_cols
]

# Get new feature column names: indexed outputs first, then the untouched
# non-string features in their original order
str_col_set = set(str_cols)
feature_cols_indexed = (
    [indexer.getOutputCol() for indexer in feature_indexers]
    + [name for name in feature_cols if name not in str_col_set]
)

In [None]:
# Indexer that maps the label column into a new "indexedLabel" column
labelIndexer = StringIndexer(
    inputCol=label_col,
    outputCol="indexedLabel",
)

In [None]:
# Fit all indexers (features first, then the label) as one pipeline on the
# raw data, then apply the fitted pipeline to that same DataFrame
indexing_stages = [*feature_indexers, labelIndexer]
indexing_model = Pipeline(stages=indexing_stages).fit(df)
df_indexed = indexing_model.transform(df)

In [None]:
# Preview the label and the indexed feature columns without truncating values
preview_cols = ["indexedLabel", *feature_cols_indexed]
df_indexed.select(preview_cols).show(n=5, truncate=False)

In [None]:
# Set all feature columns and the label to integer type in one projection.
# Calling withColumn in a loop adds a projection to the query plan for every
# column; a single select keeps the plan flat. Iterating over
# df_indexed.columns keeps every existing column in its original position.
int_columns = set(feature_cols_indexed) | {"indexedLabel"}
df_indexed = df_indexed.select([
    col(name).cast(IntegerType()).alias(name) if name in int_columns else col(name)
    for name in df_indexed.columns
])

In [None]:
# Preview the label and feature columns again to inspect their current values
preview_cols = ["indexedLabel", *feature_cols_indexed]
df_indexed.select(preview_cols).show(n=5, truncate=False)

### Assemble

In [None]:
# Combine the feature columns into a single vector column named "features"
assembler = VectorAssembler(
    inputCols=feature_cols_indexed,
    outputCol="features",
)

In [None]:
# Build the feature vectors, then keep only the two columns used for training
assembled_all = assembler.transform(df_indexed)
df_assembled = assembled_all.select("features", "indexedLabel")

In [None]:
# Preview the assembled feature vectors and labels without truncation
df_assembled.show(n=5, truncate=False)

In [None]:
# Start the training set from an aliased view of the assembled data
df_train = df_assembled.alias('df_train')

# Inflate the training set for the benchmark: each pass appends one more full
# copy of the assembled data, giving 1 + extra_copies copies in total
extra_copies = 250
for _copy in range(extra_copies):
    df_train = df_train.union(df_assembled)

## Cross validation spark

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import time

# Create a DecisionTree model reading the assembled features and indexed label
tree = DecisionTreeClassifier(labelCol='indexedLabel', featuresCol='features')

# Create a pipeline with the DecisionTree model as its only stage
pipeline = Pipeline(stages=[tree])

# Define the parameter grid for cross-validation: four candidate tree depths
paramGrid = (
    ParamGridBuilder()
    .addGrid(tree.maxDepth, [1, 5, 10, 20])
    .build()
)

# Create a CrossValidator with a 5-fold cross-validation, scored by accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol='indexedLabel',
    predictionCol='prediction',
    metricName='accuracy',
)
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)

# Time the fit with perf_counter: unlike time.time(), it is a monotonic,
# high-resolution clock that is not affected by system clock adjustments.
start = time.perf_counter()
# NOTE: `crossval` is rebound here from the estimator to the fitted model
# returned by fit(); the name is kept so later cells see the same binding.
crossval = crossval.fit(df_train)
dt = time.perf_counter() - start

print(f'Performance MLlib: {dt} seconds')

## Cross validation sklearn

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import time

# NumPy is needed to hand the Spark data to scikit-learn as plain arrays
import numpy as np

# Create the DecisionTree model
# NOTE(review): the Spark cell tunes maxDepth over four values, while this is
# a single default tree, so the two timings are not like-for-like - confirm
# whether that is intended.
tree = DecisionTreeClassifier()

# Separate your features (X) and target variable (y).
# df_train is a Spark DataFrame, so df_train['indexedLabel'] is only a Column
# expression and carries no data; the rows must be collected to the driver.
# Collecting once and splitting locally keeps X and y aligned row by row.
# This materialises the whole (bloated) dataset in driver memory.
rows = df_train.select('features', 'indexedLabel').collect()
# Each 'features' value is a Spark ML vector; toArray() gives a dense 1-D array
X = np.array([row['features'].toArray() for row in rows])
y = np.array([row['indexedLabel'] for row in rows])

# Fit the model to the data and calculate performance.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
scores = cross_val_score(tree, X, y, cv=5, scoring='accuracy')
dt = time.perf_counter() - start

print(f'Performance scikit-learn: {dt} seconds')