We can use MultilayerPerceptronClassifier. It is easy to use: just import it from pyspark.ml.classification. It is based on the feedforward artificial neural network. Nodes in the input layer represent the input data. The remaining nodes map the inputs to the outputs by taking a linear combination of the inputs with the node’s weights w and bias b and then applying an activation function.
Features :
1. It is based on the multi-layer perceptron.
2. The sigmoid activation function is used in each hidden layer, and the softmax activation function is used in the output layer.
3. It uses the logistic loss function for optimization and the L-BFGS solver as the optimization routine, but you can choose another solver such as gd.

In [1]:
# Initialise findspark before the pyspark imports in the next cell.
# NOTE(review): presumably needed because pyspark is not otherwise on
# sys.path in this environment — confirm against the local setup.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Imputer, MinMaxScaler
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import *

In [3]:
# Create the SparkSession for this notebook, or reuse one that already exists.
# The application name identifies the job in the Spark UI, so it is named
# after the model trained in this notebook (a multilayer perceptron).
spark = (
    SparkSession.builder
    .appName("MultilayerPerceptronWithSpark")
    .getOrCreate()
)

In [6]:
# Load the banknote-authentication CSV, taking column names from its first
# row. No schema is supplied or inferred, so the columns arrive as strings
# (note the literal 'na' among the values); they are cast to double later.
csv_reader = spark.read.option("header", True)
dataset = csv_reader.csv("data_banknote_authentication.csv")

# Preview the first rows, then print the column names and types.
dataset.show()
dataset.printSchema()

+---------+---------+---------+---------+------+
|feature_1|feature_2|feature_3|feature_4|Class |
+---------+---------+---------+---------+------+
|   3.6216|   8.6661|  -2.8073| -0.44699|     0|
|   4.5459|   8.1674|  -2.4586|  -1.4621|     0|
|    3.866|  -2.6383|   1.9242|  0.10645|     0|
|   3.4566|   9.5228|  -4.0112|  -3.5944|     0|
|  0.32924|  -4.4552|   4.5718|  -0.9888|     0|
|       na|   9.6718|  -3.9606|  -3.1625|     0|
|   3.5912|   3.0129|  0.72888|  0.56421|     0|
|   2.0922|    -6.81|   8.4636| -0.60216|     0|
|   3.2032|   5.7588| -0.75345| -0.61251|     0|
|   1.5356|   9.1772|  -2.2718| -0.73535|     0|
|   1.2247|   8.7779|  -2.2135| -0.80647|     0|
|   3.9899|  -2.7066|   2.3946|  0.86291|     0|
|   1.8993|   7.6625|  0.15394|  -3.1108|     0|
|  -1.5768|   10.843|   2.5462|  -2.9362|     0|
|    3.404|   8.7261|  -2.9915| -0.57242|     0|
|   4.6765|  -3.3895|   3.4896|   1.4771|     0|
|   2.6719|   3.0646|  0.37158|  0.58619|     0|
|  0.80355|   2.8473

In [7]:
# Cast every column (the features and the label) to double in a single
# projection. Entries that are not numeric, such as the 'na' placeholder,
# become null and are filled in by the imputation step.
# The comprehension keeps its variable local, so the name `col` provided by
# `from pyspark.sql.functions import *` is not overwritten.
dataset = dataset.select(
    [dataset[name].cast('double').alias(name) for name in dataset.columns]
)

In [8]:
# Feature columns: every column except the last one ('Class '), which holds the label.
input_cols = dataset.columns[:-1]

In [9]:
# Names of the imputed output columns: f_1 ... f_n, one per feature column.
imputed_col = ['f_{}'.format(position) for position in range(1, len(input_cols) + 1)]

# Fill missing feature values with the mean of their column.
# `missingValue` is a float placeholder and is left at its documented default
# (NaN); Imputer treats null entries as missing regardless of that setting,
# which is what covers the nulls produced by casting 'na' to double.
model = Imputer(strategy='mean', inputCols=input_cols, outputCols=imputed_col).fit(dataset)
impute_data = model.transform(dataset)

In [10]:
# Preview the original columns side by side with their imputed counterparts (f_1 ... f_n).
impute_data.show()

+---------+---------+---------+---------+------+------------------+------------------+--------+--------+
|feature_1|feature_2|feature_3|feature_4|Class |               f_1|               f_2|     f_3|     f_4|
+---------+---------+---------+---------+------+------------------+------------------+--------+--------+
|   3.6216|   8.6661|  -2.8073| -0.44699|   0.0|            3.6216|            8.6661| -2.8073|-0.44699|
|   4.5459|   8.1674|  -2.4586|  -1.4621|   0.0|            4.5459|            8.1674| -2.4586| -1.4621|
|    3.866|  -2.6383|   1.9242|  0.10645|   0.0|             3.866|           -2.6383|  1.9242| 0.10645|
|   3.4566|   9.5228|  -4.0112|  -3.5944|   0.0|            3.4566|            9.5228| -4.0112| -3.5944|
|  0.32924|  -4.4552|   4.5718|  -0.9888|   0.0|           0.32924|           -4.4552|  4.5718| -0.9888|
|     null|   9.6718|  -3.9606|  -3.1625|   0.0|0.4308653338439095|            9.6718| -3.9606| -3.1625|
|   3.5912|   3.0129|  0.72888|  0.56421|   0.0|       

In [11]:
# Pack the imputed feature columns into one vector column, which is the
# input format the scaler and the classifier work on. With
# handleInvalid='error' the transform fails on invalid values instead of
# skipping or keeping them.
assembler = VectorAssembler(
    inputCols=imputed_col,
    outputCol='assembled_features',
    handleInvalid='error',
)
a_data = assembler.transform(impute_data)

In [12]:
# Rescale each feature to the [0, 1] range; the scaled vector is written to
# the 'features' column that the classifier reads.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so test rows influence the scaling bounds.
scaler = MinMaxScaler(
    inputCol='assembled_features',
    outputCol='features',
    min=0.0,
    max=1.0,
)
scaler_model = scaler.fit(a_data)
s_data = scaler_model.transform(a_data)

In [13]:
# Display the DataFrame's column names and types to confirm the new vector columns exist.
display(s_data)

DataFrame[feature_1: double, feature_2: double, feature_3: double, feature_4: double, Class : double, f_1: double, f_2: double, f_3: double, f_4: double, assembled_features: vector, features: vector]

In [14]:
# Intended to rename the label column to 'label'.
# NOTE(review): the actual column name is 'Class ' (with a trailing space), so
# 'Class' matches nothing and withColumnRenamed leaves the DataFrame unchanged;
# the following cells keep referring to 'Class '. Renaming for real would
# require updating every later use of 'Class ' in the same change.
s_data = s_data.withColumnRenamed('Class','label')

In [15]:
# Peek at the label column next to the scaled feature vector.
a = ['Class ', 'features']
s_data.select(a).show()

+------+--------------------+
|Class |            features|
+------+--------------------+
|   0.0|[0.76900388695382...|
|   0.0|[0.83565901535310...|
|   0.0|[0.78662859038429...|
|   0.0|[0.75710504871312...|
|   0.0|[0.53157807440740...|
|   0.0|[0.53890670112598...|
|   0.0|[0.76681161615068...|
|   0.0|[0.65871247358818...|
|   0.0|[0.73883131774224...|
|   0.0|[0.61857372592288...|
|   0.0|[0.59615343011055...|
|   0.0|[0.79556353619049...|
|   0.0|[0.64480164997223...|
|   0.0|[0.39412557961765...|
|   0.0|[0.75331184331032...|
|   0.0|[0.84507712610605...|
|   0.0|[0.70051705860718...|
|   0.0|[0.56578254692829...|
|   0.0|[0.61224931311251...|
|   0.0|[0.88587932414598...|
+------+--------------------+
only showing top 20 rows



In [16]:
# Peek at the label column next to the scaled feature vector.
# The list is deliberately not called `col`: that name is provided by
# `from pyspark.sql.functions import *` and would be overwritten.
preview_cols = ['Class ', 'features']
s_data.select(*preview_cols).show()

+------+--------------------+
|Class |            features|
+------+--------------------+
|   0.0|[0.76900388695382...|
|   0.0|[0.83565901535310...|
|   0.0|[0.78662859038429...|
|   0.0|[0.75710504871312...|
|   0.0|[0.53157807440740...|
|   0.0|[0.53890670112598...|
|   0.0|[0.76681161615068...|
|   0.0|[0.65871247358818...|
|   0.0|[0.73883131774224...|
|   0.0|[0.61857372592288...|
|   0.0|[0.59615343011055...|
|   0.0|[0.79556353619049...|
|   0.0|[0.64480164997223...|
|   0.0|[0.39412557961765...|
|   0.0|[0.75331184331032...|
|   0.0|[0.84507712610605...|
|   0.0|[0.70051705860718...|
|   0.0|[0.56578254692829...|
|   0.0|[0.61224931311251...|
|   0.0|[0.88587932414598...|
+------+--------------------+
only showing top 20 rows



In [17]:
# Keep only the label and the feature vector, then split the rows into
# training and test sets with 0.7 / 0.3 weights and a fixed seed.
labelled = s_data.select('Class ', 'features')
train_df, test_df = labelled.randomSplit([0.7, 0.3], seed=1213)

In [18]:
# Preview the first rows of the held-out test split.
test_df.show()

+------+--------------------+
|Class |            features|
+------+--------------------+
|   0.0|[0.19876107853954...|
|   0.0|[0.31320626816375...|
|   0.0|[0.32870360354513...|
|   0.0|[0.33488378801318...|
|   0.0|[0.36600826428401...|
|   0.0|[0.37781335410221...|
|   0.0|[0.38120993156365...|
|   0.0|[0.38276038624350...|
|   0.0|[0.38558726175280...|
|   0.0|[0.39912309167874...|
|   0.0|[0.39926732002105...|
|   0.0|[0.41408678219356...|
|   0.0|[0.41408678219356...|
|   0.0|[0.42568995233253...|
|   0.0|[0.42625244286754...|
|   0.0|[0.42711781292141...|
|   0.0|[0.44169352919542...|
|   0.0|[0.45695144552856...|
|   0.0|[0.47533911688986...|
|   0.0|[0.47912222630869...|
+------+--------------------+
only showing top 20 rows



In [19]:
# Configure the multilayer perceptron.
# layers: 4 inputs (one per feature), one hidden layer of 16 nodes, and
# 2 output nodes (one per class). Training uses the 'gd' solver for up to
# 1000 iterations with a fixed seed.
mlpc = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='Class ',
    layers=[4, 16, 2],
    maxIter=1000,
    blockSize=8,
    seed=7,
    solver='gd',
)
# Alternative configurations kept for reference:
#mlpc=MultilayerPerceptronClassifier( featuresCol='features',labelCol='Class ',layers = [4,32,16,2],maxIter=1500,blockSize=32,seed=7,solver='gd')
#mlpc=MultilayerPerceptronClassifier( featuresCol='features',labelCol='Class ',layers = [4,16,2],maxIter=500,blockSize=8,seed=7,solver='gd')

In [20]:
# Train the perceptron on the training split; `ann` is the fitted model.
ann = mlpc.fit(train_df)

In [21]:
# Score the fitted model on the test split using the F1 metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol='Class ',
    predictionCol='prediction',
    metricName='f1',
)

# transform() adds the model's prediction columns to the test rows.
pred = ann.transform(test_df)
ann_f1 = evaluator.evaluate(pred)

# Bare expression so the notebook displays the score.
ann_f1

0.8309986610958963

In [22]:
# Bare expression so the notebook displays the prediction DataFrame's columns and types.
pred

DataFrame[Class : double, features: vector, rawPrediction: vector, probability: vector, prediction: double]