In [3]:
from pyspark.sql import SparkSession

# Reuse the running Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Read the CSV (first row is the header, column types are inferred) and
# discard the '_c32' and 'id' columns so they are not picked up as model
# features further down.
df = (
    spark.read
    .csv('cancerdata_test.csv', header=True, inferSchema=True)
    .drop('_c32')
    .drop('id')
)

# Print the remaining column names and their inferred types.
df.printSchema()

root
 |-- diagnosis: string (nullable = true)
 |-- radius_mean: double (nullable = true)
 |-- texture_mean: double (nullable = true)
 |-- perimeter_mean: double (nullable = true)
 |-- area_mean: double (nullable = true)
 |-- smoothness_mean: double (nullable = true)
 |-- compactness_mean: double (nullable = true)
 |-- concavity_mean: double (nullable = true)
 |-- concave points_mean: double (nullable = true)
 |-- symmetry_mean: double (nullable = true)
 |-- fractal_dimension_mean: double (nullable = true)
 |-- radius_se: double (nullable = true)
 |-- texture_se: double (nullable = true)
 |-- perimeter_se: double (nullable = true)
 |-- area_se: double (nullable = true)
 |-- smoothness_se: double (nullable = true)
 |-- compactness_se: double (nullable = true)
 |-- concavity_se: double (nullable = true)
 |-- concave points_se: double (nullable = true)
 |-- symmetry_se: double (nullable = true)
 |-- fractal_dimension_se: double (nullable = true)
 |-- radius_worst: double (nullable = true)


In [2]:
from pyspark.ml.feature import VectorAssembler

# Every column except the 'diagnosis' target is used as a model input.
featureCols = [name for name in df.columns if name != 'diagnosis']

# Packs the input columns into a single vector column named 'features'.
va = VectorAssembler(inputCols=featureCols, outputCol='features')

In [10]:
# Append the assembled 'features' vector column to the loaded data.
vaDf = va.transform(df)

# Preview the feature vectors next to their diagnosis values.
(
    vaDf
    .select('features', 'diagnosis')
    .show()
)

+--------------------+---------+
|            features|diagnosis|
+--------------------+---------+
|[17.99,10.38,122....|        M|
|[20.57,17.77,132....|        M|
|[19.69,21.25,130....|        M|
|[11.42,20.38,77.5...|        M|
|[20.29,14.34,135....|        M|
|[12.45,15.7,82.57...|        M|
|[18.25,19.98,119....|        M|
|[13.71,20.83,90.2...|        M|
|[13.0,21.82,87.5,...|        M|
|[12.46,24.04,83.9...|        M|
|[16.02,23.24,102....|        M|
|[15.78,17.89,103....|        M|
|[19.17,24.8,132.4...|        M|
|[15.85,23.95,103....|        M|
|[13.73,22.61,93.6...|        M|
|[14.54,27.54,96.7...|        M|
|[14.68,20.13,94.7...|        M|
|[16.13,20.68,108....|        M|
|[19.81,22.15,130....|        M|
|[13.54,14.36,87.4...|        B|
+--------------------+---------+
only showing top 20 rows



In [11]:
# Scaling
from pyspark.ml.feature import StandardScaler

# NOTE(review): the scaler is fitted on every row of vaDf, i.e. before the
# train/test split performed later in the notebook, so held-out rows also
# contribute to the scaling statistics. Consider fitting on the training
# split only.
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

# Learn the scaling statistics, then write the 'scaledFeatures' column.
scalerModel = scaler.fit(vaDf)
scaledDf = scalerModel.transform(vaDf)




In [12]:
# Encoding
from pyspark.ml.feature import StringIndexer

# Map the string 'diagnosis' values to numeric indices in a 'label' column.
sInd = StringIndexer(inputCol='diagnosis', outputCol='label')
indexedDf = sInd.fit(scaledDf).transform(scaledDf)

# NOTE(review): this rebinds `df` (until now the loaded CSV frame) to the
# model-ready frame, so re-running earlier cells that read `df` will no
# longer see the original columns.
df = indexedDf.select('scaledFeatures', 'label')

# Show the rows without truncating the vector column.
df.show(truncate=False)



+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|scaledFeatures                                                                                                                                                                                                                                                                                                                                                                                                                              

In [30]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest classifier trained on the scaled feature vectors.
# The seed is fixed (it was None) so that training is repeatable and the
# evaluation result does not change from one notebook run to the next.
rf = RandomForestClassifier(
    featuresCol='scaledFeatures',
    labelCol='label',  # explicit: the column written by the StringIndexer step
    predictionCol='prediction',
    probabilityCol='probability',
    rawPredictionCol='rawPrediction',
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0.0,
    maxMemoryInMB=256,
    cacheNodeIds=False,
    checkpointInterval=1,
    impurity='gini',
    numTrees=10,
    featureSubsetStrategy='auto',
    seed=42,
    subsamplingRate=0.6)

In [31]:
# 80/20 train/test split. The seed is fixed so that, for the same input data
# and partitioning, the same rows land in each split on every run; without
# it the split (and therefore the reported accuracy) changes per execution.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [34]:
# Train the random forest on the training split.
rfModel = rf.fit(train)

In [35]:
# Score the held-out split with the trained model.
predictedDf = rfModel.transform(test)

# Inspect the 'rawPrediction' column of the scored rows.
(
    predictedDf
    .select('rawPrediction')
    .show()
)



+------------------+
|     rawPrediction|
+------------------+
| [1.0,6.0,0.0,3.0]|
| [0.0,7.0,0.0,3.0]|
|[0.0,10.0,0.0,0.0]|
|[0.0,10.0,0.0,0.0]|
| [6.0,4.0,0.0,0.0]|
|[10.0,0.0,0.0,0.0]|
| [1.0,7.0,0.0,2.0]|
| [9.0,1.0,0.0,0.0]|
|[10.0,0.0,0.0,0.0]|
|[10.0,0.0,0.0,0.0]|
| [9.0,0.0,1.0,0.0]|
| [7.0,1.0,2.0,0.0]|
| [9.0,0.0,1.0,0.0]|
+------------------+



In [38]:
# ----------------------------------------------------- Evaluator of model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Disabled alternative: a binary evaluator reporting area under the ROC curve.
# Enabling it would also require importing BinaryClassificationEvaluator.
# evaluator = BinaryClassificationEvaluator(
#     rawPredictionCol='rawPrediction',
#     labelCol='label',
#     metricName='areaUnderROC',
# )
# result = evaluator.evaluate(predictedDf)
# result

# Active evaluator: accuracy of the 'prediction' column against 'label'.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)



In [39]:
# Compute the evaluator's metric over the scored test rows.
result = evaluator.evaluate(predictedDf)

In [40]:
# Display the metric value returned by evaluator.evaluate(predictedDf).
result

0.8461538461538461