In [1]:
# Imports: Spark session and ML utilities, scikit-learn dataset and metrics, pandas
from pyspark.sql               import SparkSession
from pyspark                   import SparkContext
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation     import MulticlassClassificationEvaluator
from pyspark.ml.feature        import VectorAssembler
from sklearn.metrics           import confusion_matrix
from sklearn.datasets          import load_breast_cancer
from pyspark.sql               import SQLContext
import pandas as pd

# Create a Spark Session
# "local[1]" runs Spark in-process on one worker thread; getOrCreate() hands
# back an already-active session when there is one.
spark1 = (
    SparkSession.builder
    .master("local[1]")
    .appName("https://mfu.ac.th/")
    .getOrCreate()
)

# Preparing the data - breast cancer data from Scikit-learn dataset module
bc = load_breast_cancer()
df_bc = pd.DataFrame(bc.data, columns=bc.feature_names)
df_bc['label'] = pd.Series(bc.target)
print(df_bc)

# Convert the pandas frame to a Spark DataFrame through the session itself.
# SparkSession is the DataFrame entry point; SQLContext is deprecated since
# Spark 3.0 and its constructor expects a SparkContext, not a SparkSession.
data = spark1.createDataFrame(df_bc)
# printSchema() writes the schema to stdout and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
data.printSchema()

# Pack every measurement column into one vector column named 'features',
# then keep only that vector and the 'label' column.
features = bc.feature_names
va = VectorAssembler(outputCol='features', inputCols=features)
va_df = va.transform(data).select('features', 'label')
# Preview the first three assembled rows.
va_df.show(n=3)
# split data into the train and test parts
# A fixed seed keeps the (approximately) 90/10 split, and therefore the
# metrics reported below, repeatable from run to run.
(train, test) = va_df.randomSplit([0.9, 0.1], seed=42)
# Prediction and Accuracy Check
# The unfitted estimator gets its own name; `lsvc` is the fitted model that
# is used to score the held-out rows.
svc_estimator = LinearSVC(labelCol="label", maxIter=50)
lsvc = svc_estimator.fit(train)
pred = lsvc.transform(test)
pred.show(5)

# check the prediction accuracy
# Column names are spelled out so the evaluator's inputs are visible here
# (they match the columns produced by the model's transform()).
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
acc = evaluator.evaluate(pred)
print("Prediction Accuracy: ", acc)

# Fetch label and prediction together in a single Spark job so every pair
# comes from the same row, then unpack the Row objects into flat int lists,
# which is the 1-D input scikit-learn expects.
rows = pred.select("label", "prediction").collect()
y_orig = [int(row["label"]) for row in rows]
y_pred = [int(row["prediction"]) for row in rows]
# Pin the class order so the matrix stays 2x2 even if the small test split
# happens to contain only one class.
cm = confusion_matrix(y_orig, y_pred, labels=[0, 1])
print("Confusion Matrix:")
print(cm)

25/02/19 13:56:59 WARN Utils: Your hostname, Jennie-Kims-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.27.15.180 instead (on interface en0)
25/02/19 13:56:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/19 13:57:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/19 13:57:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/02/19 13:57:00 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/02/19 13:57:00 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea



root
 |-- mean radius: double (nullable = true)
 |-- mean texture: double (nullable = true)
 |-- mean perimeter: double (nullable = true)
 |-- mean area: double (nullable = true)
 |-- mean smoothness: double (nullable = true)
 |-- mean compactness: double (nullable = true)
 |-- mean concavity: double (nullable = true)
 |-- mean concave points: double (nullable = true)
 |-- mean symmetry: double (nullable = true)
 |-- mean fractal dimension: double (nullable = true)
 |-- radius error: double (nullable = true)
 |-- texture error: double (nullable = true)
 |-- perimeter error: double (nullable = true)
 |-- area error: double (nullable = true)
 |-- smoothness error: double (nullable = true)
 |-- compactness error: double (nullable = true)
 |-- concavity error: double (nullable = true)
 |-- concave points error: double (nullable = true)
 |-- symmetry error: double (nullable = true)
 |-- fractal dimension error: double (nullable = true)
 |-- worst radius: double (nullable = true)
 |-- worst 

25/02/19 13:57:02 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[17.99,10.38,122....|    0|
|[20.57,17.77,132....|    0|
|[19.69,21.25,130....|    0|
+--------------------+-----+
only showing top 3 rows



25/02/19 13:57:05 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


CodeCache: size=131072Kb used=30063Kb max_used=30063Kb free=101008Kb
 bounds [0x00000001047d0000, 0x0000000106560000, 0x000000010c7d0000]
 total_blobs=11833 nmethods=10759 adapters=986
 compilation: disabled (not enough contiguous free space left)
+--------------------+-----+--------------------+----------+
|            features|label|       rawPrediction|prediction|
+--------------------+-----+--------------------+----------+
|[6.981,13.43,43.7...|    1|[-12.724038093182...|       1.0|
|[8.196,16.84,51.7...|    1|[-11.300632658538...|       1.0|
|[8.571,13.1,54.53...|    1|[-8.2453471596250...|       1.0|
|[9.295,13.9,59.96...|    1|[-7.2933404710572...|       1.0|
|[9.504,12.44,60.3...|    1|[-10.708527701857...|       1.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows

Prediction Accuracy:  0.9558823529411765
Confusion Matrix:
[[21  2]
 [ 1 44]]
