In [82]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import lit

In [83]:
# Obtain the Spark session used by every later cell (getOrCreate reuses a running one).
# NOTE(review): the app name mentions K-means, but the cells below train a random forest.
session_builder = SparkSession.builder.appName("Python Spark K-means")
spark = session_builder.getOrCreate()

In [84]:
# Load the iris CSV: first row is the header, column types are inferred,
# and the literal string 'NA' is read as null.
# A forward slash is used so the path works on every OS and contains no
# backslash escape sequence ("\i" is an invalid escape in a normal string).
df = spark.read.csv("datasets/iris.csv", header=True, inferSchema=True, nullValue='NA')

In [None]:
# Preview the first five rows, truncating long cell values.
df.show(n=5, truncate=True)

In [None]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

In [None]:
# Display the list of column names of the loaded frame.
df.columns

In [None]:
# Print the schema again.
# NOTE(review): `df` has not changed since the previous printSchema() cell, so this output is identical.
df.printSchema()

In [None]:
# Derive a binary "label" column: 1 when the first column is > 4 AND the
# second column is > 3, otherwise 0 (the boolean result is cast to integer).
# NOTE(review): columns are addressed by position; presumably these are
# sepal_length and sepal_width — confirm against the CSV header.
meets_threshold = (df[0] > 4) & (df[1] > 3)
df = df.withColumn("label", meets_threshold.cast('integer'))
df.show(n=5)

In [108]:
# The four measurement columns that are packed into a single vector column
# named "features".
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
assemble = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [109]:
# Apply the assembler to `df`; the result carries the original columns plus "features".
assembled_data=assemble.transform(df)

In [None]:
# Preview the first five rows of the assembled frame.
assembled_data.show(n=5)

In [None]:
# Remove the four raw measurement columns and the species column, then
# preview what is left.
columns_to_drop = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'Species']
assembled_data = assembled_data.drop(*columns_to_drop)
assembled_data.show(n=5)

In [112]:
# Split roughly 80% / 20% into training and test sets.
# The seed is fixed so a fresh "Restart & Run All" yields the same partition
# and therefore comparable metrics between runs.
(train, test) = assembled_data.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Fit a random forest on the training split, reading inputs from "features"
# and targets from "label". `rfc` ends up bound to the fitted model.
rfc = RandomForestClassifier(featuresCol="features", labelCol="label").fit(train)

# Score the test split and preview the first 20 scored rows.
pred = rfc.transform(test)
pred.show(n=20)

In [None]:
# Accuracy of "prediction" against "label" on the test split.
# metricName is set explicitly because the evaluator's default metric is F1,
# which would otherwise be printed under the "Accuracy" heading.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
acc = evaluator.evaluate(pred)
print("Prediction Accuracy: ", acc)

# Fetch both columns in a single action so every label stays paired with its
# own prediction, then unpack the Row objects into plain value lists.
label_prediction_rows = pred.select("label", "prediction").collect()
y_orig = [row["label"] for row in label_prediction_rows]
y_pred = [row["prediction"] for row in label_prediction_rows]

# Rows of the matrix are true labels, columns are predicted labels.
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)