In [1]:
pip install pyspark --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer,StandardScaler,VectorAssembler,VectorIndexer
from pyspark.ml.linalg import DenseVector
import sklearn
import gc
from sklearn.model_selection import train_test_split


In [5]:
# Build (or reuse, if one already exists) the SparkSession for this notebook,
# requesting 1G of memory and 4 cores per executor.
_builder = SparkSession.builder.appName('Apache Spark Tutorial')
_builder = _builder.config('spark.executor.memory', '1G')
_builder = _builder.config('spark.executor.cores', '4')
spark = _builder.getOrCreate()
spark

In [6]:
# Set the logging level on the session's SparkContext.
spark.sparkContext.setLogLevel('Info')

In [7]:
# Display the Spark version backing this session.
spark.version

'3.5.0'

In [8]:
# Load the iris dataset from Drive. Despite the .xls extension the file is
# parsed as CSV: the first row is used as the header and column types are
# inferred from the values.
url = '/content/drive/MyDrive/iris.xls'
data = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(url)
)
# Mark the DataFrame for caching, since the following cells query it repeatedly.
data.cache()

DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]

In [9]:
# Print the first five rows as a formatted table.
data.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [10]:
# Fetch the first five rows as a list of Row objects.
data.head(5)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='setosa'),
 Row(sepal_length=4.9, sepal_width=3.0, petal_length=1.4, petal_width=0.2, species='setosa'),
 Row(sepal_length=4.7, sepal_width=3.2, petal_length=1.3, petal_width=0.2, species='setosa'),
 Row(sepal_length=4.6, sepal_width=3.1, petal_length=1.5, petal_width=0.2, species='setosa'),
 Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, species='setosa')]

In [11]:
# Total number of rows in the dataset.
data.count()

150

In [12]:
# Print column names, inferred types and nullability.
data.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [13]:
# Count rows per species to check how the classes are balanced.
data.groupBy('species').count().show()

+----------+-----+
|   species|count|
+----------+-----+
| virginica|   50|
|versicolor|   50|
|    setosa|   50|
+----------+-----+



In [14]:
# Summary statistics (count, mean, stddev, min, max) for every column.
data.describe().show()

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  species|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|     NULL|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|     NULL|
|    min|               4.3|                2.0|               1.0|               0.1|   setosa|
|    max|               7.9|                4.4|               6.9|               2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+



In [15]:
# Encode the string `species` column as a numeric `species index` column so it
# can serve as the classification label.
SIndexer = StringIndexer(inputCol='species', outputCol='species index')

# `data` is reassigned by this cell, so on a re-run it would already contain
# `species index` and StringIndexer would refuse to write an output column that
# already exists. Dropping it first (a no-op on the first run) keeps the cell
# idempotent.
_species_source = data.drop('species index')
data = SIndexer.fit(_species_source).transform(_species_source)
data.show(5)

+------------+-----------+------------+-----------+-------+-------------+
|sepal_length|sepal_width|petal_length|petal_width|species|species index|
+------------+-----------+------------+-----------+-------+-------------+
|         5.1|        3.5|         1.4|        0.2| setosa|          0.0|
|         4.9|        3.0|         1.4|        0.2| setosa|          0.0|
|         4.7|        3.2|         1.3|        0.2| setosa|          0.0|
|         4.6|        3.1|         1.5|        0.2| setosa|          0.0|
|         5.0|        3.6|         1.4|        0.2| setosa|          0.0|
+------------+-----------+------------+-----------+-------+-------------+
only showing top 5 rows



In [16]:
# Keep only the numeric label and the four measurements. The label is placed
# first because the row-to-vector mapping that follows treats position 0 as
# the label and the remaining positions as features.
data1 = data.select(
    'species index',
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
)
data1.show(5)

+-------------+------------+-----------+------------+-----------+
|species index|sepal_length|sepal_width|petal_length|petal_width|
+-------------+------------+-----------+------------+-----------+
|          0.0|         5.1|        3.5|         1.4|        0.2|
|          0.0|         4.9|        3.0|         1.4|        0.2|
|          0.0|         4.7|        3.2|         1.3|        0.2|
|          0.0|         4.6|        3.1|         1.5|        0.2|
|          0.0|         5.0|        3.6|         1.4|        0.2|
+-------------+------------+-----------+------------+-----------+
only showing top 5 rows



In [17]:
# Turn each row into a (label, feature-vector) pair: element 0 is the label,
# the remaining elements are packed into a DenseVector.
# NOTE(review): VectorAssembler is imported and could do this without leaving
# the DataFrame API, but the following cell expects an RDD here.
input_data = data1.rdd.map(
    lambda row: (row[0], DenseVector(row[1:]))
)
input_data

PythonRDD[73] at RDD at PythonRDD.scala:53

In [18]:
# Convert the (label, vector) pairs back into a DataFrame with the column
# names the ML estimators below are configured to use.
data1_index = spark.createDataFrame(input_data, schema=['label', 'features'])
data1_index.show(5)

+-----+-----------------+
|label|         features|
+-----+-----------------+
|  0.0|[5.1,3.5,1.4,0.2]|
|  0.0|[4.9,3.0,1.4,0.2]|
|  0.0|[4.7,3.2,1.3,0.2]|
|  0.0|[4.6,3.1,1.5,0.2]|
|  0.0|[5.0,3.6,1.4,0.2]|
+-----+-----------------+
only showing top 5 rows



In [19]:
# Standardize the feature vectors with StandardScaler at its default settings,
# writing the result to a new `featured_scaled` column.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test rows contribute to the scaling statistics.
stdScaler = StandardScaler(
    inputCol='features',
    outputCol='featured_scaled',
)
scaler = stdScaler.fit(data1_index)                # learn scaling statistics
data1_scaled = scaler.transform(data1_index)       # append the scaled column
data1_scaled

DataFrame[label: double, features: vector, featured_scaled: vector]

In [20]:
# Drop the unscaled vectors; only the label and the scaled features are used
# from here on.
data1_scaled=data1_scaled.drop('features')
data1_scaled

DataFrame[label: double, featured_scaled: vector]

In [21]:
# Splitting the data into train and test sets

In [22]:
# Random 90/10 split into training and test sets; the fixed seed makes the
# split repeatable.
train_data, test_data = data1_scaled.randomSplit(weights=[0.9, 0.1], seed=12345)

In [23]:
# Peek at the first five training rows.
train_data.show(5)

+-----+--------------------+
|label|     featured_scaled|
+-----+--------------------+
|  0.0|[5.19282199176603...|
|  0.0|[5.31358529390013...|
|  0.0|[5.31358529390013...|
|  0.0|[5.31358529390013...|
|  0.0|[5.43434859603422...|
+-----+--------------------+
only showing top 5 rows



In [24]:
# Peek at the first five test rows.
test_data.show(5)

+-----+--------------------+
|label|     featured_scaled|
+-----+--------------------+
|  0.0|[6.03816510670469...|
|  0.0|[6.03816510670469...|
|  0.0|[6.27969171097288...|
|  0.0|[6.88350822164335...|
|  1.0|[5.9174018045706,...|
+-----+--------------------+
only showing top 5 rows



In [25]:
# Accumulator for [model name, formatted accuracy] pairs, filled in by the
# training cells below.
model_results = []
# Display names of the classifiers, indexed in the order they are trained.
model = [
    'DecisionTree',
    'RandomForest',
    'Naive Bayes',
]

In [26]:
# Decision tree: train on the training split, predict on the test split and
# record the test accuracy as a percentage string.
dtc = DecisionTreeClassifier(
    labelCol='label',
    featuresCol='featured_scaled',
)
dtcmodel = dtc.fit(train_data)
dtcpred = dtcmodel.transform(test_data)

# Accuracy = fraction of test rows whose `prediction` equals `label`.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
dtcacc = evaluator.evaluate(dtcpred)

model_results.append([model[0], '{:.2%}'.format(dtcacc)])
model_results

[['DecisionTree', '90.91%']]

In [27]:
# Random forest (10 trees): train on the training split, predict on the test
# split and record the test accuracy as a percentage string.
# The forest is a randomized model, so an explicit seed is passed (the same
# value used for the train/test split) to make the reported accuracy
# reproducible across kernel restarts.
rfc = RandomForestClassifier(
    labelCol='label',
    featuresCol='featured_scaled',
    numTrees=10,
    seed=12345,
)
rfcmodel = rfc.fit(train_data)
rfcpred = rfcmodel.transform(test_data)

# Accuracy = fraction of test rows whose `prediction` equals `label`.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
rfcacc = evaluator.evaluate(rfcpred)

model_results.append([model[1], '{:.2%}'.format(rfcacc)])
model_results

[['DecisionTree', '90.91%'], ['RandomForest', '90.91%']]

In [None]:
# Multinomial Naive Bayes (smoothing 1.0): train on the training split,
# predict on the test split and record the test accuracy as a percentage string.
nb = NaiveBayes(
    labelCol='label',
    featuresCol='featured_scaled',
    smoothing=1.0,
    modelType='multinomial',
)
nbmodel = nb.fit(train_data)
nbpred = nbmodel.transform(test_data)

# Accuracy = fraction of test rows whose `prediction` equals `label`.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
nbacc = evaluator.evaluate(nbpred)

# '{:.2%}' (two decimals) matches the other models' rows; the previous
# '{:2%}' meant "minimum width 2" and printed six decimals instead.
model_results.append([model[2], '{:.2%}'.format(nbacc)])
model_results

In [28]:
# Force a garbage-collection pass on the Python side; the displayed value is
# the number returned by gc.collect().
gc.collect()

145