In [35]:
%pip install pyspark



In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [37]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
spark = SparkSession.builder.getOrCreate()

df_train = spark.read.option("inferSchema", "true").csv("drive/MyDrive/Dataset_python/Classification/BigFootTesting.csv", header=True)
df_test = spark.read.option("inferSchema", "true").csv("drive/MyDrive/Dataset_python/Classification/BigFootTesting.csv", header=True)

df_train.show(5)
df_test.show(5)

+---------+------+------+---------+-----------+
|fur_color|height|weight|eye_color|prehistoric|
+---------+------+------+---------+-----------+
|   calico|182 cm| 69 kg|    brown|      false|
|    white|173 cm|236 kg|    black|       true|
|      red|240 cm|205 kg|     blue|       true|
|   orange|195 cm| 67 kg|    black|      false|
|dark grey|224 cm|289 kg|    black|       true|
+---------+------+------+---------+-----------+
only showing top 5 rows

+---------+------+------+---------+-----------+
|fur_color|height|weight|eye_color|prehistoric|
+---------+------+------+---------+-----------+
|   calico|182 cm| 69 kg|    brown|      false|
|    white|173 cm|236 kg|    black|       true|
|      red|240 cm|205 kg|     blue|       true|
|   orange|195 cm| 67 kg|    black|      false|
|dark grey|224 cm|289 kg|    black|       true|
+---------+------+------+---------+-----------+
only showing top 5 rows



In [39]:
df_train = df_train.select("fur_color", "eye_color", "prehistoric")
df_test = df_test.select("fur_color", "eye_color", "prehistoric")

In [40]:
df_train = df_train.na.drop()
df_test = df_test.na.drop()

In [41]:
df_train = df_train.withColumn("fur_color",
                               when(df_train["fur_color"] == "white", 0).
                               when(df_train["fur_color"] == "grey", 1).
                               when(df_train["fur_color"] == "red", 2).
                               when(df_train["fur_color"] == "orange", 3).
                               when(df_train["fur_color"] == "calico", 4).
                               when(df_train["fur_color"] == "brown", 5).
                               when(df_train["fur_color"] == "black", 6).
                               when(df_train["fur_color"] == "dark grey", 7))

df_train = df_train.withColumn("eye_color", when(df_train["eye_color"] == "green", 0).
                                            when(df_train["eye_color"] == "blue", 1).
                                            when(df_train["eye_color"] == "black", 2).
                                            when(df_train["eye_color"] == "brown", 3))

df_train = df_train.withColumn("prehistoric", when(df_train["prehistoric"] == "true", 0).
                                                  when(df_train["prehistoric"] == "false", 1))

df_test = df_test.withColumn("fur_color",
                             when(df_test["fur_color"] == "white", 0).
                             when(df_test["fur_color"] == "grey", 1).
                             when(df_test["fur_color"] == "red", 2).
                             when(df_test["fur_color"] == "orange", 3).
                             when(df_test["fur_color"] == "calico", 4).
                             when(df_test["fur_color"] == "brown", 5).
                             when(df_test["fur_color"] == "black", 6).
                             when(df_test["fur_color"] == "dark grey", 7))

df_test = df_test.withColumn("eye_color", when(df_test["eye_color"] == "green", 0).
                                            when(df_test["eye_color"] == "blue", 1).
                                            when(df_test["eye_color"] == "black", 2).
                                            when(df_test["eye_color"] == "brown", 3))

df_test = df_test.withColumn("prehistoric", when(df_test["prehistoric"] == "true", 0).
                                                when(df_test["prehistoric"] == "false", 1))

In [42]:
cols = df_train.columns
cols.remove("prehistoric")
df_train = VectorAssembler(inputCols=cols, outputCol="Features").transform(df_train)

scaler = StandardScaler(inputCol="Features", outputCol="Scaled_Features")
df_train = scaler.fit(df_train).transform(df_train)

df_train.select("prehistoric", "Scaled_Features").show(10, False)

+-----------+---------------------------------------+
|prehistoric|Scaled_Features                        |
+-----------+---------------------------------------+
|1          |[1.8902446056433315,2.631407216175239] |
|0          |[0.0,1.754271477450159]                |
|0          |[0.9451223028216658,0.8771357387250796]|
|1          |[1.4176834542324985,1.754271477450159] |
|0          |[3.30792805987583,1.754271477450159]   |
|1          |[0.4725611514108329,0.0]               |
|0          |[3.30792805987583,1.754271477450159]   |
|1          |[2.3628057570541645,2.631407216175239] |
|1          |[2.3628057570541645,0.8771357387250796]|
|1          |[1.4176834542324985,1.754271477450159] |
+-----------+---------------------------------------+
only showing top 10 rows



In [43]:
cols = df_test.columns
cols.remove("prehistoric")
df_test = VectorAssembler(inputCols=cols, outputCol="Features").transform(df_test)

scaler = StandardScaler(inputCol="Features", outputCol="Scaled_Features")
df_test = scaler.fit(df_test).transform(df_test)

df_test.select("prehistoric", "Scaled_Features").show(10, False)

+-----------+---------------------------------------+
|prehistoric|Scaled_Features                        |
+-----------+---------------------------------------+
|1          |[1.8902446056433315,2.631407216175239] |
|0          |[0.0,1.754271477450159]                |
|0          |[0.9451223028216658,0.8771357387250796]|
|1          |[1.4176834542324985,1.754271477450159] |
|0          |[3.30792805987583,1.754271477450159]   |
|1          |[0.4725611514108329,0.0]               |
|0          |[3.30792805987583,1.754271477450159]   |
|1          |[2.3628057570541645,2.631407216175239] |
|1          |[2.3628057570541645,0.8771357387250796]|
|1          |[1.4176834542324985,1.754271477450159] |
+-----------+---------------------------------------+
only showing top 10 rows



In [46]:
#model = LogisticRegression(featuresCol="Scaled_Features", labelCol="prehistoric", maxIter=10).transform(df)

model = LogisticRegression(featuresCol="Scaled_Features", labelCol="prehistoric", maxIter=10)
model = model.fit(df_train)

In [47]:
prediction = model.transform(df_test)

prediction.select("Scaled_features", "prehistoric", "Prediction").show(15, False)

# Accuracy Logistic Regression
evaluator = BinaryClassificationEvaluator(labelCol = "prehistoric")
print("Accuracy: {}%".format(evaluator.evaluate(prediction) * 100))

+---------------------------------------+-----------+----------+
|Scaled_features                        |prehistoric|Prediction|
+---------------------------------------+-----------+----------+
|[1.8902446056433315,2.631407216175239] |1          |0.0       |
|[0.0,1.754271477450159]                |0          |0.0       |
|[0.9451223028216658,0.8771357387250796]|0          |0.0       |
|[1.4176834542324985,1.754271477450159] |1          |0.0       |
|[3.30792805987583,1.754271477450159]   |0          |1.0       |
|[0.4725611514108329,0.0]               |1          |0.0       |
|[3.30792805987583,1.754271477450159]   |0          |1.0       |
|[2.3628057570541645,2.631407216175239] |1          |0.0       |
|[2.3628057570541645,0.8771357387250796]|1          |0.0       |
|[1.4176834542324985,1.754271477450159] |1          |0.0       |
|[1.8902446056433315,0.0]               |1          |0.0       |
|[3.30792805987583,0.8771357387250796]  |1          |1.0       |
|[1.8902446056433315,0.87