In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression

In [4]:
spark = SparkSession.builder.appName("LR_Solution").getOrCreate()

21/08/01 14:36:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/08/01 14:36:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
spark

In [6]:
zoo_df = spark.read.csv("zoo.csv", inferSchema=True, header=True)

In [7]:
zoo_df.show(5)

+----------+----+--------+----+----+--------+-------+--------+-------+--------+--------+--------+----+----+----+--------+-------+----+
|AnimalName|Hair|Feathers|Eggs|Milk|Airborne|Aquatic|Predator|Toothed|Backbone|Breathes|Venomous|Fins|Legs|Tail|Domestic|Catsize|Type|
+----------+----+--------+----+----+--------+-------+--------+-------+--------+--------+--------+----+----+----+--------+-------+----+
|  aardvark|   1|       0|   0|   1|       0|      0|       1|      1|       1|       1|       0|   0|   4|   0|       0|      1|   1|
|  antelope|   1|       0|   0|   1|       0|      0|       0|      1|       1|       1|       0|   0|   4|   1|       0|      1|   1|
|      bass|   0|       0|   1|   0|       0|      1|       1|      1|       1|       0|       0|   1|   0|   1|       0|      0|   4|
|      bear|   1|       0|   0|   1|       0|      0|       1|      1|       1|       1|       0|   0|   4|   0|       0|      1|   1|
|      boar|   1|       0|   0|   1|       0|      0|  

In [8]:
zoo_df = zoo_df.withColumn("IsMamal", expr("CASE WHEN Type = 1 THEN 1 ELSE 0 END"))

In [9]:
zoo_df.show(5)

+----------+----+--------+----+----+--------+-------+--------+-------+--------+--------+--------+----+----+----+--------+-------+----+-------+
|AnimalName|Hair|Feathers|Eggs|Milk|Airborne|Aquatic|Predator|Toothed|Backbone|Breathes|Venomous|Fins|Legs|Tail|Domestic|Catsize|Type|IsMamal|
+----------+----+--------+----+----+--------+-------+--------+-------+--------+--------+--------+----+----+----+--------+-------+----+-------+
|  aardvark|   1|       0|   0|   1|       0|      0|       1|      1|       1|       1|       0|   0|   4|   0|       0|      1|   1|      1|
|  antelope|   1|       0|   0|   1|       0|      0|       0|      1|       1|       1|       0|   0|   4|   1|       0|      1|   1|      1|
|      bass|   0|       0|   1|   0|       0|      1|       1|      1|       1|       0|       0|   1|   0|   1|       0|      0|   4|      0|
|      bear|   1|       0|   0|   1|       0|      0|       1|      1|       1|       1|       0|   0|   4|   0|       0|      1|   1|      1|

In [30]:
features_vector = RFormula(formula="IsMamal ~ Hair + Feathers + Eggs + Milk + Airborne + Aquatic + Predator + Toothed + Backbone + Breathes + Venomous + Fins + Legs + Tail + Domestic + Catsize")
# features_vector = RFormula(formula="IsMamal ~ .)

In [31]:
preprocessed_data = features_vector.fit(zoo_df).transform(zoo_df)

In [32]:
preprocessed_data.select("features", "label").show(5, truncate=False)

+----------------------------------------------------------------------------------------------+-----+
|features                                                                                      |label|
+----------------------------------------------------------------------------------------------+-----+
|(116,[1,99,102,105,106,107,108,111,114,115],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0])        |1.0  |
|(116,[2,99,102,106,107,108,111,112,114,115],[1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0])        |1.0  |
|(116,[3,101,104,105,106,107,110,112,115],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0])               |0.0  |
|(116,[4,99,102,105,106,107,108,111,114,115],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0])        |1.0  |
|(116,[5,99,102,105,106,107,108,111,112,114,115],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0])|1.0  |
+----------------------------------------------------------------------------------------------+-----+
only showing top 5 rows



In [33]:
train_data, test_data = preprocessed_data.randomSplit([0.7, 0.3])

In [34]:
lr = LogisticRegression(labelCol="label", featuresCol="features")

In [35]:
fitted_lr = lr.fit(train_data)

In [36]:
result = fitted_lr.transform(preprocessed_data)

In [37]:
result.select("AnimalName", "label", "prediction").filter("AnimalName IN ('goat', 'hamster', 'clam', 'crayfish')").show(100)

+----------+-----+----------+
|AnimalName|label|prediction|
+----------+-----+----------+
|      clam|  0.0|       0.0|
|  crayfish|  0.0|       0.0|
|      goat|  1.0|       1.0|
|   hamster|  1.0|       1.0|
+----------+-----+----------+



In [41]:
truePositive = result.filter("prediction = 1.0 AND label = 1.0").count()
print("True Postive: " + str(truePositive))

falsePositive = result.filter("prediction = 1.0 AND label = 0.0").count()
print("False Postive: " + str(falsePositive))

print("")

trueNegative = result.filter("prediction = 0.0 AND label = 0.0").count()
print("True Negative: " + str(trueNegative))

falseNegative = result.filter("prediction = 0.0 AND label = 1.0").count()
print("False Negative: " + str(falseNegative))

True Postive: 41
False Postive: 0

True Negative: 60
False Negative: 0


In [42]:
# from pyspark.ml.linalg import Vectors
# from pyspark.ml.feature import VectorAssembler

# dataset = spark.createDataFrame(
#     [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
#     ["id", "hour", "mobile", "userFeatures", "clicked"])

# assembler = VectorAssembler(
#     inputCols=["hour", "mobile", "userFeatures"],
#     outputCol="features")

# output = assembler.transform(dataset)
# print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
# output.select("features", "clicked").show(truncate=False)