# This is my first attempt to create a logistic regression model using PySpark ☺️

In [104]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,mean,mode,round,isnan
import pandas as pd
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
# Obtain the SparkSession for this notebook (getOrCreate reuses an already
# running session if there is one) and keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .getOrCreate()
)
sc = spark.sparkContext

In [102]:
# Load the water-potability CSV. The first line is treated as the header and
# column types are inferred from the data.
# NOTE: despite the `rdd` prefix, this is a DataFrame, not an RDD.
rdd0 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/water_potability.csv')
)

In [108]:
# Print the inferred column names, types and nullability as a tree.
rdd0.printSchema()

root
 |-- ph: double (nullable = true)
 |-- Hardness: double (nullable = true)
 |-- Solids: double (nullable = true)
 |-- Chloramines: double (nullable = true)
 |-- Sulfate: double (nullable = true)
 |-- Conductivity: double (nullable = true)
 |-- Organic_carbon: double (nullable = true)
 |-- Trihalomethanes: double (nullable = true)
 |-- Turbidity: double (nullable = true)
 |-- Potability: integer (nullable = true)



In [107]:
from pyspark.sql import functions as F

# Count the null values of every column in ONE aggregation pass instead of
# launching a separate filter-and-count job per column.
# `when(...)` without `otherwise` yields NULL for non-matching rows, and
# `count` ignores NULLs, so each aggregate is exactly the number of rows in
# which that column is null.
null_counts = rdd0.select(
    [F.count(F.when(F.col(name).isNull(), 1)).alias(name) for name in rdd0.columns]
).first()

for i in rdd0.columns:
  print(f"Column {i}: {null_counts[i]}")

Column ph: 491
Column Hardness: 0
Column Solids: 0
Column Chloramines: 0
Column Sulfate: 781
Column Conductivity: 0
Column Organic_carbon: 0
Column Trihalomethanes: 162
Column Turbidity: 0
Column Potability: 0


In [119]:
# Mean of each column that contains nulls, used as its imputation value.
# `mean` already skips nulls, so no prior `na.drop` is needed, and computing
# the three aggregates in a single `select` scans the data once instead of
# three times. The resulting single Row unpacks positionally.
# NOTE(review): these means are taken over the full dataset, before the
# train/test split performed further down, so test rows influence the
# imputation values. Consider computing them on the training split only.
ph, sulfate, trihalomethanes = rdd0.select(
    mean(col('ph')),
    mean(col('Sulfate')),
    mean(col('Trihalomethanes')),
).first()

In [120]:
# Replace the nulls in each listed column with that column's mean; columns
# not named in the mapping are left unchanged.
rdd1 = rdd0.fillna({
    'ph': ph,
    'Sulfate': sulfate,
    'Trihalomethanes': trihalomethanes,
})

In [125]:
# Random 80/20 train/test split with a fixed seed.
train, test = rdd1.randomSplit(weights=[0.8, 0.2], seed=42)

In [126]:
from pyspark.ml.feature import VectorAssembler

# Assemble every column except the label into a single 'features' vector.
# The label is excluded by name rather than by position (`columns[:-1]`), so
# a change in column order cannot silently leak 'Potability' into the
# features or drop a real feature.
feature_cols = [name for name in rdd1.columns if name != 'Potability']
vas = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [128]:
# Add the assembled 'features' vector column to the training split.
train_vas = vas.transform(train)

In [129]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression that reads the assembled 'features' vector, learns the
# 'Potability' label and writes its class decision to 'prediction'.
lr = LogisticRegression(
    labelCol='Potability',
    featuresCol='features',
    predictionCol='prediction',
)

# Fit on the assembled training split.
model = lr.fit(train_vas)

In [130]:
# Apply the same assembler to the held-out split so it gets a 'features' column.
test_vas = vas.transform(test)

In [131]:
# Score the held-out split with the fitted model; the result carries the
# 'prediction' column configured on the estimator.
pred = model.transform(test_vas)

In [139]:
# Eyeball the true label next to the predicted class for the first 20 rows.
pred.select(['Potability', 'prediction']).show(n=20)

+----------+----------+
|Potability|prediction|
+----------+----------+
|         0|       0.0|
|         0|       0.0|
|         0|       0.0|
|         0|       0.0|
|         1|       0.0|
|         0|       0.0|
|         0|       0.0|
|         1|       0.0|
|         0|       0.0|
|         1|       0.0|
|         0|       0.0|
|         0|       0.0|
|         0|       0.0|
|         0|       0.0|
|         0|       0.0|
|         0|       0.0|
|         1|       0.0|
|         0|       0.0|
|         0|       0.0|
|         0|       0.0|
+----------+----------+
only showing top 20 rows

