In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
spark=SparkSession.builder.getOrCreate()

In [3]:
#Load dataset
train_data = spark.read.option("inferSchema","true").csv("Classification/Planet_Training.csv",header=True)
#drop NA
train_data = train_data.select("Temperature","Water","Atmosphere Color","Habitable")
train_data = train_data.na.drop()

In [4]:
#load dataset
test_data = spark.read.option("inferSchema","true").csv("Classification/Planet_Testing.csv",header=True)
#drop NA
test_data  = test_data.select("Temperature","Water","Atmosphere Color","Habitable")
test_data = test_data.na.drop()

In [5]:
train_data.show()

+-----------+------+----------------+---------+
|Temperature| Water|Atmosphere Color|Habitable|
+-----------+------+----------------+---------+
|     323488|Medium|          Yellow|        1|
|     319279|   Low|          Yellow|        1|
|     315375|   Low|          Yellow|        1|
|     302312|Medium|          Yellow|        1|
|     329687|   Low|          Yellow|        1|
|     265746|  High|             Red|        0|
|     305214|  High|          Yellow|        1|
|     299936|  High|          Yellow|        0|
|     269577|Medium|             Red|        1|
|     303631|  High|             Red|        0|
|     290051|  High|             Red|        0|
|     306122|   Low|          Yellow|        1|
|     300635|   Low|          Yellow|        1|
|     312152|  High|            Blue|        0|
|     265942|Medium|            Blue|        0|
|     307368|  High|             Red|        0|
|     276274|Medium|          Yellow|        1|
|     308531|Medium|          Yellow|   

In [6]:
def transform_data(data):
    data = data.withColumn("Water", when(data["Water"]=="High",2).when(data["Water"]=="Medium",1).otherwise(0))
    data = data.withColumn("Atmosphere Color", when(data["Atmosphere Color"]=="Red",0).when(data["Atmosphere Color"]=="Blue",1).otherwise(2))
    
    cols = data.columns
    cols.remove("Habitable")
    
    data = VectorAssembler(inputCols = cols,outputCol = "Features").transform(data)
    

    scaler = StandardScaler(inputCol = "Features",outputCol = "Scaled_Features")
    data = scaler.fit(data).transform(data)
    
    return data

In [7]:
test_data = transform_data(test_data)
test_data.show()

+-----------+-----+----------------+---------+------------------+--------------------+
|Temperature|Water|Atmosphere Color|Habitable|          Features|     Scaled_Features|
+-----------+-----+----------------+---------+------------------+--------------------+
|     325145|    2|               2|        1|[325145.0,2.0,2.0]|[16.6313339613367...|
|     269079|    1|               0|        0|[269079.0,1.0,0.0]|[13.7635292284443...|
|     302996|    2|               2|        1|[302996.0,2.0,2.0]|[15.4984012208374...|
|     312604|    2|               2|        1|[312604.0,2.0,2.0]|[15.9898553619146...|
|     280875|    0|               2|        1|[280875.0,0.0,2.0]|[14.3669006947376...|
|     306384|    2|               0|        0|[306384.0,2.0,0.0]|[15.6716991631740...|
|     303007|    2|               2|        1|[303007.0,2.0,2.0]|[15.4989638764944...|
|     297965|    2|               0|        0|[297965.0,2.0,0.0]|[15.2410629835603...|
|     290305|    2|               1|       

In [10]:
train_data=transform_data(train_data)

In [11]:
model = LogisticRegression(featuresCol = "Scaled_Features", labelCol = "Habitable", maxIter=10).fit(train_data)

prediction = model.transform(test_data)

In [12]:
eval = BinaryClassificationEvaluator(labelCol="Habitable")

acc=eval.evaluate(prediction)
print("Accuracy",acc*100)

Accuracy 91.71043337232418
