In [0]:
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, udf, mean as _mean, stddev as _stddev, log, log10
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import lit

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [0]:
df = spark.read.orc('/savetable/train_select.orc')
dfTrain = df.withColumn("pha", col('pha').cast('Integer'))
dfTrain2 = dfTrain.drop('class_binary')

In [0]:
columns_to_scale = dfTrain2.schema.names
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(dfTrain2)
scaledData = scalerModel.transform(dfTrain2)

names = {x + "_scaled": x for x in columns_to_scale}
scaledData = scaledData.select([F.col(c).alias(names[c]) for c in names.keys()])

In [0]:
firstelement=F.udf(lambda v:float(v[0]),DoubleType())
scaledData2 = scaledData.select([firstelement(c).alias(c) for c in scaledData.columns])

(trainingData, validationData) = scaledData2.randomSplit([0.7, 0.3], seed = 100)

trainingData.cache()
validationData.cache()

In [0]:
trainingData = trainingData.withColumnRenamed("pha","label")
validationData = validationData.withColumnRenamed("pha","label")

In [0]:
dfTrain3 = trainingData
dfTrain3 = dfTrain3.withColumn("label", col('label').cast('Integer'))

#labelIndexer = StringIndexer(inputCol="pha", outputCol="label")

columnNames = dfTrain3.schema.names
columnNames.remove('label')
vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")

nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

pipeline = Pipeline(stages=[vecAssembler, nb])

model = pipeline.fit(dfTrain3)

In [0]:
predictions = model.transform(validationData)
display(predictions.select("label", "prediction", "probability"))

label,prediction,probability
0.0,0.0,"List(1, 2, List(), List(0.9999017628335346, 9.823716646536148E-5))"
0.0,0.0,"List(1, 2, List(), List(0.9998946117136336, 1.0538828636630964E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9998942232781372, 1.0577672186286518E-4))"
0.0,0.0,"List(1, 2, List(), List(0.999889751840509, 1.1024815949092868E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9999248504050456, 7.514959495442395E-5))"
0.0,0.0,"List(1, 2, List(), List(0.9998446528538144, 1.5534714618560219E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9998621812159019, 1.378187840982242E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9998968632369687, 1.0313676303142196E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9998669425733244, 1.3305742667560753E-4))"
0.0,0.0,"List(1, 2, List(), List(0.999702478431807, 2.9752156819299463E-4))"


In [0]:
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = BinaryClassificationMetrics(predictionAndLabels)
metricsMC = MulticlassMetrics(predictionAndLabels)

In [0]:
print("F1 score for pha = no :", metricsMC.fMeasure(label=0.0))
print("F1 score for pha = yes :", metricsMC.fMeasure(label=1.0))
print("Area under ROC =", metrics.areaUnderPR)

In [0]:
#print((df.count(), len(df.columns)))
#your_max_value = df.agg({"your-column": "max"}).collect()[0][0]
#trainingData.select(trainingData.columns[:5]).show(50)
#df.select('zip_code').collect()

In [0]:
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]).build()
cvEvaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName='areaUnderROC')
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=cvEvaluator, numFolds=25)
cvModel = cv.fit(trainingData)
cvPredictions = cvModel.transform(validationData)
cvEvaluator.evaluate(cvPredictions)

In [0]:
ROCscore = cvEvaluator.evaluate(cvPredictions)
print("Area under ROC =", ROCscore)

In [0]:
dfTest = spark.read.orc('/savetable/test_clean.orc')
dfTest = dfTest.withColumn("pha", col('pha').cast('Integer'))
dfTest2 = dfTest.select('neo', 'pha', 'H', 'diameter', 'albedo', 'diameter_sigma', 'epoch', 'e', 'i', 'om', 'w', 'ma', 'n', 'per', 'moid_ld', 'sigma_om', 'sigma_n', 'rms', 'pseudo_target')

In [0]:
columns_to_scale = dfTest2.schema.names
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(dfTest2)
scaledData = scalerModel.transform(dfTest2)

names = {x + "_scaled": x for x in columns_to_scale}
scaledData = scaledData.select([F.col(c).alias(names[c]) for c in names.keys()])

firstelement=F.udf(lambda v:float(v[0]),DoubleType())
dfTest2 = scaledData.select([firstelement(c).alias(c) for c in scaledData.columns])

dfTest2 = dfTest2.withColumnRenamed("pha","label")

In [0]:
predictions = model.transform(dfTest2)
display(predictions.select("label", "prediction", "probability"))

label,prediction,probability
0.0,0.0,"List(1, 2, List(), List(0.9995953677564227, 4.0463224357736E-4))"
0.0,0.0,"List(1, 2, List(), List(0.999444603509982, 5.553964900179986E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9997445391742388, 2.5546082576123245E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9995484346070171, 4.5156539298287147E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9994770655163154, 5.229344836846184E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9994635922651732, 5.364077348266439E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9995784893301931, 4.215106698069625E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9996095156625241, 3.9048433747580535E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9996617997269124, 3.3820027308769323E-4))"
0.0,0.0,"List(1, 2, List(), List(0.9996162292338145, 3.8377076618542237E-4))"


In [0]:
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = BinaryClassificationMetrics(predictionAndLabels)
metricsMC = MulticlassMetrics(predictionAndLabels)

In [0]:
print("F1 score for pha = no :", metricsMC.fMeasure(label=0.0))
print("F1 score for pha = yes :", metricsMC.fMeasure(label=1.0))
print("Area under ROC =", metrics.areaUnderPR)