# Training without cross-validation (baseline Random Forest)

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Random forests are stochastic; pin the seed explicitly so the accuracy
# reported below can be reproduced on a fresh kernel instead of relying on
# the library's default seed.
SEED = 42

# Pre-built train / test splits stored as parquet.
trainDF = spark.read.parquet('/data/user/hive/warehouse/ian/feature/trainDF_hw')
testDF = spark.read.parquet('/data/user/hive/warehouse/ian/feature/testDF_hw')

# Baseline: a single 30-tree forest with otherwise default hyperparameters.
rf = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    numTrees=30,
    seed=SEED,
)
model = rf.fit(trainDF)
predictions = model.transform(testDF)

# Accuracy on the test split; the bare last expression displays it.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(predictions)
accuracy

# Training a Random Forest with cross-validation and hyperparameter tuning

In [1]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
import numpy as np

# One seed for both the forest and the fold assignment so that the selected
# hyperparameters and the reported score are reproducible across runs.
SEED = 42

# Pre-built train / test splits stored as parquet.
trainDF = spark.read.parquet('/data/user/hive/warehouse/ian/feature/trainDF_hw')
testDF = spark.read.parquet('/data/user/hive/warehouse/ian/feature/testDF_hw')

# Column names are spelled out (they match the library defaults) so the cell
# documents which columns it reads.
rf = RandomForestClassifier(labelCol='label', featuresCol='features', seed=SEED)

# 8 depths x 5 tree counts = 40 candidate settings.
paramGrid_rf = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [3, 4, 5, 6, 7, 8, 9, 10])
    .addGrid(rf.numTrees, [10, 15, 20, 25, 30])
    .build()
)

# Model selection metric: 'f1' is the evaluator's default and is what the
# original bare MulticlassClassificationEvaluator() used; it is now explicit.
# NOTE(review): selection optimises f1 while the test score below is accuracy
# -- confirm this mismatch is intended.
cv_evaluator_rf = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='f1',
)

# 40 candidates x 5 folds = 200 fits: this cell is expensive to re-run.
crossval_rf = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid_rf,
    evaluator=cv_evaluator_rf,
    numFolds=5,
    seed=SEED,
)

cvModel_rf = crossval_rf.fit(trainDF)
best_model_rf = cvModel_rf.bestModel

# Test-set accuracy of the selected model; the bare last expression displays it.
my_eval_rf = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)
my_eval_rf.evaluate(best_model_rf.transform(testDF))

In [4]:
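# One-off persistence of the tuned model, left disabled. The scheduled
# prediction cell further down loads a model from this same warehouse path
# (there without the hdfs:// scheme), so it presumably has to be run once.
#best_model_rf.save('hdfs:///data/user/hive/warehouse/ian/model/mac_id_12_rf_cv_tuning_model')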

In [4]:
# Score the test split with the tuned model. Later cells read the
# 'prediction' column of this frame.
result = best_model_rf.transform(testDF)

In [11]:
# Lookup from the numeric class label back to the original id.
# 'id' and 'label' are input columns of testDF that transform() only passes
# through, so they are read from testDF directly. This yields the same rows
# as selecting them from `result`, but the cell stays re-runnable after a
# later cell narrows `result` to ('mac', 'prediction').
# NOTE(review): labels that never occur in testDF get no mapping row -- confirm
# the test split covers every class.
mapping_table = testDF.select('id', 'label').distinct()

In [10]:
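# One-off write of the (id, label) lookup table, left disabled. The scheduled
# prediction cell further down reads a 'mapping_table' parquet from this same
# warehouse path and renames its 'id' column itself, so the write belongs
# before the rename in the next cell.
#mapping_table.write.mode('overwrite').parquet('hdfs:///data/user/hive/warehouse/ian/feature/mapping_table',compression='gzip')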

In [12]:
# Rename the lookup's 'id' column to 'predict_id' so that, after joining on
# the predicted label, the looked-up id is named as the predicted id.
mapping_table = mapping_table.withColumnRenamed('id','predict_id')

In [13]:
# Keep only the device key and the predicted class label.
result = result.select('mac', 'prediction')

# Translate each predicted label into its id through the lookup table.
# Inner join: rows whose prediction has no lookup entry are dropped.
final_result = result.join(
    mapping_table,
    on=result['prediction'] == mapping_table['label'],
    how='inner',
)

In [14]:
# Reduce to unique (mac, predicted id) pairs.
final_result = final_result.select('mac', 'predict_id').dropDuplicates()

In [17]:
# Eyeball a small random sample of the predictions (unseeded, so the rows
# shown differ between runs).
final_result.sample(fraction=0.0001).show(truncate=False)

+------------+----------+
|mac         |predict_id|
+------------+----------+
|8C8590DA1CCB|49813     |
|DCA4CA216A52|49814     |
|00B362D13FDD|49809     |
|38CADA970EF3|49809     |
|B844D9DBFB9F|49808     |
|5433CB5B8F9E|39226     |
|649ABE637666|49807     |
|7014A6CA6D7C|49807     |
|D88F76C7CFEF|39226     |
|24F6773B8DBB|49814     |
|844167A3A34C|49807     |
|40261961E073|39147     |
|6C4D73481425|39147     |
|483B3895CD20|49809     |
|50A67FA7A883|49880     |
|68EF43BF12BA|39147     |
|48A195707F1E|49814     |
|DC2B2A2A969A|49809     |
|40831DA2E1CE|49813     |
|C0CCF84C4B98|49810     |
+------------+----------+
only showing top 20 rows



In [104]:
# Spot check: show the test-set row for a single mac so its true id can be
# compared with the predicted id.
testDF.where(testDF['mac'] == '8C8590DA1CCB').show()

+------------+-----+---+----+---+---+---+---+----+----+---+----+----+----+--------------------+-----+
|         mac|   id| f1|  f2| f3| f4| f5| f6|  f7|  f8| f9| f10| f11| f12|            features|label|
+------------+-----+---+----+---+---+---+---+----+----+---+----+----+----+--------------------+-----+
|8C8590DA1CCB|49814|8.0|12.0|8.0|5.0|9.0|0.0|13.0|10.0|1.0|12.0|12.0|11.0|[8.0,12.0,8.0,5.0...|  2.0|
+------------+-----+---+----+---+---+---+---+----+----+---+----+----+----+--------------------+-----+



# Scheduled (crontab) mac_id prediction

In [37]:
from pyspark.ml.classification import RandomForestClassificationModel

# Inputs: the persisted tuned model, the feature frame to score, and the
# label -> id lookup table (its 'id' column is exposed as 'predict_id').
model = RandomForestClassificationModel.load(
    '/data/user/hive/warehouse/ian/model/mac_id_12_rf_cv_tuning_model'
)
testDF = spark.read.parquet(
    '/data/user/hive/warehouse/ian/feature/testDF_mac_12'
)  # change PATH
mapping_table = (
    spark.read
    .parquet('/data/user/hive/warehouse/ian/feature/mapping_table')
    .withColumnRenamed('id', 'predict_id')
)

# Score, then keep only the device key and the predicted class label.
result = model.transform(testDF).select('mac', 'prediction')

# Map each predicted label to its id and reduce to unique (mac, id) pairs.
# Inner join: rows whose prediction has no lookup entry are dropped.
final_result = (
    result
    .join(mapping_table, result['prediction'] == mapping_table['label'])
    .select('mac', 'predict_id')
    .distinct()
)

# Preview a small random sample of the output.
final_result.sample(0.001).show(truncate=False)

+------------+----------+
|mac         |predict_id|
+------------+----------+
|9CE33F6AF89B|39226     |
|84A134385E87|49810     |
|5433CB525C01|39226     |
|70F087774375|49814     |
|68DBCA53E6A9|49810     |
|84A134093602|49810     |
|C83C85E5A5F1|39226     |
|404D7FE74BEE|49813     |
|E0ACCB434FF3|49880     |
|7C04D056B17E|49814     |
|24F67750A0A7|49814     |
|38C98692F03A|49880     |
|94F6D6723318|39147     |
|90B0EDB39F67|49810     |
|68AB1E19EAB0|49880     |
|9CE33F951EEA|39226     |
|7C50495799BF|49814     |
|DCA4CAD4D097|49814     |
|70EF00427A61|39147     |
|DC0C5C8D8A23|49814     |
+------------+----------+
only showing top 20 rows



In [38]:
# Number of distinct macs assigned to each predicted id.
(
    final_result
    .groupBy('predict_id')
    .count()
    .show(truncate=False)
)

+----------+------+
|predict_id|count |
+----------+------+
|49810     |332712|
|39890     |65343 |
|49809     |320635|
|49866     |60898 |
|49880     |84506 |
|39147     |225601|
|49807     |415662|
|39000     |24245 |
|49814     |402567|
|49813     |209372|
|39226     |425120|
|10679     |25368 |
|10950     |139825|
|10680     |126929|
|49806     |74259 |
|49808     |219770|
+----------+------+

