# grid_search & cross_validation

In [10]:
from pyspark.sql.functions import *

d1 = spark.read.parquet('/data/user/hive/warehouse/ian/feature/unrecognized/*')
d2 = spark.read.parquet('/data/user/hive/warehouse/ian/feature/recognized/*')
d2 = d2.sample(0.20,seed=123)

df = d1.union(d2)

from pyspark.ml.feature import MinMaxScaler,StandardScaler,VectorAssembler,StringIndexer

vec = VectorAssembler(inputCols=['f1','f2','f3','f4','f5','f6','f7'],outputCol='features')
vec_ = vec.transform(df)

In [14]:
trainDF, testDF = vec_.randomSplit([0.8,0.2],seed=0)

In [25]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
import numpy as np

rf = RandomForestClassifier()

paramGrid_rf = ParamGridBuilder()\
       .addGrid(rf.maxDepth,[3,4,5,6,7,8,9,10])\
       .addGrid(rf.numTrees,[10,15,20,25,30])\
       .build()

crossval_rf = CrossValidator(estimator=rf,
                            estimatorParamMaps=paramGrid_rf,
                            evaluator=BinaryClassificationEvaluator(),
                            numFolds=5)

cvModel_rf = crossval_rf.fit(trainDF)

AttributeError: 'RandomForestClassificationModel' object has no attribute 'summary'

In [32]:
best_model_rf = cvModel_rf.bestModel

In [41]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
my_eval_rf = BinaryClassificationEvaluator(
    rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
my_eval_rf.evaluate(best_model_rf.transform(testDF))

0.9450373479023623

# Prediction

In [65]:
from pyspark.sql.functions import *
d = spark.read.csv('/user/maxnet/database/sig.db/data_visual_unknown/*',sep='\x01')
d1 = d.select('_c2').withColumnRenamed('_c2','val').distinct().dropna()

d1 = d1.withColumn('f1',length(col('val')))

d1 = d1.withColumn('f2',when(d1.val.startswith('A')|d1.val.startswith('B')|d1.val.startswith('C')\
                             |d1.val.startswith('D')|d1.val.startswith('E')|d1.val.startswith('F')\
                             |d1.val.startswith('G')|d1.val.startswith('H')|d1.val.startswith('I')\
                             |d1.val.startswith('J')|d1.val.startswith('K')|d1.val.startswith('L')\
                             |d1.val.startswith('M')|d1.val.startswith('N')|d1.val.startswith('O')\
                             |d1.val.startswith('P')|d1.val.startswith('Q')|d1.val.startswith('R')\
                             |d1.val.startswith('S')|d1.val.startswith('T')|d1.val.startswith('U')\
                             |d1.val.startswith('V')|d1.val.startswith('W')|d1.val.startswith('X')\
                             |d1.val.startswith('Y')|d1.val.startswith('Z'),1).otherwise(0))

import re

num_regex = re.compile(r'[0-9]') #数字
xiaoxiezimu_regex = re.compile(r'[a-z]')#小写字母
daxiezimu_regex = re.compile(r'[A-Z]') #大写字母 
#hanzi_regex = re.compile(r'[\u4E00-\u9FA5]')#汉字

from pyspark.sql.functions import udf
num = udf(lambda x: len(num_regex.findall(x)))
xiaoxie = udf(lambda x: len(xiaoxiezimu_regex.findall(x)))
daxie = udf(lambda x: len(daxiezimu_regex.findall(x)))

d1 = d1.withColumn('f3',num('val'))
d1 = d1.withColumn('f4',xiaoxie('val'))
d1 = d1.withColumn('f5',daxie('val'))

# 特征字符串长度 f1
# 首字母是否大写 f2
# 数字字符数量   f3
# 小写字母数量   f4
# 大写字母数量   f5
# 特殊符号-_:的数量 f6
# 空格的数量 f7


# 统计下划线个数
def xiahuaxian_count(s):
    xiahuaxian_counts=0
    for c in s:
        xiahuaxian_split_list = c.split('_')
        xiahuaxian_counts += len(xiahuaxian_split_list) - 1
    return xiahuaxian_counts

# 统计中划线个数
def zhonghuaxian_count(s):
    zhonghuaxian_counts=0
    for c in s:
        zhonghuaxian_split_list = c.split('-')
        zhonghuaxian_counts += len(zhonghuaxian_split_list) - 1
    return zhonghuaxian_counts

# 统计冒号个数
def maohao_count(s):
    maohao_counts=0
    for c in s:
        maohao_split_list = c.split(':')
        maohao_counts += len(maohao_split_list) - 1
    return maohao_counts

def teshu_count(s):
    teshu_counts=0
    a_counts=0
    b_counts=0
    c_counts=0
    for c in s:
        a_split_list = c.split('_')
        a_counts += len(a_split_list) - 1
        
        b_split_list = c.split('-')
        b_counts += len(b_split_list) - 1
        
        c_split_list = c.split(':')
        c_counts += len(c_split_list) - 1
        
        teshu_counts = a_counts + b_counts + c_counts
    return teshu_counts
        

# 统计空格个数
def space_count(s):
    space_counts=0
    for c in s:
        space_split_list = c.split(' ')
        space_counts += len(space_split_list) - 1
    return space_counts

teshu = udf(lambda x: teshu_count(x))
kongge = udf(lambda x: space_count(x))


d1 = d1.withColumn('f6',teshu('val'))
d1 = d1.withColumn('f7',kongge('val'))


d1 = d1.select('val',col('f1').cast('float'),col('f2').cast('float'),col('f3').cast('float'),col('f4').cast('float'),col('f5').cast('float'),col('f6').cast('float'),col('f7').cast('float'))

d1.show(truncate=False)

+-------------------------------+----+---+----+----+----+---+---+
|val                            |f1  |f2 |f3  |f4  |f5  |f6 |f7 |
+-------------------------------+----+---+----+----+----+---+---+
|uplus-haier-0601-29a4-v5-sapkz |30.0|0.0|8.0 |17.0|0.0 |5.0|0.0|
|BRW0C96E67F9881                |15.0|1.0|9.0 |0.0 |6.0 |0.0|0.0|
|05af9271958ff346b26a47b3a5ef7a7|31.0|0.0|20.0|11.0|0.0 |0.0|0.0|
|054297ed1620ee2506981fcc2aee7da|31.0|0.0|19.0|12.0|0.0 |0.0|0.0|
|ZTE24:58:6e:84:98:38           |20.0|1.0|11.0|1.0 |3.0 |5.0|0.0|
|05bfa2a9116d3f3c04535b4cedff9e4|31.0|0.0|17.0|14.0|0.0 |0.0|0.0|
|GM-T9+                         |6.0 |1.0|1.0 |0.0 |3.0 |1.0|0.0|
|N6MK2                          |5.0 |1.0|2.0 |0.0 |3.0 |0.0|0.0|
|E55CA-TC3551A-M                |15.0|1.0|6.0 |0.0 |7.0 |2.0|0.0|
|0ae6c6f54f4a69c2acfe3588223eb19|31.0|0.0|18.0|13.0|0.0 |0.0|0.0|
|ZTE8c:68:c8:b1:d2:78           |20.0|1.0|8.0 |4.0 |3.0 |5.0|0.0|
|ZTE8c:68:c8:bf:94:78           |20.0|1.0|8.0 |4.0 |3.0 |5.0|0.0|
|AUTOBVT-I

In [66]:
d1.count()

5804

In [67]:
d1.dtypes

[('val', 'string'),
 ('f1', 'float'),
 ('f2', 'float'),
 ('f3', 'float'),
 ('f4', 'float'),
 ('f5', 'float'),
 ('f6', 'float'),
 ('f7', 'float')]

In [68]:
from pyspark.ml.feature import MinMaxScaler,StandardScaler,VectorAssembler,StringIndexer

unknow = VectorAssembler(inputCols=['f1','f2','f3','f4','f5','f6','f7'],outputCol='features')
unknow = vec.transform(d1)

In [69]:
t = cvModel_rf.transform(unknow)

In [77]:
t.dtypes

[('val', 'string'),
 ('f1', 'float'),
 ('f2', 'float'),
 ('f3', 'float'),
 ('f4', 'float'),
 ('f5', 'float'),
 ('f6', 'float'),
 ('f7', 'float'),
 ('features', 'vector'),
 ('rawPrediction', 'vector'),
 ('probability', 'vector'),
 ('prediction', 'double')]

In [72]:
from pyspark.sql.types import DoubleType
unlist = udf(lambda x: float(list(x)[1]), DoubleType())

In [73]:
pred = t.select('val',unlist('probability').alias('probability'),'prediction').filter(t.prediction == 1).filter(t.val != 'unknown').filter(t.val != 'empty').filter(t.val != 'NONE').filter(t.val != 'none').filter(t.val != 'N/A').filter(t.val != 'normal').filter(t.val != 'anonymous').filter(t.val != 'null')

In [93]:
pred.show(10,truncate=False)

+-----------------------+------------------+----------+
|val                    |probability       |prediction|
+-----------------------+------------------+----------+
|AUTOBVT-IV5PHI2        |0.9152087674233372|1.0       |
|DEEP-2019OFISTD        |0.8221985482569627|1.0       |
|IDEA-20170821AZ        |0.6022177298000291|1.0       |
|gaoxinqu-0074.9c5b.e7fa|0.9989348193771349|1.0       |
|DEEP-2019QLTSAS        |0.8221985482569627|1.0       |
|AUTOBVT-5J3GA4H        |0.9148265314337913|1.0       |
|XZ-201705231143        |0.530341964118253 |1.0       |
|AUTOBVT-61AGAV1        |0.9148265314337913|1.0       |
|luke-HP-15-Notebook-PC |0.7408531144522307|1.0       |
|XCB-20160304VKM        |0.6022177298000291|1.0       |
+-----------------------+------------------+----------+
only showing top 10 rows



In [87]:
t.select('val',unlist('probability').alias('probability'),'prediction').groupBy('prediction').count().show(truncate=False)

+----------+-----+
|prediction|count|
+----------+-----+
|0.0       |5508 |
|1.0       |296  |
+----------+-----+



In [96]:
#pred_hdfs = pred.repartition(1)

#pred_hdfs.write.mode('overwrite').parquet('hdfs:///data/user/hive/warehouse/ian/prediction/p2',compression='gzip')

In [94]:
# cvModel_rf.save('hdfs:///data/user/hive/warehouse/ian/model/model4')
# m4 = cvModel_rf.load('hdfs:///data/user/hive/warehouse/ian/model/model4')

# demo

In [None]:
# 模型训练
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

# 模型预测
prediction = lrModel.transform(testData)

# ROC score
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(prediction)


In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
grid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
             .build())
evaluator = BinaryClassificationEvaluator()
# Create 10-fold CrossValidator
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=10)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate best model
#evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# 构建模型
rf = RandomForestClassifier(numTrees=3, maxDepth=10, maxBins=30, labelCol="label", seed=123)
# 十折交叉验证
grid = (ParamGridBuilder().addGrid(rf.numTrees, [1, 3, 5])
                          .addGrid(rf.maxDepth, [3, 5, 7, 10])
                          .addGrid(rf.maxBins, [20, 30, 40])
                          .build())
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=rf,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=10)
cvModel_rf = cv.fit(trainingData)

# 模型预测 ROC
predictions = cvModel_rf.transform(testData)
evaluator.evaluate(predictions)
