### 分类

In [1]:
from pyspark.sql import SparkSession

# Get (or lazily create) the SparkSession used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("ClassificationExample")
    .getOrCreate()
)

#### 1. SVM（Support Vector Machine）

In [2]:
from pyspark.ml.classification import LinearSVC

In [3]:
# Load the sample dataset through Spark's "libsvm" data source.
training = (
    spark.read
    .format("libsvm")
    .load("../../data/sample_libsvm_data.txt")
)

In [4]:
# Configure the linear SVM estimator; no training happens until fit() is called.
lsvc = LinearSVC(
    regParam=0.1,
    maxIter=10,
)

In [5]:
# Train the SVM on the complete dataset (this section does no train/test split).
lsvcModel = lsvc.fit(dataset=training)

In [6]:
# Show the learned parameters of the fitted model.
print("Coefficients: {}".format(str(lsvcModel.coefficients)))
print("Intercept: {}".format(str(lsvcModel.intercept)))

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0005170630317473441,-0.00011722886549737355,-8.882754836918941e-05,8.522360710187458e-05,0.0,0.0,-1.3436361263314255e-05,0.0003729569801338088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0008888949552633654,0.0002986405976181267,0.00037933788161931595,-0.00017623288982540798,0.0,1.502848926974781e-06,1.805604114494666e-06,1.8028763260398567e-06,-3.3843713506473667e-06,-4.041580184807505e-06,2.09650177270151e-06,8.53611164298949e-05,0.00022064177429604456,0.00021677599940575439,-0.0005472401396558765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0009214155024071468,0.000

#### 2. 决策树

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [8]:
# Load the sample dataset through Spark's "libsvm" data source.
data = (
    spark.read
    .format("libsvm")
    .load("../../data/sample_libsvm_data.txt")
)

In [9]:
# Fit a label indexer on the full `data` frame: "label" -> "indexedLabel".
labelIndexer = StringIndexer(
    outputCol="indexedLabel",
    inputCol="label",
).fit(data)

In [11]:
# Fit a feature indexer on the full `data` frame: "features" -> "indexedFeatures".
# maxCategories=4 is the threshold VectorIndexer uses to decide which
# features are indexed as categorical.
featureIndexer = VectorIndexer(
    maxCategories=4,
    inputCol="features",
    outputCol="indexedFeatures",
).fit(data)

In [12]:
# Split the rows roughly 70/30 into training and test sets. The fixed seed
# keeps the split, and therefore the metrics computed from it, reproducible
# across kernel restarts instead of changing on every run.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [13]:
# Decision tree estimator that reads the indexed label and feature columns.
dt = DecisionTreeClassifier(
    featuresCol="indexedFeatures",
    labelCol="indexedLabel",
)

In [14]:
# Chain the two fitted indexers and the tree estimator, in that order.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        dt,
    ]
)

In [15]:
# Fit the whole pipeline on the training split only.
model = pipeline.fit(dataset=trainingData)

In [16]:
# Run the fitted pipeline over the held-out split to obtain predictions.
predictions = model.transform(dataset=testData)

In [17]:
# Preview the first five predictions next to the indexed label and raw features.
(
    predictions
    .select("prediction", "indexedLabel", "features")
    .show(n=5)
)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|(692,[95,96,97,12...|
|       1.0|         1.0|(692,[124,125,126...|
|       1.0|         1.0|(692,[124,125,126...|
|       1.0|         1.0|(692,[125,126,127...|
|       1.0|         1.0|(692,[126,127,128...|
+----------+------------+--------------------+
only showing top 5 rows



In [18]:
# Score the predictions by accuracy: indexed label vs. predicted index.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
# Report the complement of accuracy as the test error.
print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0 


In [19]:
# Index 2 of the fitted pipeline is the stage produced by `dt`
# (stages 0 and 1 are the label and feature indexers).
treeModel = model.stages[2]
# Printing the model emits its one-line summary.
print(str(treeModel))

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_84b887767f7b, depth=2, numNodes=5, numClasses=2, numFeatures=692


#### 3. 随机森林

In [20]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [21]:
# Reload the sample dataset through Spark's "libsvm" data source for this section.
data = (
    spark.read
    .format("libsvm")
    .load("../../data/sample_libsvm_data.txt")
)

In [23]:
# Fit a label indexer on the full `data` frame: "label" -> "indexedLabel".
labelIndexer = StringIndexer(
    outputCol="indexedLabel",
    inputCol="label",
).fit(data)

In [24]:
# Fit a feature indexer on the full `data` frame: "features" -> "indexedFeatures".
# maxCategories=4 is the threshold VectorIndexer uses to decide which
# features are indexed as categorical.
featureIndexer = VectorIndexer(
    maxCategories=4,
    inputCol="features",
    outputCol="indexedFeatures",
).fit(data)

In [25]:
# Split the rows roughly 70/30 into training and test sets. The fixed seed
# keeps the split, and therefore the metrics computed from it, reproducible
# across kernel restarts instead of changing on every run.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [26]:
# Random forest estimator with 10 trees, reading the indexed label and feature columns.
rf = RandomForestClassifier(
    numTrees=10,
    featuresCol="indexedFeatures",
    labelCol="indexedLabel",
)

In [27]:
# Map predicted indices back to the original label values, using the
# labels stored on the fitted label indexer.
labelConverter = IndexToString(
    labels=labelIndexer.labels,
    inputCol="prediction",
    outputCol="predictedLabel",
)

In [28]:
# Chain indexers, the forest estimator, and the index-to-label converter, in that order.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        rf,
        labelConverter,
    ]
)

In [29]:
# Fit the whole pipeline on the training split only.
model = pipeline.fit(dataset=trainingData)

In [30]:
# Run the fitted pipeline over the held-out split to obtain predictions.
predictions = model.transform(dataset=testData)

In [31]:
# Preview the first five rows: converted prediction, original label, raw features.
(
    predictions
    .select("predictedLabel", "label", "features")
    .show(n=5)
)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(692,[122,123,124...|
|           0.0|  0.0|(692,[123,124,125...|
|           0.0|  0.0|(692,[124,125,126...|
|           0.0|  0.0|(692,[126,127,128...|
|           0.0|  0.0|(692,[126,127,128...|
+--------------+-----+--------------------+
only showing top 5 rows



In [32]:
# Score the predictions by accuracy: indexed label vs. predicted index.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
# Report the complement of accuracy as the test error.
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0


In [33]:
# Index 2 of the fitted pipeline is the stage produced by `rf`
# (stages 0 and 1 are the indexers; stage 3 is the label converter).
rfModel = model.stages[2]
# Printing the model emits its one-line summary.
print(str(rfModel))

RandomForestClassificationModel: uid=RandomForestClassifier_9918e04f7860, numTrees=10, numClasses=2, numFeatures=692
