In [1]:
# import findspark
# findspark.init()
# pyspark必须创建SparkSession才能像类似于pandas一样操作数据集
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('adult').getOrCreate()

In [2]:
# 文件带有表头
df = spark.read.csv('adult.csv', inferSchema = True, header=True)
df.show(3)

+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-----+------------+------------+--------------+--------------+------+
|age|        workclass|fnlwgt| education|education-num|     marital-status|        occupation|  relationship|  race|  sex|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-----+------------+------------+--------------+--------------+------+
| 39|        State-gov| 77516| Bachelors|           13|      Never-married|      Adm-clerical| Not-in-family| White| Male|        2174|           0|            40| United-States| <=50K|
| 50| Self-emp-not-inc| 83311| Bachelors|           13| Married-civ-spouse|   Exec-managerial|       Husband| White| Male|           0|           0|            13| United-States| <=50K|
| 38|          Private|215646|   HS-grad|            9|           Divo

In [3]:
cols = df.columns

In [4]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [5]:
df.dtypes

[('age', 'int'),
 ('workclass', 'string'),
 ('fnlwgt', 'int'),
 ('education', 'string'),
 ('education-num', 'int'),
 ('marital-status', 'string'),
 ('occupation', 'string'),
 ('relationship', 'string'),
 ('race', 'string'),
 ('sex', 'string'),
 ('capital-gain', 'int'),
 ('capital-loss', 'int'),
 ('hours-per-week', 'int'),
 ('native-country', 'string'),
 ('income', 'string')]

In [6]:
cat_features = [item[0] for item in df.dtypes if item[1]=='string']
cat_features

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'income']

In [7]:
# 需要删除 income列，否则标签泄露
cat_features.remove('income')
cat_features

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [8]:
num_features = [item[0] for item in df.dtypes if item[1]!='string']
num_features

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

stages = []

In [10]:
for col in cat_features:
    # 字符串转成索引
    string_index = StringIndexer(inputCol = col, outputCol = col + 'Index')
    # 转换为OneHot编码
    encoder = OneHotEncoder(inputCols=[string_index.getOutputCol()], outputCols=[col + "_one_hot"])
    # 将每个字段的转换方式 放到stages中
    stages += [string_index, encoder]

In [11]:
# 将income转换为索引
label_string_index = StringIndexer(inputCol = 'income', outputCol = 'label')
# 添加到stages中
stages += [label_string_index]


In [12]:
# 类别变量 + 数值变量
assembler_cols = [c + "_one_hot" for c in cat_features] + num_features
assembler = VectorAssembler(inputCols=assembler_cols, outputCol="features")
stages += [assembler]

In [14]:
# 使用pipeline完成数据处理
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)
selected_cols = ["label", "features"] + cols
df = df.select(selected_cols)
df.show(3)


IllegalArgumentException: requirement failed: Output column label already exists.

In [15]:
import pandas as pd
pd.DataFrame(df.take(20), columns = df.columns)

Unnamed: 0,label,features,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,1.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,1.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [16]:
display(df)

DataFrame[label: double, features: vector, age: int, workclass: string, fnlwgt: int, education: string, education-num: int, marital-status: string, occupation: string, relationship: string, race: string, sex: string, capital-gain: int, capital-loss: int, hours-per-week: int, native-country: string, income: string]

In [17]:
train, test = df.randomSplit([0.7, 0.3], seed=2021)
print(train.count())
print(test.count())


22795
9766


In [18]:
from pyspark.ml.classification import LogisticRegression
# 创建模型
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label',maxIter=10)
lr_model = lr.fit(train)

In [19]:
predictions = lr_model.transform(test)
predictions.take(1)

[Row(label=0.0, features=SparseVector(100, {0: 1.0, 8: 1.0, 23: 1.0, 29: 1.0, 43: 1.0, 48: 1.0, 52: 1.0, 53: 1.0, 94: 32.0, 95: 130304.0, 96: 9.0, 98: 1485.0, 99: 48.0}), age=32, workclass=' Private', fnlwgt=130304, education=' HS-grad', education-num=9, marital-status=' Married-civ-spouse', occupation=' Prof-specialty', relationship=' Husband', race=' White', sex=' Male', capital-gain=0, capital-loss=1485, hours-per-week=48, native-country=' United-States', income=' <=50K', rawPrediction=DenseVector([-0.1896, 0.1896]), probability=DenseVector([0.4527, 0.5473]), prediction=1.0)]

In [20]:
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [21]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)
selected.show(4)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

+-----+----------+--------------------+---+---------------+
|label|prediction|         probability|age|     occupation|
+-----+----------+--------------------+---+---------------+
|  0.0|       1.0|[0.45274048564839...| 32| Prof-specialty|
|  0.0|       1.0|[0.18385235468441...| 36| Prof-specialty|
|  0.0|       0.0|[0.70663779133990...| 26| Prof-specialty|
|  0.0|       0.0|[0.75021047636741...| 29| Prof-specialty|
+-----+----------+--------------------+---+---------------+
only showing top 4 rows



In [23]:
# rawPrediction 预测的原始数据，prediction分类结果
pd.DataFrame(predictions.take(4), columns = predictions.columns)

Unnamed: 0,label,features,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,rawPrediction,probability,prediction
0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",32,Private,130304,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1485,48,United-States,<=50K,"[-0.18960404004010267, 0.18960404004010267]","[0.4527404856483953, 0.5472595143516047]",1.0
1,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",36,Private,370767,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,2377,60,United-States,<=50K,"[-1.490462261179387, 1.490462261179387]","[0.18385235468441496, 0.816147645315585]",1.0
2,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",26,Private,340787,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K,"[0.8791101642782889, -0.8791101642782889]","[0.7066377913399012, 0.29336220866009877]",0.0
3,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",29,Private,40295,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K,"[1.0997351445251138, -1.0997351445251138]","[0.7502104763674187, 0.24978952363258133]",0.0


In [24]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# 模型评估，通过原始数据 rawPrediction计算AUC
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('AUC：', evaluator.evaluate(predictions))

AUC： 0.9062153434371653


In [25]:
evaluator.getMetricName()

'areaUnderROC'

In [26]:

print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [27]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# 创建网络参数，用于交叉验证
param_grid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

In [28]:
# 五折交叉验证，设置模型，网格参数，验证方法，折数
cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)
# 交叉验证运行
cv_model = cv.fit(train)

In [29]:
# 查看cv有哪些参数可以调整
print(cv.explainParams())

estimator: estimator to be cross-validated (current: LogisticRegression_2ffa2f7a62da)
estimatorParamMaps: estimator param maps (current: [{Param(parent='LogisticRegression_2ffa2f7a62da', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_2ffa2f7a62da', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_2ffa2f7a62da', name='maxIter', doc='max number of iterations (>= 0).'): 1}, {Param(parent='LogisticRegression_2ffa2f7a62da', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_2ffa2f7a62da', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_2ffa2f7a62da', name='maxIter', doc='max number of iterations (

In [37]:
# 对于测试数据，使用五折交叉验证
predictions = cv_model.transform(test)
print('AUC：', evaluator.evaluate(predictions))

AUC： 0.9054096433333642


In [30]:

from pyspark.ml.classification import DecisionTreeClassifier

# 创建决策树模型
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dt_model = dt.fit(train)

In [31]:
print(dt_model._call_java('toDebugString'))

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3d9342284e19, depth=3, numNodes=11, numClasses=2, numFeatures=100
  If (feature 23 in {0.0})
   If (feature 97 <= 7792.0)
    Predict: 0.0
   Else (feature 97 > 7792.0)
    If (feature 94 <= 19.5)
     Predict: 0.0
    Else (feature 94 > 19.5)
     Predict: 1.0
  Else (feature 23 not in {0.0})
   If (feature 96 <= 12.5)
    If (feature 97 <= 3460.0)
     Predict: 0.0
    Else (feature 97 > 3460.0)
     Predict: 1.0
   Else (feature 96 > 12.5)
    Predict: 1.0



In [32]:
print("numNodes = ", dt_model.numNodes)
print("depth = ", dt_model.depth)

numNodes =  11
depth =  3


In [33]:
predictions = dt_model.transform(test)
predictions.printSchema()


root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [34]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [35]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.7451185461316444

In [36]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
param_grid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

In [37]:
# 设置五折交叉验证
cv = CrossValidator(estimator=dt, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)
# 运行cv
cv_model = cv.fit(train)

In [38]:
# 查看最优模型
print("numNodes = ", cv_model.bestModel.numNodes)
print("depth = ", cv_model.bestModel.depth)

numNodes =  427
depth =  10


In [39]:
# 使用五折交叉验证进行预测
predictions = cv_model.transform(test)
evaluator.evaluate(predictions)

0.7841046678749348

In [40]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [41]:
selected.show(3)

+-----+----------+--------------------+---+---------------+
|label|prediction|         probability|age|     occupation|
+-----+----------+--------------------+---+---------------+
|  0.0|       0.0|[0.78571428571428...| 32| Prof-specialty|
|  0.0|       0.0|[0.88888888888888...| 36| Prof-specialty|
|  0.0|       0.0|[0.87179487179487...| 26| Prof-specialty|
+-----+----------+--------------------+---+---------------+
only showing top 3 rows



In [42]:
from pyspark.ml.classification import RandomForestClassifier
# 随机森林
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rf_model = rf.fit(train)
predictions = rf_model.transform(test)
predictions.printSchema()


root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [43]:

selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)


DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [44]:
selected.show(3)

+-----+----------+--------------------+---+---------------+
|label|prediction|         probability|age|     occupation|
+-----+----------+--------------------+---+---------------+
|  0.0|       0.0|[0.62977902198195...| 32| Prof-specialty|
|  0.0|       0.0|[0.57127844737169...| 36| Prof-specialty|
|  0.0|       0.0|[0.69996451574317...| 26| Prof-specialty|
+-----+----------+--------------------+---+---------------+
only showing top 3 rows



In [45]:

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)


0.8971021068780939

In [46]:
param_grid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [3, 5, 7])
             .addGrid(rf.maxBins, [20, 50])
             .addGrid(rf.numTrees, [5, 10])
             .build())

In [47]:
cv = CrossValidator(estimator=rf, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)
# 运行CV（大约6分钟）
cv_model = cv.fit(train)

In [48]:
predictions = cv_model.transform(test)
evaluator.evaluate(predictions)

0.8993895941899889

In [49]:

selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [50]:
selected.show(3)

+-----+----------+--------------------+---+---------------+
|label|prediction|         probability|age|     occupation|
+-----+----------+--------------------+---+---------------+
|  0.0|       0.0|[0.60121334722878...| 32| Prof-specialty|
|  0.0|       1.0|[0.46711906543176...| 36| Prof-specialty|
|  0.0|       0.0|[0.72454251069028...| 26| Prof-specialty|
+-----+----------+--------------------+---+---------------+
only showing top 3 rows



In [51]:

best_model = cv_model.bestModel
final_predictions = best_model.transform(df)
evaluator.evaluate(final_predictions)

0.8991549274847307

In [52]:
best_model

RandomForestClassificationModel: uid=RandomForestClassifier_2212896af696, numTrees=10, numClasses=2, numFeatures=100

In [53]:
best_model.featureImportances

SparseVector(100, {0: 0.0004, 1: 0.0013, 2: 0.0, 3: 0.0001, 4: 0.0001, 5: 0.0141, 6: 0.0004, 8: 0.0109, 9: 0.0009, 10: 0.0185, 11: 0.0227, 13: 0.0039, 14: 0.0002, 15: 0.0, 16: 0.0043, 17: 0.001, 18: 0.0012, 19: 0.0003, 20: 0.0122, 21: 0.0, 23: 0.1612, 24: 0.063, 25: 0.0072, 26: 0.0003, 27: 0.0001, 29: 0.0112, 30: 0.0001, 31: 0.0061, 32: 0.0005, 33: 0.0001, 34: 0.0118, 35: 0.0004, 36: 0.0006, 37: 0.0002, 38: 0.0011, 39: 0.0016, 40: 0.0001, 41: 0.0002, 43: 0.1296, 44: 0.0462, 45: 0.0497, 46: 0.0074, 47: 0.0131, 48: 0.0004, 49: 0.0005, 50: 0.0002, 51: 0.0002, 52: 0.0016, 53: 0.0003, 54: 0.0001, 55: 0.0001, 56: 0.0, 61: 0.0005, 62: 0.0, 63: 0.0, 65: 0.0001, 67: 0.0002, 68: 0.0, 71: 0.0005, 72: 0.0001, 94: 0.0741, 95: 0.0029, 96: 0.035, 97: 0.1833, 98: 0.037, 99: 0.0589})

In [54]:
df.schema['features'].metadata

{'ml_attr': {'attrs': {'numeric': [{'idx': 94, 'name': 'age'},
    {'idx': 95, 'name': 'fnlwgt'},
    {'idx': 96, 'name': 'education-num'},
    {'idx': 97, 'name': 'capital-gain'},
    {'idx': 98, 'name': 'capital-loss'},
    {'idx': 99, 'name': 'hours-per-week'}],
   'binary': [{'idx': 0, 'name': 'workclass_one_hot_ Private'},
    {'idx': 1, 'name': 'workclass_one_hot_ Self-emp-not-inc'},
    {'idx': 2, 'name': 'workclass_one_hot_ Local-gov'},
    {'idx': 3, 'name': 'workclass_one_hot_ ?'},
    {'idx': 4, 'name': 'workclass_one_hot_ State-gov'},
    {'idx': 5, 'name': 'workclass_one_hot_ Self-emp-inc'},
    {'idx': 6, 'name': 'workclass_one_hot_ Federal-gov'},
    {'idx': 7, 'name': 'workclass_one_hot_ Without-pay'},
    {'idx': 8, 'name': 'education_one_hot_ HS-grad'},
    {'idx': 9, 'name': 'education_one_hot_ Some-college'},
    {'idx': 10, 'name': 'education_one_hot_ Bachelors'},
    {'idx': 11, 'name': 'education_one_hot_ Masters'},
    {'idx': 12, 'name': 'education_one_hot_ Ass

In [55]:
temp = df.schema["features"].metadata["ml_attr"]["attrs"]
temp


{'numeric': [{'idx': 94, 'name': 'age'},
  {'idx': 95, 'name': 'fnlwgt'},
  {'idx': 96, 'name': 'education-num'},
  {'idx': 97, 'name': 'capital-gain'},
  {'idx': 98, 'name': 'capital-loss'},
  {'idx': 99, 'name': 'hours-per-week'}],
 'binary': [{'idx': 0, 'name': 'workclass_one_hot_ Private'},
  {'idx': 1, 'name': 'workclass_one_hot_ Self-emp-not-inc'},
  {'idx': 2, 'name': 'workclass_one_hot_ Local-gov'},
  {'idx': 3, 'name': 'workclass_one_hot_ ?'},
  {'idx': 4, 'name': 'workclass_one_hot_ State-gov'},
  {'idx': 5, 'name': 'workclass_one_hot_ Self-emp-inc'},
  {'idx': 6, 'name': 'workclass_one_hot_ Federal-gov'},
  {'idx': 7, 'name': 'workclass_one_hot_ Without-pay'},
  {'idx': 8, 'name': 'education_one_hot_ HS-grad'},
  {'idx': 9, 'name': 'education_one_hot_ Some-college'},
  {'idx': 10, 'name': 'education_one_hot_ Bachelors'},
  {'idx': 11, 'name': 'education_one_hot_ Masters'},
  {'idx': 12, 'name': 'education_one_hot_ Assoc-voc'},
  {'idx': 13, 'name': 'education_one_hot_ 11th'}

In [56]:
df_importance = pd.DataFrame(columns=['idx', 'name'])
for attr in temp['numeric']:
    temp_df = {}
    temp_df['idx'] = attr['idx']
    temp_df['name'] = attr['name']
    #print(temp_df)
    df_importance = df_importance.append(temp_df, ignore_index=True)
    #print(attr['idx'], attr['name'])
    #print(attr)
    #break
df_importance

Unnamed: 0,idx,name
0,94,age
1,95,fnlwgt
2,96,education-num
3,97,capital-gain
4,98,capital-loss
5,99,hours-per-week


In [57]:
for attr in temp['binary']:
    temp_df = {}
    temp_df['idx'] = attr['idx']
    temp_df['name'] = attr['name']
    df_importance = df_importance.append(temp_df, ignore_index=True)
df_importance


Unnamed: 0,idx,name
0,94,age
1,95,fnlwgt
2,96,education-num
3,97,capital-gain
4,98,capital-loss
...,...,...
95,89,native-country_one_hot_ Yugoslavia
96,90,native-country_one_hot_ Outlying-US(Guam-USVI-...
97,91,native-country_one_hot_ Honduras
98,92,native-country_one_hot_ Hungary


In [58]:
print(best_model.featureImportances)
df_temp = pd.DataFrame(best_model.featureImportances.toArray())
df_temp.columns = ['feature_importance']
df_importance = df_importance.merge(df_temp, left_index=True, right_index=True)
df_importance

(100,[0,1,2,3,4,5,6,8,9,10,11,13,14,15,16,17,18,19,20,21,23,24,25,26,27,29,30,31,32,33,34,35,36,37,38,39,40,41,43,44,45,46,47,48,49,50,51,52,53,54,55,56,61,62,63,65,67,68,71,72,94,95,96,97,98,99],[0.0004013020608441881,0.0012924445750508182,3.456435105995699e-05,5.588407803672825e-05,7.551565730848336e-05,0.014054308091170831,0.0003757789893767961,0.010904422126683355,0.0009152505604118992,0.018543266339354793,0.02274716984792387,0.003890486631660307,0.00019418188655910784,5.946059111961288e-07,0.004333444677787218,0.0009703866558887631,0.0011544764277259712,0.00026068463946552465,0.012178529612823775,5.495791071227507e-06,0.16117105387993502,0.06297763961833429,0.007210783109246159,0.00033595466889992803,5.003879265391791e-05,0.011160910370128094,6.889872257477638e-05,0.0061010276029625585,0.00046885683582710195,0.00012181544706172387,0.01179209648492181,0.00039729386956435553,0.0005590397481244079,0.00016483061397484187,0.0011255788207995149,0.0016093068216226102,8.059685706704206e-0

Unnamed: 0,idx,name,feature_importance
0,94,age,0.000401
1,95,fnlwgt,0.001292
2,96,education-num,0.000035
3,97,capital-gain,0.000056
4,98,capital-loss,0.000076
...,...,...,...
95,89,native-country_one_hot_ Yugoslavia,0.002880
96,90,native-country_one_hot_ Outlying-US(Guam-USVI-...,0.034956
97,91,native-country_one_hot_ Honduras,0.183321
98,92,native-country_one_hot_ Hungary,0.037049


In [60]:

df_importance.sort_values(by=['feature_importance'], ascending=False, inplace=True)
df_importance


Unnamed: 0,idx,name,feature_importance
97,91,native-country_one_hot_ Honduras,0.183321
23,17,education_one_hot_ Prof-school,0.161171
43,37,occupation_one_hot_ Transport-moving,0.129630
94,88,native-country_one_hot_ Thailand,0.074126
24,18,education_one_hot_ 9th,0.062978
...,...,...,...
81,75,native-country_one_hot_ Haiti,0.000000
7,1,workclass_one_hot_ Self-emp-not-inc,0.000000
82,76,native-country_one_hot_ Iran,0.000000
74,68,native-country_one_hot_ Dominican-Republic,0.000000
