In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql.functions import trim

from pyspark.sql import SQLContext

from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StringIndexer
from pyspark.ml.clustering import KMeansModel
from pyspark.ml import Pipeline

import pandas as pd

hc = HiveContext(sc)

In [2]:
data = hc.table("cust_exp_enc.n369087_cp_segmentation_Sub_base_v3")

In [4]:
data.columns

['individual_id',
 'individual_analytics_identifier',
 'proxy_id',
 'funding_category',
 'customer_sub_segment_code',
 'medical_plan_type',
 'gender',
 'age',
 'covered_dependents_count',
 'vision_product',
 'dental_product',
 'rx_ind',
 'pas_12_months_login',
 'action_atts',
 'call_count',
 'hdhp_flag',
 'high_deductible_flag',
 'pas_12_claims',
 'pas_12_med_cost',
 'pas_12_med_cost_inn',
 'pas_12_med_cost_oon',
 'pas_12_inn_visits',
 'pas_12_oon_visits',
 'pas_12_pcp_visits',
 'urbsubr',
 'employment_index',
 'a_hh_median_income',
 'higher_education_index',
 'physical_inactivity_index',
 'hpd_at_risk',
 'hpd_child_chronic',
 'hpd_behavioral_health',
 'hpd_specialty_chronic',
 'hpd_at_polychronic',
 'hpd_others',
 'hpd_category_count',
 'disease_count',
 'eng_ind',
 'soe_active_not_engaged',
 'complaint_count',
 'appeal_count',
 'claim_tenure',
 'member_tenure',
 'new_ind']

In [3]:
cols_to_keep = ['individual_id',
 'individual_analytics_identifier',
 'proxy_id',
 'funding_category',
 'customer_sub_segment_code',
 'medical_plan_type',
 'gender',
 'age',
 'covered_dependents_count',
 'vision_product',
 'dental_product',
 'rx_ind',
 'hdhp_flag',
 'high_deductible_flag',
 'urbsubr',
 'employment_index',
 'a_hh_median_income',
 'higher_education_index',
 'physical_inactivity_index',
 'new_ind'
]

In [39]:
data_clssfctn = data[cols_to_keep]

In [40]:
data_clssfctn.groupby("new_ind").agg(F.count("individual_analytics_identifier").alias("t_cnt"),\
                                    F.countDistinct("individual_analytics_identifier").alias("uni_cnt")).collect()

[Row(new_ind=None, t_cnt=65847, uni_cnt=65847),
 Row(new_ind=1, t_cnt=2524685, uni_cnt=2524685),
 Row(new_ind=0, t_cnt=5369927, uni_cnt=5369927)]

In [41]:
#4_clusters
data_clustrng = hc.table("cust_exp_enc.n275675_cp_segmentation_base_prd_V1")

In [42]:
data_clssfctn_1 = data_clssfctn.join(data_clustrng.select("individual_analytics_identifier", "prediction"),\
                                     "individual_analytics_identifier", "inner")

In [14]:
data_clssfctn_1.show()

+-------------------------------+----------------+------------+----------------+-------------------------+------+---+------------------------+--------------+--------------+------+---------+--------------------+-------+----------------+------------------+----------------------+-------------------------+-------+----------+
|individual_analytics_identifier|   individual_id|    proxy_id|funding_category|customer_sub_segment_code|gender|age|covered_dependents_count|vision_product|dental_product|rx_ind|hdhp_flag|high_deductible_flag|urbsubr|employment_index|a_hh_median_income|higher_education_index|physical_inactivity_index|new_ind|prediction|
+-------------------------------+----------------+------------+----------------+-------------------------+------+---+------------------------+--------------+--------------+------+---------+--------------------+-------+----------------+------------------+----------------------+-------------------------+-------+----------+
|                       1000010

In [10]:
data_clssfctn_1.count()#7,828,256

2501368

In [43]:
data_clssfctn_1.write.option("sep","|").format('orc').mode("overwrite").saveAsTable("cust_exp_enc.n201366_segmentation_2020_nw_mbrs_clssfctn_inpv1_1")

In [44]:
inp_data = hc.table("cust_exp_enc.n201366_segmentation_2020_nw_mbrs_clssfctn_inpv1_1")

In [45]:
def drop_multiple_values(df, col_list):
    for item in col_list:
        df = df.filter(~(F.col(item).like('%/%')))
    return df

In [46]:
#Updating urban value in urban suburban indicator :
inp_data = inp_data.withColumn("urbsubr", when(trim(col("urbsubr")) == "U", "Urb").otherwise(col("urbsubr")))

In [47]:
def drop_unknown_n_nulls(df,col_list):
    for item in col_list:
        
        df = df.withColumn(item, when(F.lower(F.col(item)) == 'u', None).otherwise(F.col(item)))
        df = df.withColumn(item, when(trim(col(item)) == '',None).otherwise(col(item)))
    df = df.na.drop()
    return df

def string_to_num(df, col_list):
    for item in col_list:
        categories = df.select(item).distinct().rdd.flatMap(lambda x: x).collect()

        print categories
        exprs = [(F.when(F.col(item) == category, 1).otherwise(0)).alias(item+"_"+category.replace(" ","").\
                                                                         replace("(","").replace(")","")) for category in categories]

        df = df.select('*', *exprs)
        
    return df

In [48]:
def preprocess_part1(data, cols_to_keep):
    data_1 = drop_multiple_values(data, cols_to_keep)
    #print data_1.count()
    
    data_2 = drop_unknown_n_nulls(data_1, cols_to_keep)
    #print data_2.count()

    
    
    #data_3 = string_to_num(data_2,cat_list)
    #print data_3.count()
    
    return data_2

In [49]:
cat_list = ['funding_category',
                 'customer_sub_segment_code',
                 'gender',
                 'urbsubr',
           'medical_plan_type']

In [50]:
data_pre_processed = preprocess_part1(inp_data,cols_to_keep )
data_pre_processed.show(5)

+-------------------------------+-------------+------------+----------------+-------------------------+-----------------+------+---+------------------------+--------------+--------------+------+---------+--------------------+-------+----------------+------------------+----------------------+-------------------------+-------+----------+
|individual_analytics_identifier|individual_id|    proxy_id|funding_category|customer_sub_segment_code|medical_plan_type|gender|age|covered_dependents_count|vision_product|dental_product|rx_ind|hdhp_flag|high_deductible_flag|urbsubr|employment_index|a_hh_median_income|higher_education_index|physical_inactivity_index|new_ind|prediction|
+-------------------------------+-------------+------------+----------------+-------------------------+-----------------+------+---+------------------------+--------------+--------------+------+---------+--------------------+-------+----------------+------------------+----------------------+-------------------------+------

In [51]:
data_pre_processed.write.option("sep","|").format('orc').mode("overwrite").saveAsTable("cust_exp_enc.n201366_segmentation_2020_nw_mbrs_clssfctn_inpv2_2")

In [2]:
post_pre_process = hc.table("cust_exp_enc.n201366_segmentation_2020_nw_mbrs_clssfctn_inpv2_2")

In [3]:
to_scale_features = ['age',
             'covered_dependents_count',
                     'a_hh_median_income']

In [None]:
_scaled_pre_process = post_pre_process
for item in to_scale_features:
    vec_assembler = VectorAssembler(inputCols= [item] , outputCol = "vec"+item)
    scaler = StandardScaler(inputCol="vec"+item, outputCol= 'scaled_'+item)
    vec_scaler_pipeline = Pipeline(stages=[vec_assembler, scaler])
    scaled = vec_scaler_pipeline.fit(_scaled_pre_process)
    _scaled_pre_process = scaled.transform(_scaled_pre_process)
    

In [None]:
data_scaled = _scaled_pre_process.drop('age',
                                       'covered_dependents_count',
                                       'a_hh_median_income',
                                       'vecage',
                                       'veccovered_dependents_count',
                                       'veca_hh_median_income'
                                      )
for item in to_scale_features:
    
    data_scaled = data_scaled.withColumnRenamed("scaled_"+item, item)

In [None]:
data_scaled.columns

In [57]:
cat_list

['funding_category',
 'customer_sub_segment_code',
 'gender',
 'urbsubr',
 'medical_plan_type']

In [60]:
con_list = ['vision_product',
            'dental_product',
            'rx_ind',
            'hdhp_flag',
            'high_deductible_flag',
            'employment_index',
            'higher_education_index',
            'physical_inactivity_index',
            'new_ind',
            'age',
            'covered_dependents_count',
            'a_hh_median_income'
           ]
labelcol = 'prediction'
indexcol = 'individual_analytics_identifier'

### Handling categorical variables & creating features vector

In [65]:
def get_dummy(df,indexCol,categoricalCols,continuousCols,labelCol):

    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col

    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
                 for c in categoricalCols ]

    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()))
                 for indexer in indexers ]
    
    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")

    pipeline = Pipeline(stages=indexers + encoders + [assembler])

    model=pipeline.fit(df)
    data = model.transform(df)

    data = data.withColumn('label',col(labelCol))

    return data

In [69]:
model_input.columns

['individual_analytics_identifier',
 'individual_id',
 'proxy_id',
 'funding_category',
 'customer_sub_segment_code',
 'medical_plan_type',
 'gender',
 'vision_product',
 'dental_product',
 'rx_ind',
 'hdhp_flag',
 'high_deductible_flag',
 'urbsubr',
 'employment_index',
 'higher_education_index',
 'physical_inactivity_index',
 'new_ind',
 'prediction',
 'age',
 'covered_dependents_count',
 'a_hh_median_income',
 'funding_category_indexed',
 'customer_sub_segment_code_indexed',
 'gender_indexed',
 'urbsubr_indexed',
 'medical_plan_type_indexed',
 'funding_category_indexed_encoded',
 'customer_sub_segment_code_indexed_encoded',
 'gender_indexed_encoded',
 'urbsubr_indexed_encoded',
 'medical_plan_type_indexed_encoded',
 'features',
 'label']

In [66]:
model_input = get_dummy(data_scaled,indexcol,cat_list,con_list,labelcol)

In [None]:
#Ref Col:
funding_category: C
customer_sub_segment_code: BOA
medical_plan_type: Indemnity Medical
gender: F
urbsubr: S

In [73]:
print model_input.select('funding_category','funding_category_indexed_encoded').distinct().show()
print model_input.select('customer_sub_segment_code', 'customer_sub_segment_code_indexed_encoded').distinct().show()
print model_input.select('medical_plan_type','medical_plan_type_indexed_encoded').distinct().show()
print model_input.select('gender','gender_indexed_encoded').distinct().show()
print model_input.select('urbsubr','urbsubr_indexed_encoded').distinct().show()

+----------------+--------------------------------+
|funding_category|funding_category_indexed_encoded|
+----------------+--------------------------------+
|               C|                       (2,[],[])|
|               B|                   (2,[0],[1.0])|
|               A|                   (2,[1],[1.0])|
+----------------+--------------------------------+

None
+-------------------------+-----------------------------------------+
|customer_sub_segment_code|customer_sub_segment_code_indexed_encoded|
+-------------------------+-----------------------------------------+
|                       NA|                            (6,[0],[1.0])|
|                      SEL|                            (6,[3],[1.0])|
|                      NAG|                            (6,[2],[1.0])|
|                      FED|                            (6,[5],[1.0])|
|                       SG|                            (6,[4],[1.0])|
|                      KEY|                            (6,[1],[1.0])|


In [83]:
model_input_1 = model_input.select("individual_analytics_identifier","new_ind","features", "label")

In [84]:
model_input_1.write.option("sep","|").format('orc').mode("overwrite").saveAsTable("cust_exp_enc.n201366_segmentation_2020_nw_mbrs_clssfctn_inpv3_3")

In [85]:
model_inp = hc.table("cust_exp_enc.n201366_segmentation_2020_nw_mbrs_clssfctn_inpv3_3")

In [168]:
train, test = model_inp.randomSplit([0.7, 0.3], seed = 50)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 5442085
Test Dataset Count: 2334584


In [169]:
from pyspark.ml.classification import LogisticRegression

In [170]:
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train)

In [171]:
train_predictions  = lrModel.transform(train)

In [172]:
test_predictions  = lrModel.transform(test)

In [173]:
print evaluator.evaluate(train_predictions)
print evaluator.evaluate(test_predictions)

0.786088428251
0.786074210609


In [90]:
train_predictions.select('individual_analytics_identifier', 'label', 'rawPrediction','probability', 'prediction').show(10)

+-------------------------------+-----+--------------------+--------------------+----------+
|individual_analytics_identifier|label|       rawPrediction|         probability|prediction|
+-------------------------------+-----+--------------------+--------------------+----------+
|                      100001429|    1|[-3.8815428363057...|[9.83973344110295...|       1.0|
|                      100005390|    1|[-2.6808948165222...|[0.00678951406410...|       3.0|
|                      100006271|    3|[-1.3718995945003...|[0.03373148351050...|       3.0|
|                      100007383|    2|[-2.0227824327116...|[1.43299474474640...|       2.0|
|                      100007493|    1|[-4.1549283660421...|[3.37475693096345...|       1.0|
|                      100008521|    2|[-4.1383062673373...|[3.71105075769068...|       1.0|
|                      100008817|    1|[-5.6483357814647...|[2.73167733178818...|       1.0|
|                      100009378|    1|[-4.8874226844012...|[8.8013976

In [91]:
train_crschck = train_predictions.groupby("new_ind","label","prediction").count().toPandas()

In [92]:
train_crschck.sort_values(by = ["label"])

Unnamed: 0,new_ind,label,prediction,count
9,0,0,3.0,344146
3,0,0,0.0,53618
17,1,0,2.0,54485
6,0,0,1.0,7070
7,1,0,0.0,990131
10,0,0,2.0,5579
1,0,1,3.0,80317
2,0,1,2.0,53150
8,0,1,1.0,1318428
16,0,2,0.0,51


In [93]:

test_crschck = test_predictions.groupby("new_ind","label","prediction").count().toPandas()
test_crschck.sort_values(by = ["label"])

Unnamed: 0,new_ind,label,prediction,count
9,0,0,3.0,147234
3,0,0,0.0,23061
17,1,0,2.0,23640
6,0,0,1.0,3048
7,1,0,0.0,424018
10,0,0,2.0,2424
1,0,1,3.0,34218
2,0,1,2.0,22689
8,0,1,1.0,565335
16,0,2,0.0,25


In [94]:
print "Coefficients: \n" + str(lrModel.coefficientMatrix)

Coefficients: 
DenseMatrix([[  2.18154165e-01,   4.78341096e-01,   3.54696944e-01,
                4.35297430e-01,  -2.76210205e-01,   4.26787254e-01,
                7.22021903e-01,  -1.54116208e+00,   1.75597338e-01,
                3.92843033e-01,   1.72763812e-01,   7.81739469e-01,
                2.88887594e-01,   4.92271448e-02,  -1.69325705e-02,
                2.51011791e-02,  -1.20700367e-01,   3.56610731e-01,
               -8.39744541e-03,  -2.81018630e-01,   7.82509704e+00,
                1.95134563e-01,   1.64100117e+00,   4.54362456e+00,
               -2.44962861e+00,  -2.69670485e-01,  -1.93400421e-01],
             [ -1.59282511e-01,  -6.66521828e-01,  -2.67269120e-01,
               -4.66936068e-01,  -2.20647325e-01,  -5.13079812e-01,
               -8.24317904e-01,   2.47141066e+00,  -1.44076299e-01,
               -4.67901894e-01,  -2.66653095e-01,  -8.85856837e-01,
               -3.74510785e-01,  -1.62018323e-01,   7.54762661e-04,
               -2.03470792e-02, 

In [95]:
# Feature selection using chisquareSelector
from pyspark.ml.feature import ChiSqSelector
css = ChiSqSelector(featuresCol='features',outputCol='Aspect',labelCol='label',fpr=0.05)
train=css.fit(train).transform(train)
test=css.fit(test).transform(test)
test.select("Aspect").show(5,truncate=False)

Py4JJavaError: An error occurred while calling o2104.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 242.0 failed 4 times, most recent failure: Lost task 0.3 in stage 242.0 (TID 16050, xhadoopm1379p.aetna.com, executor 1389): org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 26.
	at org.apache.spark.mllib.stat.test.ChiSqTest$$anonfun$3$$anonfun$apply$1$$anonfun$apply$2.apply(ChiSqTest.scala:106)
	at org.apache.spark.mllib.stat.test.ChiSqTest$$anonfun$3$$anonfun$apply$1$$anonfun$apply$2.apply(ChiSqTest.scala:104)
	at scala.collection.immutable.HashMap$HashMap1.foreach(HashMap.scala:221)
	at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:428)
	at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:428)
	at org.apache.spark.mllib.stat.test.ChiSqTest$$anonfun$3$$anonfun$apply$1.apply(ChiSqTest.scala:104)
	at org.apache.spark.mllib.stat.test.ChiSqTest$$anonfun$3$$anonfun$apply$1.apply(ChiSqTest.scala:98)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:193)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$countByKey$1.apply(PairRDDFunctions.scala:375)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$countByKey$1.apply(PairRDDFunctions.scala:375)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.PairRDDFunctions.countByKey(PairRDDFunctions.scala:374)
	at org.apache.spark.rdd.RDD$$anonfun$countByValue$1.apply(RDD.scala:1204)
	at org.apache.spark.rdd.RDD$$anonfun$countByValue$1.apply(RDD.scala:1204)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.countByValue(RDD.scala:1203)
	at org.apache.spark.mllib.stat.test.ChiSqTest$.chiSquaredFeatures(ChiSqTest.scala:120)
	at org.apache.spark.mllib.stat.Statistics$.chiSqTest(Statistics.scala:176)
	at org.apache.spark.mllib.feature.ChiSqSelector.fit(ChiSqSelector.scala:235)
	at org.apache.spark.ml.feature.ChiSqSelector.fit(ChiSqSelector.scala:170)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Chi-square test expect factors (categorical values) but found more than 10000 distinct values in column 26.
	at org.apache.spark.mllib.stat.test.ChiSqTest$$anonfun$3$$anonfun$apply$1$$anonfun$apply$2.apply(ChiSqTest.scala:106)
	at org.apache.spark.mllib.stat.test.ChiSqTest$$anonfun$3$$anonfun$apply$1$$anonfun$apply$2.apply(ChiSqTest.scala:104)
	at scala.collection.immutable.HashMap$HashMap1.foreach(HashMap.scala:221)
	at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:428)
	at scala.collection.immutable.HashMap$HashTrieMap.foreach(HashMap.scala:428)
	at org.apache.spark.mllib.stat.test.ChiSqTest$$anonfun$3$$anonfun$apply$1.apply(ChiSqTest.scala:104)
	at org.apache.spark.mllib.stat.test.ChiSqTest$$anonfun$3$$anonfun$apply$1.apply(ChiSqTest.scala:98)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:193)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [97]:
train.show()

+-------------------------------+-------+--------------------+-----+
|individual_analytics_identifier|new_ind|            features|label|
+-------------------------------+-------+--------------------+-----+
|                       10000031|      0|(27,[0,2,8,10,11,...|    1|
|                      100000531|      0|(27,[0,4,8,12,17,...|    3|
|                      100002342|      0|(27,[0,4,9,11,15,...|    3|
|                      100002947|      0|(27,[0,2,9,11,16,...|    3|
|                       10000328|      0|(27,[0,2,10,11,18...|    2|
|                      100003456|      1|(27,[0,2,8,11,20,...|    2|
|                       10000368|      0|(27,[0,2,8,11,20,...|    1|
|                      100003915|      0|(27,[0,4,11,15,16...|    1|
|                      100004198|      0|(27,[0,2,8,9,11,2...|    1|
|                      100004287|      0|(27,[0,2,8,10,11,...|    1|
|                      100004528|      0|(27,[0,3,9,11,16,...|    3|
|                      100005128| 

### Feature selection: Trial Version

In [111]:
importance_list = pd.Series(lrModel.coefficientMatrix.toArray()[0])
sorted_imp = importance_list.sort_values(ascending= False)
kept = list((sorted_imp[sorted_imp > 0.03]).index)
from pyspark.ml.feature import VectorSlicer
vector_slicer = VectorSlicer(inputCol= "features", indices= kept, outputCol= "feature_subset")
with_selected_feature = vector_slicer.transform(model_input_1)
with_selected_feature.show(truncate=False)
lr_subset = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel_ss = lr_subset.fit(train)

In [182]:
sorted_imp

20    7.825097
23    4.543625
22    1.641001
11    0.781739
6     0.722022
1     0.478341
3     0.435297
5     0.426787
9     0.392843
17    0.356611
2     0.354697
12    0.288888
0     0.218154
21    0.195135
8     0.175597
10    0.172764
13    0.049227
15    0.025101
18   -0.008397
14   -0.016933
16   -0.120700
26   -0.193400
25   -0.269670
4    -0.276210
19   -0.281019
7    -1.541162
24   -2.449629
dtype: float64

In [118]:
lr_subset = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel_ss = lr_subset.fit(train)

In [119]:
train_predictions_ss  = lrModel_ss.transform(train)
test_predictions_ss  = lrModel_ss.transform(test)

In [121]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator as MCE
evaluator = MCE()

In [122]:
print evaluator.evaluate(train_predictions_ss)
print evaluator.evaluate(test_predictions_ss)

0.785831900805
0.786046466313


### Cross Validation & Parameter Tuning

In [137]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [145]:
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

In [146]:
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

In [147]:
cvModel = cv.fit(train)
prediction = cvModel.transform(test)


In [148]:
evaluator.evaluate(prediction)

0.7862055650361585

In [176]:
cvModel.bestModel.coefficientMatrix.toArray()

array([[  1.62585346e-01,   2.88983287e-01,   2.30580164e-01,
          2.66366670e-01,  -1.70252750e-01,   2.54847897e-01,
          4.78552552e-01,  -9.39517838e-01,   1.15811930e-01,
          2.48982489e-01,   1.20944902e-01,   5.17602574e-01,
          1.92771501e-01,   2.95027657e-03,  -2.86615665e-02,
         -1.75722700e-03,  -9.26429658e-02,   2.14650450e-01,
         -6.74615368e-03,  -1.86233051e-01,   5.04151616e+00,
          1.10973690e-01,   1.08707564e+00,   2.90429446e+00,
         -1.60106183e+00,  -1.85791396e-01,  -1.30685571e-01],
       [ -9.87368525e-02,  -4.33569652e-01,  -1.43607660e-01,
         -2.94003891e-01,  -1.55729287e-01,  -3.32062630e-01,
         -5.76566601e-01,   1.49830750e+00,  -8.79471807e-02,
         -2.86562136e-01,  -1.93920249e-01,  -5.78865253e-01,
         -2.48378896e-01,  -7.75651968e-02,   2.84576639e-02,
          7.90626139e-03,   1.25433466e-01,  -2.65288922e-01,
          5.61108164e-02,   1.70225166e-01,  -5.82603214e+00,
       

In [159]:
bestModel = cvModel.bestModel

In [162]:
print 'Best Param (regParam): ', bestModel._java_obj.getRegParam()
print 'Best Param (ElasticNetParam): ', bestModel._java_obj.getElasticNetParam()
print 'Best Param (MaxIter): ', bestModel._java_obj.getMaxIter()

Best Param (regParam):  0.01
Best Param (ElasticNetParam):  0.0
Best Param (MaxIter):  10


In [177]:
train_predictions

DataFrame[individual_analytics_identifier: string, new_ind: int, features: vector, label: int, rawPrediction: vector, probability: vector, prediction: double]

In [180]:
train_merge = model_input.join(train_predictions, "individual_analytics_identifier", "inner")

In [181]:
train_merge.show()

+-------------------------------+----------------+------------+----------------+-------------------------+-----------------+------+--------------+--------------+------+---------+--------------------+-------+----------------+----------------------+-------------------------+-------+----------+--------------------+------------------------+--------------------+------------------------+---------------------------------+--------------+---------------+-------------------------+--------------------------------+-----------------------------------------+----------------------+-----------------------+---------------------------------+--------------------+-----+-------+--------------------+-----+--------------------+--------------------+----------+
|individual_analytics_identifier|   individual_id|    proxy_id|funding_category|customer_sub_segment_code|medical_plan_type|gender|vision_product|dental_product|rx_ind|hdhp_flag|high_deductible_flag|urbsubr|employment_index|higher_education_index|physic

#### Chi Square Tests for Binary, discrete and nominal

In [184]:
con_list_wo_income = [x for x in con_list if x != 'a_hh_median_income']

In [185]:
def string_to_num(df, col_list):
    for item in col_list:
        categories = df.select(item).distinct().rdd.flatMap(lambda x: x).collect()

        print categories
        exprs = [(F.when(F.col(item) == categories[i], 1).otherwise(0)).\
                 alias(item+"_"+categories[i].replace(" ","").\
                 replace("(","").replace(")","")) for i in range(0, len(categories)-1)]

        df = df.select('*', *exprs)
        
    return df

In [186]:
data_scaled.show()

Py4JJavaError: An error occurred while calling o1502.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3067.0 failed 4 times, most recent failure: Lost task 0.3 in stage 3067.0 (TID 35311, xhadoopm775p.aetna.com, executor 1447): org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.security.token.SecretManager$InvalidToken): token (HDFS_DELEGATION_TOKEN token 110829757 for n201366) is expired
	at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1554)
	at org.apache.hadoop.ipc.Client.call(Client.java:1498)
	at org.apache.hadoop.ipc.Client.call(Client.java:1398)
	at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:233)
	at com.sun.proxy.$Proxy19.getFileInfo(Unknown Source)
	at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:818)
	at sun.reflect.GeneratedMethodAccessor36.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:291)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:203)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:185)
	at com.sun.proxy.$Proxy20.getFileInfo(Unknown Source)
	at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:2165)
	at org.apache.hadoop.hdfs.DistributedFileSystem$26.doCall(DistributedFileSystem.java:1442)
	at org.apache.hadoop.hdfs.DistributedFileSystem$26.doCall(DistributedFileSystem.java:1438)
	at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
	at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1438)
	at org.apache.spark.deploy.SparkHadoopUtil.listLeafStatuses(SparkHadoopUtil.scala:201)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.listOrcFiles(OrcFileOperator.scala:94)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.getFileReader(OrcFileOperator.scala:67)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$$anonfun$readSchema$1.apply(OrcFileOperator.scala:77)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$$anonfun$readSchema$1.apply(OrcFileOperator.scala:77)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
	at scala.collection.immutable.List.flatMap(List.scala:344)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.readSchema(OrcFileOperator.scala:77)
	at org.apache.spark.sql.hive.orc.OrcFileFormat$$anonfun$buildReader$2.apply(OrcFileFormat.scala:162)
	at org.apache.spark.sql.hive.orc.OrcFileFormat$$anonfun$buildReader$2.apply(OrcFileFormat.scala:156)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:138)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:122)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:168)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:333)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2386)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2788)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2385)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2392)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2128)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2127)
	at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2818)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2127)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2342)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
	at sun.reflect.GeneratedMethodAccessor214.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.security.token.SecretManager$InvalidToken): token (HDFS_DELEGATION_TOKEN token 110829757 for n201366) is expired
	at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1554)
	at org.apache.hadoop.ipc.Client.call(Client.java:1498)
	at org.apache.hadoop.ipc.Client.call(Client.java:1398)
	at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:233)
	at com.sun.proxy.$Proxy19.getFileInfo(Unknown Source)
	at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:818)
	at sun.reflect.GeneratedMethodAccessor36.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:291)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:203)
	at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:185)
	at com.sun.proxy.$Proxy20.getFileInfo(Unknown Source)
	at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:2165)
	at org.apache.hadoop.hdfs.DistributedFileSystem$26.doCall(DistributedFileSystem.java:1442)
	at org.apache.hadoop.hdfs.DistributedFileSystem$26.doCall(DistributedFileSystem.java:1438)
	at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
	at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1438)
	at org.apache.spark.deploy.SparkHadoopUtil.listLeafStatuses(SparkHadoopUtil.scala:201)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.listOrcFiles(OrcFileOperator.scala:94)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.getFileReader(OrcFileOperator.scala:67)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$$anonfun$readSchema$1.apply(OrcFileOperator.scala:77)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$$anonfun$readSchema$1.apply(OrcFileOperator.scala:77)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
	at scala.collection.immutable.List.flatMap(List.scala:344)
	at org.apache.spark.sql.hive.orc.OrcFileOperator$.readSchema(OrcFileOperator.scala:77)
	at org.apache.spark.sql.hive.orc.OrcFileFormat$$anonfun$buildReader$2.apply(OrcFileFormat.scala:162)
	at org.apache.spark.sql.hive.orc.OrcFileFormat$$anonfun$buildReader$2.apply(OrcFileFormat.scala:156)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:138)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:122)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:168)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
model_input = get_dummy(data_scaled,indexcol,cat_list,con_list,labelcol)

In [None]:
def get_dummy(df,indexCol,categoricalCols,continuousCols,labelCol):

    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col

    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
                 for c in categoricalCols ]

    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()))
                 for indexer in indexers ]
    
    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")

    pipeline = Pipeline(stages=indexers + encoders + [assembler])

    model=pipeline.fit(df)
    data = model.transform(df)

    data = data.withColumn('label',col(labelCol))

    return data