In [1]:
from pyspark.sql import DataFrame
from pyspark.sql import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.tuning import *
from pyspark.ml.feature import Imputer

In [2]:
import numpy as np
import pandas as pd
import scipy as sc

In [3]:
dat = spark.read.csv("/user/dk444/train.csv",header=True, inferSchema=True)

In [5]:
test = spark.read.csv("/user/dk444/test.csv", header = True, inferSchema=True)

In [4]:
print(len(dat.columns))
print(dat.count())
dat.rdd.getNumPartitions()

133
114321


3

In [8]:
#repartition to 5 blocks
dat = dat.repartition(8)
test = dat.repartition(8)

In [9]:
dat.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- v1: double (nullable = true)
 |-- v2: double (nullable = true)
 |-- v3: string (nullable = true)
 |-- v4: double (nullable = true)
 |-- v5: double (nullable = true)
 |-- v6: double (nullable = true)
 |-- v7: double (nullable = true)
 |-- v8: double (nullable = true)
 |-- v9: double (nullable = true)
 |-- v10: double (nullable = true)
 |-- v11: double (nullable = true)
 |-- v12: double (nullable = true)
 |-- v13: double (nullable = true)
 |-- v14: double (nullable = true)
 |-- v15: double (nullable = true)
 |-- v16: double (nullable = true)
 |-- v17: double (nullable = true)
 |-- v18: double (nullable = true)
 |-- v19: double (nullable = true)
 |-- v20: double (nullable = true)
 |-- v21: double (nullable = true)
 |-- v22: string (nullable = true)
 |-- v23: double (nullable = true)
 |-- v24: string (nullable = true)
 |-- v25: double (nullable = true)
 |-- v26: double (nullable = true)
 |-- v27: double (nu

In [10]:
dat = dat.drop('ID')
test = test.drop('ID')

In [11]:
# specify the number of rows to display by using the take()
dat.take(1)

[Row(target=0, v1=1.89126321222, v2=5.07308380726, v3='C', v4=4.53577352927, v5=10.0828062908, v6=2.93680250997, v7=3.03903435756, v8=0.399666460575, v9=7.81954975863, v10=0.568927407623, v11=14.9999998712, v12=6.24109754675, v13=3.29196682523, v14=8.44685583951, v15=1.5199164593, v16=6.34482710368, v17=3.58590890437, v18=0.613470427774, v19=0.199895316332, v20=17.3551127837, v21=4.35819923434, v22='SWO', v23=-7.32699824992e-07, v24='E', v25=0.614382450625, v26=2.23203214616, v27=3.25278730828, v28=4.59910229022, v29=7.38847542283, v30='C', v31='A', v32=1.44981512376, v33=2.69516755249, v34=6.58163194177, v35=7.52788177704, v36=13.5361298373, v37=0.672783295774, v38=0, v39=0.199415650816, v40=8.86621816591, v41=6.55204512588, v42=13.0018595857, v43=2.78723337437, v44=9.07386356584, v45=10.0510205796, v46=0.609217121791, v47='C', v48=12.5992076123, v49=7.49070586136, v50=0.60156982581, v51=6.1563412669, v52='D', v53=15.4968562468, v54=0.545802944473, v55=2.03531627757, v56='AS', v57=4.5

In [12]:
# how to count unique classes in a column
dat.select(col("target")).distinct().count()

2

In [13]:
# we can do an aggregate count to see the distribution of the classes
dat.groupBy(col("target")).count().collect()

[Row(target=1, count=87021), Row(target=0, count=27300)]

In [14]:
#we can do the same thing in sql code
# we must first create a "view" of the data in order to execute SQL functions
dat.createOrReplaceTempView("dat")
spark.sql("SELECT COUNT(*) FROM dat GROUP BY Target").collect()

[Row(count(1)=87021), Row(count(1)=27300)]

In [11]:
#Convert string types(categorical) to integerTypes
types = [str(f.dataType) for f in dat.schema.fields]
ind_str = [i for i in range(len(types)) if types[i] is 'StringType']
ind_int = [i for i in range(len(types)) if types[i] is 'IntegerType']
stringcol= [dat.columns[i] for i in ind_str]
intcol = [dat.columns[i] for i in ind_int]
print(stringcol)
print(intcol)
del ind_str, ind_int, types

['v3', 'v22', 'v24', 'v30', 'v31', 'v47', 'v52', 'v56', 'v66', 'v71', 'v74', 'v75', 'v79', 'v91', 'v107', 'v110', 'v112', 'v113', 'v125']
['target', 'v38', 'v62', 'v72', 'v129']


In [12]:
# remove all string columns
for colname in stringcol:
    dat = dat.drop(colname)
    test = test.drop(colname)
    
# remove 'target' from inttype list
intcol = intcol[1:]

In [13]:
intcol

['v38', 'v62', 'v72', 'v129']

In [14]:
#Imputer requires that all columns it uses be floattype or doubletype, so let's convert ints to floats
for colname in intcol:
    dat = dat.withColumn( colname+"cast", col(colname).cast("float"))
    dat = dat.drop(colname)
    test = test.withColumn( colname+"cast", col(colname).cast("float"))
    test = test.drop(colname)
    

In [15]:
#Impute missing values for both dat and test
#Let's actually impute now!
newcolnames = [colname+"new" for colname in dat.columns[1:]]
imp = Imputer(strategy = 'mean',missingValue=None, inputCols = dat.columns[1:],outputCols=newcolnames )
model = imp.fit(dat)

In [16]:
dat = model.transform(dat)
test = model.transform(test)

In [17]:
features = dat.columns[109:]
features

['v38cast',
 'v62cast',
 'v72cast',
 'v129cast',
 'v1new',
 'v2new',
 'v4new',
 'v5new',
 'v6new',
 'v7new',
 'v8new',
 'v9new',
 'v10new',
 'v11new',
 'v12new',
 'v13new',
 'v14new',
 'v15new',
 'v16new',
 'v17new',
 'v18new',
 'v19new',
 'v20new',
 'v21new',
 'v23new',
 'v25new',
 'v26new',
 'v27new',
 'v28new',
 'v29new',
 'v32new',
 'v33new',
 'v34new',
 'v35new',
 'v36new',
 'v37new',
 'v39new',
 'v40new',
 'v41new',
 'v42new',
 'v43new',
 'v44new',
 'v45new',
 'v46new',
 'v48new',
 'v49new',
 'v50new',
 'v51new',
 'v53new',
 'v54new',
 'v55new',
 'v57new',
 'v58new',
 'v59new',
 'v60new',
 'v61new',
 'v63new',
 'v64new',
 'v65new',
 'v67new',
 'v68new',
 'v69new',
 'v70new',
 'v73new',
 'v76new',
 'v77new',
 'v78new',
 'v80new',
 'v81new',
 'v82new',
 'v83new',
 'v84new',
 'v85new',
 'v86new',
 'v87new',
 'v88new',
 'v89new',
 'v90new',
 'v92new',
 'v93new',
 'v94new',
 'v95new',
 'v96new',
 'v97new',
 'v98new',
 'v99new',
 'v100new',
 'v101new',
 'v102new',
 'v103new',
 'v104new

In [162]:
# convert the columns into a single feature vector

assembler = VectorAssembler(
    inputCols=[x for x in features],
    outputCol='features')

dat_processed = assembler.transform(dat)
test_processed = assembler.transform(test)

In [166]:
dat_processed.printSchema()

root
 |-- target: integer (nullable = true)
 |-- v1: double (nullable = true)
 |-- v2: double (nullable = true)
 |-- v4: double (nullable = true)
 |-- v5: double (nullable = true)
 |-- v6: double (nullable = true)
 |-- v7: double (nullable = true)
 |-- v8: double (nullable = true)
 |-- v9: double (nullable = true)
 |-- v10: double (nullable = true)
 |-- v11: double (nullable = true)
 |-- v12: double (nullable = true)
 |-- v13: double (nullable = true)
 |-- v14: double (nullable = true)
 |-- v15: double (nullable = true)
 |-- v16: double (nullable = true)
 |-- v17: double (nullable = true)
 |-- v18: double (nullable = true)
 |-- v19: double (nullable = true)
 |-- v20: double (nullable = true)
 |-- v21: double (nullable = true)
 |-- v23: double (nullable = true)
 |-- v25: double (nullable = true)
 |-- v26: double (nullable = true)
 |-- v27: double (nullable = true)
 |-- v28: double (nullable = true)
 |-- v29: double (nullable = true)
 |-- v32: double (nullable = true)
 |-- v33: double (n

In [13]:
# Create a logistic regression model, tree model, and a gradient boosted tree model
# Create an evaluator object

logit = LogisticRegression(featuresCol="features",labelCol="Target")
logit_model = logit.fit(train)

cart = DecisionTreeClassifier(maxDepth=10)
cart_model = cart.fit(train)

gbt = GBTClassifier(maxDepth=10, stepSize=.001,maxIter=10)
gbt_model = gbt.fit(train)

In [14]:
evaluator = BinaryClassificationEvaluator(labelCol='label')

In [15]:
pred_logit = logit_model.transform(test)
pred_cart = cart_model.transform(test)
pred_gbt = gbt_model.transform(test)

In [16]:
results = [evaluator.evaluate(i) for i in [pred_logit, pred_cart, pred_gbt]]

In [17]:
print(results)

[0.985497295510151, 0.8885200864455169, 0.8875588161461015]


In [19]:
56897/(56897+106)

0.9981404487483115

In [21]:
col_count = len(dat.columns)

assembler = VectorAssembler(
    inputCols=[x for x in features],
    outputCol='features')

dat_processed = assembler.transform(dat)
test_processed = assembler.transform(test)

normalizer = Normalizer(inputCol="features",outputCol="feature")

dat = normalizer.transform(assembler.transform(dat))

In [22]:
# Concept of pipeline
# Additionally how to cross validate
logit = LogisticRegression(featuresCol='feature',labelCol='target')
cart = DecisionTreeClassifier(featuresCol='feature',labelCol='target')
gbt = GBTClassifier(featuresCol='feature',labelCol='target')


paramGrid_logit = ParamGridBuilder() \
    .addGrid(logit.regParam, [0,0.01, 0.1]) \
    .build()
    
paramGrid_cart = ParamGridBuilder() \
    .addGrid(cart.maxDepth, [10,12,15]) \
    .build()
    
paramGrid_gbt = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [10,12,15]) \
    .addGrid(gbt.stepSize,[0.01]) \
    .addGrid(gbt.maxIter,[20]) \
    .build()

evaluator = BinaryClassificationEvaluator(labelCol='target')
    

cv_logit = CrossValidator(estimator=logit,evaluator=evaluator,estimatorParamMaps=paramGrid_logit,numFolds=10)
cv_cart = CrossValidator(estimator=cart,evaluator=evaluator,estimatorParamMaps=paramGrid_cart,numFolds=10)
cv_gbt = CrossValidator(estimator=gbt,evaluator=evaluator,estimatorParamMaps=paramGrid_gbt,numFolds=10)


In [24]:
cvmodel_logit = cv_logit.fit(dat)

In [None]:
cvmodel_cart = cv_cart.fit(dat)

In [None]:
cvmodel_gbt = cv_gbt.fit(dat)

In [169]:
print(cvmodel_logit.avgMetrics)
print(cvmodel_cart.avgMetrics)
print(cvmodel_gbt.avgMetrics)

[0.7087400464393288, 0.6992343001874801, 0.7080240843897794]
[0.5036192095312046, 0.5331364209963431, 0.5430835878468885, 0.5739519392919172]
[0.7251348350063479, 0.7448645684366266, 0.7684780371882793, 0.8201436144971819]


In [43]:
np.argmax(np.array(cvmodel_gbt.avgMetrics))

3

In [None]:
rf = RandomForestClassifier(featuresCol='feature',labelCol="target")
pipeline_rf = Pipeline(stages=[assembler,normalizer, rf])

paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [15]) \
    .addGrid(rf.numTrees, [1000]) \
    .build()
    
cv_rf = CrossValidator(estimator=pipeline_rf, evaluator=evaluator, numFolds=10, estimatorParamMaps=paramGrid_rf)
cvmodel_rf = cv_rf.fit(dat)
cvmodel_rf.avgMetrics

In [58]:
dat.rdd.sampleByKey(withReplacement=False,fractions={0:0.0001, 1:1})

In [61]:
dat.rdd.take(1)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 178673.0 failed 4 times, most recent failure: Lost task 0.3 in stage 178673.0 (TID 556659, mylaptop, executor 3): java.io.IOException: Cannot run program "/home/hduser1/anaconda3/bin/python": error=2, No such file or directory
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
	at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:163)
	at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:89)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:65)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:128)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: error=2, No such file or directory
	at java.lang.UNIXProcess.forkAndExec(Native Method)
	at java.lang.UNIXProcess.<init>(UNIXProcess.java:247)
	at java.lang.ProcessImpl.start(ProcessImpl.java:134)
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
	... 14 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:446)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Cannot run program "/home/hduser1/anaconda3/bin/python": error=2, No such file or directory
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
	at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:163)
	at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:89)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:65)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:128)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
Caused by: java.io.IOException: error=2, No such file or directory
	at java.lang.UNIXProcess.forkAndExec(Native Method)
	at java.lang.UNIXProcess.<init>(UNIXProcess.java:247)
	at java.lang.ProcessImpl.start(ProcessImpl.java:134)
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
	... 14 more
