In [1]:
#importing the classes and functions
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

In [2]:
#setting the data files for users and their friend list
file_path = 'data/poker-train.csv'
app_name = 'Poker Hand Prediction'
master = 'local'

In [3]:
#configuring the Spark and setting the master & app name
spark = SparkConf().setAppName(app_name).setMaster(master)
sc = SparkContext(conf=spark)

In [4]:
poker_data = sc.textFile(file_path).map(lambda x :[int(x) for x in x.split(',')]).map(lambda x: LabeledPoint(x[10],x[0:10]))

In [5]:
poker_data.take(10)

[LabeledPoint(9.0, [1.0,10.0,1.0,11.0,1.0,13.0,1.0,12.0,1.0,1.0]),
 LabeledPoint(9.0, [2.0,11.0,2.0,13.0,2.0,10.0,2.0,12.0,2.0,1.0]),
 LabeledPoint(9.0, [3.0,12.0,3.0,11.0,3.0,13.0,3.0,10.0,3.0,1.0]),
 LabeledPoint(9.0, [4.0,10.0,4.0,11.0,4.0,1.0,4.0,13.0,4.0,12.0]),
 LabeledPoint(9.0, [4.0,1.0,4.0,13.0,4.0,12.0,4.0,11.0,4.0,10.0]),
 LabeledPoint(8.0, [1.0,2.0,1.0,4.0,1.0,5.0,1.0,3.0,1.0,6.0]),
 LabeledPoint(8.0, [1.0,9.0,1.0,12.0,1.0,10.0,1.0,11.0,1.0,13.0]),
 LabeledPoint(8.0, [2.0,1.0,2.0,2.0,2.0,3.0,2.0,4.0,2.0,5.0]),
 LabeledPoint(8.0, [3.0,5.0,3.0,6.0,3.0,9.0,3.0,7.0,3.0,8.0]),
 LabeledPoint(8.0, [4.0,1.0,4.0,4.0,4.0,2.0,4.0,3.0,4.0,5.0])]

In [6]:
(trainingData, testData) = poker_data.randomSplit([0.7, 0.3])

In [7]:
error = 1
for impurity in ['gini','entropy']:
    for num_trees in [3,5,7,9]:
        for feature in ['auto', 'all', 'sqrt', 'log2', 'onethird']:
            print(impurity + ' ' + str(num_trees) + ' ' + feature)
            model = RandomForest.trainClassifier(trainingData, numClasses=10, categoricalFeaturesInfo={}, \
                                                 numTrees=num_trees, featureSubsetStrategy=feature, impurity=impurity, \
                                                 maxDepth=4, maxBins=32)
            predictions = model.predict(testData.map(lambda x: x.features))
            labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
            testErr = labelsAndPredictions.filter(
                lambda lp: lp[0] != lp[1]).count() / float(testData.count())
            if error > testErr:
                error = testErr
            print('Test Error = ' + str(testErr))
            print()
            
print(error)

gini 3 auto
Test Error = 0.4789209705842483

gini 3 all
Test Error = 0.4630608648502101

gini 3 sqrt
Test Error = 0.44787854141249833

gini 3 log2
Test Error = 0.46617866341331166

gini 3 onethird
Test Error = 0.4812254303917582

gini 5 auto
Test Error = 0.4875965839772265

gini 5 all
Test Error = 0.462518639013149

gini 5 sqrt
Test Error = 0.4671275586281686

gini 5 log2
Test Error = 0.49125660837738916

gini 5 onethird
Test Error = 0.4819032126880846

gini 7 auto
Test Error = 0.4673986715466992

gini 7 all
Test Error = 0.4585875016944557

gini 7 sqrt
Test Error = 0.4878676968957571

gini 7 log2
Test Error = 0.457774162938864

gini 7 onethird
Test Error = 0.46617866341331166

gini 9 auto
Test Error = 0.48922326148840994

gini 9 all
Test Error = 0.46089196150196554

gini 9 sqrt
Test Error = 0.482580994984411

gini 9 log2
Test Error = 0.486376575843839

gini 9 onethird
Test Error = 0.48393655957706383

entropy 3 auto
Test Error = 0.4671275586281686

entropy 3 all
Test Error = 0.47200759