In [1]:
#importing the classes and functions
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest

In [2]:
# Locations of the poker-hand training and test CSV files, plus the Spark
# application settings consumed when the SparkConf is built.
training_data_path = 'data/poker-train.csv'
testing_data_path = 'data/poker-test.csv'
app_name = 'Poker Hand Prediction'  # passed to SparkConf.setAppName
master = 'local'  # passed to SparkConf.setMaster

In [3]:
# Build the Spark configuration (app name + master) and obtain the SparkContext.
# NOTE: `spark` holds a SparkConf object, not a SparkSession.
spark = SparkConf().setAppName(app_name).setMaster(master)
# getOrCreate() returns the already-running context when this cell is re-executed
# in the same kernel, instead of failing because only one SparkContext may be
# active at a time. When a context already exists, `spark` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=spark)

In [4]:
def _load_labeled_points(path):
    """Read a comma-separated file of integers into a cached RDD of LabeledPoint.

    Each non-blank line is split on ',' and every field is parsed with int().
    Fields 0-9 become the feature vector and field 10 becomes the label.

    Args:
        path: location of the CSV file, as accepted by ``sc.textFile``.

    Returns:
        A cached RDD of ``LabeledPoint``.

    Raises:
        At action time, ValueError if a field is not an integer and IndexError
        if a line has fewer than 11 fields.
    """
    def parse_line(line):
        values = [int(field) for field in line.split(',')]
        return LabeledPoint(values[10], values[0:10])

    return (sc.textFile(path)
              .filter(lambda line: line.strip())  # skip blank lines rather than crash in int('')
              .map(parse_line)
              # Both RDDs are reused for every configuration of the grid search;
              # caching avoids re-reading and re-parsing the file each time.
              .cache())


poker_train_data = _load_labeled_points(training_data_path)
poker_test_data = _load_labeled_points(testing_data_path)

In [5]:
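# Disabled: training and test sets are loaded from separate CSV files, so no random
# split is performed. NOTE: `poker_data` is not defined anywhere in this notebook.
#(trainingData, testData) = poker_data.randomSplit([0.7, 0.3])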

In [8]:
# Grid search over random-forest hyperparameters (tree depth, number of trees,
# impurity measure, feature-subset strategy). Each configuration is trained on
# poker_train_data and scored by its misclassification rate on poker_test_data.
# `error` collects the test error of every configuration in iteration order.

# Fixed seed so the bootstrap / feature sampling is reproducible across runs.
SEED = 42

# Loop-invariant parts of the evaluation, computed once rather than per configuration.
test_features = poker_test_data.map(lambda lp: lp.features)
test_labels = poker_test_data.map(lambda lp: lp.label)
test_count = float(poker_test_data.count())

error = []
best_err = None
best_params = None
print('Depth\tImpurity\tNum of Trees\tFeature Subset\tError')
for depth in [4,5,6,7,8]:
    for num_trees in [3,5,7,9]:
        for impurity in ['gini','entropy']:
            for feature in ['auto', 'all', 'sqrt', 'log2', 'onethird']:
                # categoricalFeaturesInfo={} makes MLlib treat every feature as continuous.
                # NOTE(review): confirm that is intended for these integer-coded features.
                model = RandomForest.trainClassifier(
                    poker_train_data, numClasses=10, categoricalFeaturesInfo={},
                    numTrees=num_trees, featureSubsetStrategy=feature, impurity=impurity,
                    maxDepth=depth, maxBins=32, seed=SEED)
                predictions = model.predict(test_features)
                labelsAndPredictions = test_labels.zip(predictions)
                # Fraction of test points whose predicted label differs from the true label.
                testErr = labelsAndPredictions.filter(
                    lambda lp: lp[0] != lp[1]).count() / test_count
                error.append(testErr)
                # Remember which configuration produced the lowest error so far.
                if best_err is None or testErr < best_err:
                    best_err = testErr
                    best_params = (depth, impurity, num_trees, feature)
                print(str(depth) + '\t\t'+ impurity + '\t\t' + str(num_trees) + '\t' + feature + '\t\t' + str(testErr))

print(min(error))
# Report the winning configuration as well, not only its error value.
print('Best (depth, impurity, num_trees, feature): ' + str(best_params))

Depth	Impurity	Num of Trees	Feature Subset	Error
4		gini		3	auto		0.481091
4		gini		3	all		0.456861
4		gini		3	sqrt		0.471075
4		gini		3	log2		0.47036
4		gini		3	onethird		0.492089
4		entropy		3	auto		0.491224
4		entropy		3	all		0.495414
4		entropy		3	sqrt		0.492813
4		entropy		3	log2		0.489803
4		entropy		3	onethird		0.466883
4		gini		5	auto		0.490241
4		gini		5	all		0.464511
4		gini		5	sqrt		0.49366
4		gini		5	log2		0.471922
4		gini		5	onethird		0.486906
4		entropy		5	auto		0.463495
4		entropy		5	all		0.46492
4		entropy		5	sqrt		0.48818
4		entropy		5	log2		0.479567
4		entropy		5	onethird		0.490246
4		gini		7	auto		0.472647
4		gini		7	all		0.472338
4		gini		7	sqrt		0.487347
4		gini		7	log2		0.497582
4		gini		7	onethird		0.483122
4		entropy		7	auto		0.477007
4		entropy		7	all		0.447594
4		entropy		7	sqrt		0.462882
4		entropy		7	log2		0.489427
4		entropy		7	onethird		0.474239
4		gini		9	auto		0.48753
4		gini		9	all		0.460166
4		gini		9	sqrt		0.492729
4		gini		9	log2		0.492012
4		gini		9

In [None]:
# Gradient-boosted trees on the same train/test RDDs used for the random-forest search.
#
# MLlib's GradientBoostedTrees.trainClassifier is documented for binary labels {0, 1},
# whereas the random-forest cell trains on this data with numClasses=10. The labels are
# therefore collapsed to "class 0" vs. "any other class" before training.
# NOTE(review): presumably class 0 means "no poker hand" -- confirm this binarisation is
# the intended task, or use a multiclass-capable model instead.
def _to_binary(lp):
    """Return a copy of `lp` whose label is 0.0 for class 0 and 1.0 for every other class."""
    return LabeledPoint(0.0 if lp.label == 0 else 1.0, lp.features)


gbt_train_data = poker_train_data.map(_to_binary)
gbt_test_data = poker_test_data.map(_to_binary)

model = GradientBoostedTrees.trainClassifier(gbt_train_data,
                                             categoricalFeaturesInfo={}, numIterations=3)

# Evaluate model on test instances and compute test error
predictions = model.predict(gbt_test_data.map(lambda x: x.features))
labelsAndPredictions = gbt_test_data.map(lambda lp: lp.label).zip(predictions)
# Fraction of (binarised) test labels the model gets wrong.
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(gbt_test_data.count())
print('Test Error = ' + str(testErr))
print('Learned classification GBT model:')
print(model.toDebugString())