In [15]:
#importing the classes and functions
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

In [16]:
# Locations of the poker-hand training/testing CSV files, together with the
# application name and master URL handed to SparkConf when the context is built.
training_data_path, testing_data_path = 'data/poker-train.csv', 'data/poker-test.csv'
app_name, master = 'Poker Hand Prediction', 'local'

In [None]:
# Configure Spark with the app name and master, then obtain the SparkContext.
# getOrCreate() hands back the already-running context when there is one, so
# re-executing this cell in a live kernel no longer raises
# "Cannot run multiple SparkContexts at once". (When a context already exists,
# the conf passed here is not re-applied to it.)
spark = SparkConf().setAppName(app_name).setMaster(master)
sc = SparkContext.getOrCreate(conf=spark)

In [18]:
def _load_labeled_points(path):
    """Read a comma-separated text file into an RDD of LabeledPoint.

    Each line is split on ',' and every field is parsed with int(). Field 10
    becomes the label and fields 0-9 become the feature vector.

    :param path: location of the file, passed straight to sc.textFile().
    :return: RDD of LabeledPoint, one element per input line.
    """
    return (sc.textFile(path)
            .map(lambda line: [int(field) for field in line.split(',')])
            .map(lambda row: LabeledPoint(row[10], row[0:10])))


# Both RDDs are consumed once per model by the grid-search cells below; caching
# keeps the parsed points around so the CSV files are not re-read and re-parsed
# for every training run and every evaluation.
poker_train_data = _load_labeled_points(training_data_path).cache()
poker_test_data = _load_labeled_points(testing_data_path).cache()

In [None]:
# Grid search over Random Forest hyper-parameters (tree depth, number of trees,
# impurity measure, feature-subset strategy). Every combination is trained on
# poker_train_data, scored on poker_test_data, and written as one
# tab-separated row to 'Random Forest.txt'.
error = []
best_err, best_setting = None, None

# The test set is identical for every model, so its size and its feature/label
# projections are computed once rather than inside the innermost loop.
test_count = float(poker_test_data.count())
test_features = poker_test_data.map(lambda lp: lp.features)
test_labels = poker_test_data.map(lambda lp: lp.label)

with open('Random Forest.txt', 'w') as f:
    # One header line, then one line per model. The trailing '\n' is required:
    # without it every record ends up on the same line of the report.
    f.write('\t'.join(['Depth', 'Impurity', 'Num of Trees', 'Feature Subset', 'Error']) + '\n')
    for depth in range(4, 10):
        for num_trees in range(4, 10, 2):
            for impurity in ['gini', 'entropy']:
                for feature in ['auto', 'all', 'sqrt', 'log2', 'onethird']:
                    model = RandomForest.trainClassifier(poker_train_data,
                                                         numClasses=10,
                                                         categoricalFeaturesInfo={},
                                                         numTrees=num_trees,
                                                         featureSubsetStrategy=feature,
                                                         impurity=impurity,
                                                         maxDepth=depth,
                                                         maxBins=32)
                    predictions = model.predict(test_features)
                    labelsAndPredictions = test_labels.zip(predictions)
                    # Fraction of test points whose predicted label differs from the true label.
                    testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / test_count
                    error.append(testErr)
                    # Remember which setting produced the lowest error so it can be reported.
                    if best_err is None or testErr < best_err:
                        best_err = testErr
                        best_setting = (depth, impurity, num_trees, feature)
                    f.write('\t'.join([str(depth), impurity, str(num_trees), feature, str(testErr)]) + '\n')

print(min(error))
print('Best setting: depth=%d, impurity=%s, num_trees=%d, feature_subset=%s' % best_setting)

In [None]:
# Grid search over Gradient Boosted Trees hyper-parameters (iterations,
# learning rate, tree depth, loss function). Every combination is trained on
# poker_train_data, scored on poker_test_data, and written as one
# tab-separated row to 'Gradient Boosting.txt'.
#
# NOTE(review): MLlib documents GradientBoostedTrees.trainClassifier as a
# binary classifier (labels in {0, 1}), while the Random Forest cell treats
# this data set as a 10-class problem (numClasses=10). The error rates produced
# here are therefore suspect -- confirm before relying on them, or use a
# multiclass-capable model.
error = []
best_err, best_setting = None, None

# range() only accepts integers, so the 0.1 .. 0.9 learning rates are derived
# from an integer range instead of range(0.1, 1, 0.1), which raises TypeError.
learning_rates = [step / 10.0 for step in range(1, 10)]

# The test set is identical for every model, so its size and its feature/label
# projections are computed once rather than inside the innermost loop.
test_count = float(poker_test_data.count())
test_features = poker_test_data.map(lambda lp: lp.features)
test_labels = poker_test_data.map(lambda lp: lp.label)

with open('Gradient Boosting.txt', 'w') as f:
    # One header line, then one line per model. The trailing '\n' is required:
    # without it every record ends up on the same line of the report.
    f.write('\t'.join(['Iterations', 'Rate', 'Depth', 'Loss', 'Error']) + '\n')
    for iters in range(4, 10):
        for rate in learning_rates:
            for depth in range(4, 10):
                for loss in ['logLoss', 'leastSquaresError', 'leastAbsoluteError']:
                    model = GradientBoostedTrees.trainClassifier(poker_train_data,
                                                                 categoricalFeaturesInfo={},
                                                                 loss=loss,
                                                                 numIterations=iters,
                                                                 learningRate=rate,
                                                                 maxDepth=depth)
                    predictions = model.predict(test_features)
                    labelsAndPredictions = test_labels.zip(predictions)
                    # Fraction of test points whose predicted label differs from the true label.
                    testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / test_count
                    error.append(testErr)
                    # Remember which setting produced the lowest error so it can be reported.
                    if best_err is None or testErr < best_err:
                        best_err = testErr
                        best_setting = (iters, rate, depth, loss)
                    f.write('\t'.join([str(iters), str(rate), str(depth), loss, str(testErr)]) + '\n')

print(min(error))
print('Best setting: iterations=%d, rate=%s, depth=%d, loss=%s' % best_setting)