In [1]:
import numpy as np
import pandas as pd

# Empty per-pick frame: one row per (match, hero, slot) plus the match outcome.
AP_data = pd.DataFrame(
    columns=["match_id", "hero_id", "player_slot", "radiant_win_res"]
)

# 113 heroes -> one integer-labelled column per hero id, 1 through 113.
Match_data_columns = range(1, 114)
Match_data = pd.DataFrame(columns=Match_data_columns)

print(Match_data)

In [2]:
%fs ls /FileStore/tables/

In [3]:
# Read the hero-pick CSV from DBFS; the first line of the file supplies the column names.
match_picks_csv = "/dbfs/FileStore/tables/match_hero_pick_50000.csv"
df_data = pd.read_csv(match_picks_csv, header=0)
print(df_data)

In [4]:
# Encode every match as one row of Match_data (one column per hero id):
#   +1 -> hero picked by one of the first five players listed for the match
#   -1 -> hero picked by one of the last five players listed for the match
#    0 -> hero not picked
# Match_Result keeps the radiant_win flag for each match id.
#
# The side is derived purely from row order, so df_data must hold exactly ten
# consecutive rows per match. That layout is assumed here, not validated.
count = 1
Match_Result = pd.DataFrame(columns=["radiant_win_res"])
for index, row in df_data.iterrows():
  match_id = row['match_id']
  hero_id = row['hero_id']
  player_slot = row['player_slot']  # read from the row but not used below
  radiant_win = row['radiant_win']
  radiant_win_res = bool(radiant_win == True)
  Match_Result.loc[match_id] = radiant_win_res

  # 0-based position of this row inside its ten-row match group.
  # The earlier test `count % 10 < 6` sent the tenth row (count % 10 == 0)
  # to the first team, producing a 6/4 split instead of 5/5.
  position = (count - 1) % 10
  if position == 0:
    # First row of a match: start from an all-zero vector sized to the frame,
    # so the cell still works if extra columns were added by a later cell.
    Match_data.loc[match_id] = np.zeros(len(Match_data.columns))
  elif match_id not in Match_data.index:
    # Fail loudly rather than let .loc create a NaN-filled row on enlargement.
    raise KeyError(
        "row %d: match_id %r has no initialised row; "
        "df_data is not grouped in blocks of ten rows per match" % (count, match_id)
    )

  side = 1 if position < 5 else -1
  # One .loc[row, col] write. The chained form .loc[row][col] = v assigns to
  # an intermediate Series that pandas may return as a copy, losing the write.
  # Hero ids that have no column are ignored instead of adding a new column.
  if hero_id in Match_data.columns:
    Match_data.loc[match_id, hero_id] = side
  count += 1

In [5]:
# Attach the per-match outcome to the hero matrix as an integer column.
radiant_win_flags = Match_Result.astype(int)
Match_data['radiant_win'] = radiant_win_flags

In [6]:
# Show pandas' summary statistics for Match_data as this cell's output.
Match_data.describe()

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# No categorical inputs are configured, so the loop below contributes no stages.
# (A disabled earlier variant used the string columns '1'..'226' with
#  '24', '108', '113', '137', '221' and '226' removed.)
categoricalColumns = []


def _index_then_encode(column_name):
  """Return the [StringIndexer, OneHotEncoder] stage pair for one categorical column.

  The indexer writes `<column>Index`; the encoder turns that into the binary
  sparse vector column `<column>classVec`. Nothing is fitted here: the stages
  only run when the enclosing Pipeline is fitted.
  """
  indexed_name = column_name + "Index"
  return [
      StringIndexer(inputCol=column_name, outputCol=indexed_name),
      OneHotEncoder(inputCol=indexed_name, outputCol=column_name + "classVec"),
  ]


stages = []
for column_name in categoricalColumns:
  stages.extend(_index_then_encode(column_name))

In [8]:
# Index the "radiant_win" outcome into the "label" column and queue it as a pipeline stage.
label_stringIdx = StringIndexer(outputCol="label", inputCol="radiant_win")
stages.append(label_stringIdx)

In [9]:
# Hero columns '1'..'113' are used directly as numeric features.
# Both lists are materialised explicitly: `map(...) + list` only works where
# map() returns a list (Python 2) and raises TypeError on Python 3.
numericCols = [str(hero_col) for hero_col in range(1, 114)]
assemblerInputs = [cat_col + "classVec" for cat_col in categoricalColumns] + numericCols

# Pack every input column into the single "features" vector column.
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [10]:
# Convert the pandas match matrix into a Spark DataFrame.
# NOTE(review): sqlContext is not defined anywhere in this notebook; presumably the runtime provides it.
spark_df = sqlContext.createDataFrame(Match_data)

# Fit every stage accumulated in `stages`, then apply the fitted stages to the same frame.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(spark_df)
dataset = pipelineModel.transform(spark_df)

# Render the transformed frame in the notebook.
display(dataset)

In [11]:
# 90/10 train/test split of the transformed dataset; the fixed seed keeps the split repeatable.
trainingData, testData = dataset.randomSplit([0.9, 0.1], seed=333)

In [12]:
from pyspark.ml.classification import LogisticRegression

# Baseline logistic regression over the assembled "features" / indexed "label" columns,
# capped at 10 optimiser iterations.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)

# Fit the baseline on the training split.
lrModel = lr.fit(trainingData)

In [13]:
# Score the test split with the fitted logistic-regression model.
predictions = lrModel.transform(testData)

In [14]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model: score the current `predictions` from their "rawPrediction" column.
# No metricName is passed, so the evaluator's default metric is reported as the cell output.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

In [15]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation: 3 regParam x 3 elasticNetParam x 1 maxIter = 9 candidates.
# regParam and elasticNetParam are written as float literals (1.0, 0.0) instead of ints.
# NOTE(review): these are floating-point params; integer grid values are reported to fail
# with a type mismatch on some older PySpark releases that do not coerce grid values.
# Confirm against the cluster's Spark version.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.5, 1.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [5])
             .build())

In [16]:
# 10-fold cross-validation of the logistic regression over `paramGrid`,
# ranked by the current `evaluator`, fitted on the training split.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=10,
)
cvModel = cv.fit(trainingData)

In [17]:
# Score the test split with the cross-validated model and report the evaluator's metric as the cell output.
predictions = cvModel.transform(testData)
evaluator.evaluate(predictions)

In [18]:
from pyspark.ml.regression import GBTRegressor
# Gradient-boosted-tree regressor that learns to predict the "radiant_win" column.
# No featuresCol is passed here, so the estimator's default feature column is used.
gbt = GBTRegressor(labelCol="radiant_win")

In [19]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
# Hero columns '1'..'113' as a concrete list: on Python 3 a bare map() is a lazy
# iterator, not the list of column names that `inputCols` expects.
featuresCols = [str(hero_col) for hero_col in range(1, 114)]
# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="rawFeatures")
# This identifies categorical features (at most 10 distinct values) and indexes them into "features".
vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features", maxCategories=10)

In [20]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
# Hyperparameter grid for the GBT ensemble (2 x 2 = 4 candidates):
#  - maxDepth: maximum depth of each tree in the ensemble
#  - maxIter: boosting iterations, i.e. number of trees in the ensemble
# Values are kept small here; deeper trees (10+) and larger ensembles (>100)
# would likely score better at a higher training cost.
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 5])
             .addGrid(gbt.maxIter, [10, 100])
             .build())

# RMSE between the GBT's label column and its prediction column is the
# metric CrossValidator uses to rank candidates.
evaluator = RegressionEvaluator(
    labelCol=gbt.getLabelCol(),
    predictionCol=gbt.getPredictionCol(),
    metricName="rmse",
)

# Cross-validated tuning of the GBT; numFolds is left at its default.
cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator)

In [21]:
from pyspark.ml import Pipeline
# Chain feature assembly, feature indexing and the cross-validated GBT into one pipeline.
gbt_stages = [vectorAssembler, vectorIndexer, cv]
pipeline = Pipeline(stages=gbt_stages)

# 90/10 split of the un-transformed Spark frame, with a fixed seed.
trainingData_DT, testData_DT = spark_df.randomSplit([0.9, 0.1], seed=333)

In [22]:
# Fit the current `pipeline` on the training split, then score the test split.
# Both `pipelineModel` and `predictions` are rebound here, replacing any earlier values.
pipelineModel = pipeline.fit(trainingData_DT)
predictions = pipelineModel.transform(testData_DT)

In [23]:
# Score the current predictions with the current evaluator and report the result.
rmse = evaluator.evaluate(predictions)
# Call form of print: same output on Python 2, and not a syntax error on Python 3.
print("RMSE on our test set: %g" % rmse)

In [24]:
from pyspark.ml.classification import RandomForestClassifier

# Baseline random forest over the assembled "features" / indexed "label" columns.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

# Fit on the training split, then score the test split.
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model with a binary-classification evaluator left at its default settings.
# NOTE: this rebinds `evaluator`, replacing the RegressionEvaluator created for the GBT run.
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

In [26]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Random-forest search space: 3 depths x 2 bin counts x 2 forest sizes = 12 candidates.
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2, 4, 6]) \
    .addGrid(rf.maxBins, [20, 60]) \
    .addGrid(rf.numTrees, [5, 20]) \
    .build()

In [27]:
# 5-fold cross-validation of the random forest over `paramGrid`, fitted on the training split.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)
cvModel = cv.fit(trainingData)

In [28]:
# Score the test split with the cross-validated model and report the evaluator's metric as the cell output.
predictions = cvModel.transform(testData)
evaluator.evaluate(predictions)