In [1]:
%fs ls /FileStore/tables/

In [2]:
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *

# Source CSV in DBFS. Alternative sample kept for reference:
#   "/FileStore/tables/match_info_5000_only_es.csv"
inputPath = "/FileStore/tables/match_info_5000_es.csv"

# All columns except the outcome flag are parsed as integers; "radiant_win"
# is kept as a string.
integerColumns = [
  "match_id",
  "hero_id",
  "gold_per_min",
  "xp_per_min",
  "hero_damage",
  "kills",
  "assists",
]
schemaFields = [StructField(intColName, IntegerType()) for intColName in integerColumns]
schemaFields.append(StructField("radiant_win", StringType()))
Schema = StructType(schemaFields)

# Read the file with the explicit schema above; the first row is a header.
df = spark.read.csv(inputPath, schema=Schema, header="true")
df.printSchema()


In [3]:
# Render the loaded rows (all columns) as a notebook table.
display(df)

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [5]:
# Columns treated as categories rather than numbers.
categoricalColumns = ["hero_id"]
# Pipeline stages are only collected here; they are fitted later in one pass.
stages = []

for colName in categoricalColumns:
  indexedColName = colName + "Index"
  # First map each distinct category to an index ...
  stages.append(StringIndexer(inputCol=colName, outputCol=indexedColName))
  # ... then expand that index into a binary sparse vector.
  stages.append(OneHotEncoder(inputCol=indexedColName, outputCol=colName + "classVec"))

In [6]:
# Index the "radiant_win" string column into the "label" column used for training.
label_stringIdx = StringIndexer(outputCol="label", inputCol="radiant_win")
stages.append(label_stringIdx)

In [7]:
# Transform all features into a single vector column using VectorAssembler.
numericCols = ["gold_per_min", "xp_per_min", "hero_damage", "kills", "assists"]
# Build the input list with a comprehension instead of map(): on Python 3,
# map() returns an iterator and `iterator + list` raises TypeError. A list
# comprehension produces a real list on both Python 2 and Python 3.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [8]:
# Chain every stage collected so far into one Pipeline, fit it on the full
# DataFrame, then apply the fitted pipeline to that same DataFrame.
# NOTE(review): the pipeline is fitted on all of `df`, i.e. before the
# train/test split that follows, so the fitted stages have seen the test rows.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
dataset = pipelineModel.transform(df)

# Preview the transformed rows.
dataset.show()

In [9]:
# 80/20 train/test split with a fixed seed.
trainingData, testData = dataset.randomSplit([0.8, 0.2], seed=999)

In [10]:
from pyspark.ml.classification import LogisticRegression

# Baseline logistic regression, capped at 10 iterations, reading the
# "features" vector and the "label" column.
lr = LogisticRegression(
  featuresCol="features",
  labelCol="label",
  maxIter=10
)

# Fit on the training split only.
lrModel = lr.fit(trainingData)

In [11]:
# Score the held-out test split with the fitted logistic regression model and
# print the schema of the resulting DataFrame to see which columns it has.
predictions = lrModel.transform(testData)
predictions.printSchema()

In [12]:
# Show the model output next to a few of the input columns.
displayCols = [
  "label",
  "prediction",
  "probability",
  "hero_id",
  "gold_per_min",
  "xp_per_min",
  "hero_damage",
]
selected = predictions.select(*displayCols)
display(selected)

In [13]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model: score the logistic regression predictions using the
# "rawPrediction" column. The value returned by evaluate() is the cell output.
# NOTE(review): no metricName is passed, so the evaluator's library default
# metric applies (areaUnderROC per the PySpark docs -- confirm for this
# cluster's Spark version).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

In [14]:
# List every LogisticRegression parameter with its description and value.
# Written as a function call so the cell parses on Python 3 as well; with a
# single argument this prints exactly the same text on Python 2.
print(lr.explainParams())

In [15]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter grid for cross validation of the logistic regression:
# 3 regParam x 3 elasticNetParam x 3 maxIter values = 27 combinations.
lrGridBuilder = ParamGridBuilder()
lrGridBuilder = lrGridBuilder.addGrid(lr.regParam, [0.01, 0.5, 2.0])
lrGridBuilder = lrGridBuilder.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
lrGridBuilder = lrGridBuilder.addGrid(lr.maxIter, [1, 5, 10])
paramGrid = lrGridBuilder.build()

In [16]:
# 10-fold cross validation of the logistic regression over the parameter grid.
cv = CrossValidator(
  estimator=lr,
  evaluator=evaluator,
  estimatorParamMaps=paramGrid,
  numFolds=10
)

# Run the cross validation. This will likely take a fair amount of time
# because of the number of models that are created and tested.
cvModel = cv.fit(trainingData)

In [17]:
# Score the test split with the cross-validated model (this rebinds
# `predictions`), then evaluate it with the existing evaluator; the
# returned value is the cell output.
predictions = cvModel.transform(testData)
evaluator.evaluate(predictions)

In [18]:
from pyspark.ml.classification import RandomForestClassifier

# Baseline random forest on the same "features" / "label" columns; all other
# parameters are left at their defaults.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

# Fit on the training split, score the test split, and inspect the columns
# of the scored DataFrame.
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.printSchema()

In [19]:
# Show the random forest output next to a few of the input columns.
displayCols = [
  "label",
  "prediction",
  "probability",
  "hero_id",
  "gold_per_min",
  "xp_per_min",
  "hero_damage",
]
selected = predictions.select(*displayCols)
display(selected)

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model: create a fresh evaluator with all-default settings (this
# rebinds `evaluator`) and score the current `predictions` DataFrame. The
# value returned by evaluate() is the cell output.
# NOTE(review): with no arguments the library defaults decide which columns
# and metric are used -- confirm they match the columns produced above.
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

In [21]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter grid for the random forest:
# 3 maxDepth x 2 maxBins x 2 numTrees values = 12 combinations.
rfGridBuilder = ParamGridBuilder()
rfGridBuilder = rfGridBuilder.addGrid(rf.maxDepth, [2, 4, 6])
rfGridBuilder = rfGridBuilder.addGrid(rf.maxBins, [20, 60])
rfGridBuilder = rfGridBuilder.addGrid(rf.numTrees, [5, 20])
paramGrid = rfGridBuilder.build()

In [22]:
# 5-fold cross validation of the random forest over the parameter grid.
cv = CrossValidator(
  estimator=rf,
  evaluator=evaluator,
  estimatorParamMaps=paramGrid,
  numFolds=5
)
# Fit on the training split only.
cvModel = cv.fit(trainingData)

In [23]:
# Score the test split with the most recently fitted cross-validated model
# (this rebinds `predictions`) and evaluate it; the returned value is the
# cell output.
predictions = cvModel.transform(testData)
evaluator.evaluate(predictions)

In [24]:
# Show the cross-validated model output next to a few of the input columns.
displayCols = [
  "label",
  "prediction",
  "probability",
  "hero_id",
  "gold_per_min",
  "xp_per_min",
  "hero_damage",
]
selected = predictions.select(*displayCols)
display(selected)