# <center> SD701 : Kaggle Competition </center> 
<center>Arnaud Lejeune</center>

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

## Import data

In [None]:
# Read the two competition CSV files from the Databricks file store.
# Both reads use the first row as header and let Spark infer column types.
train_data = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('/FileStore/tables/train_set-51e11.csv')
)

test_data = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('/FileStore/tables/test_set-b5f57.csv')
)

# Render the training DataFrame in the notebook output.
display(train_data)

## Test du Decision Tree Classifier

In [None]:
# --- DecisionTreeClassifier ---

# Assemble the raw input columns into the single "features" vector column
# consumed by the classifier. Built in this cell so that the cell does not
# depend on any later cell having been executed first (fresh-kernel safe).
vector_assembler = VectorAssembler(
    inputCols=[
        "Elevation", "Aspect", "Slope",
        "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    + ["Wilderness_Area%d" % i for i in range(1, 5)]   # Wilderness_Area1..4
    + ["Soil_Type%d" % i for i in range(1, 41)],       # Soil_Type1..40
    outputCol="features",
)

# Train a single decision tree on the "Cover_Type" label.
dt = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="features", maxDepth=30, impurity='entropy', maxBins=128)

# Chain the vector assembler and the classification model
pipeline_dt = Pipeline(stages=[vector_assembler, dt])

# Run the pipeline stages on the training data
model = pipeline_dt.fit(train_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Cast predictions to 'int' to match the data type expected by Kaggle
predictions = predictions.withColumn("Cover_Type", predictions["prediction"].cast("int"))

# Show the schema of 'predictions'
predictions.printSchema()

## Test du Random Forest Classifier

In [None]:
# --- RandomForestClassifier ---

# Assemble the raw input columns into the single "features" vector column
# consumed by the classifier. Built in this cell so that the cell does not
# depend on any later cell having been executed first (fresh-kernel safe).
vector_assembler = VectorAssembler(
    inputCols=[
        "Elevation", "Aspect", "Slope",
        "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    + ["Wilderness_Area%d" % i for i in range(1, 5)]   # Wilderness_Area1..4
    + ["Soil_Type%d" % i for i in range(1, 41)],       # Soil_Type1..40
    outputCol="features",
)

# Train a RandomForest model on the "Cover_Type" label.
rf = RandomForestClassifier(labelCol="Cover_Type", featuresCol="features", numTrees=2, maxDepth=30, maxBins=32)

# Chain the vector assembler and the classification model
pipeline_rf = Pipeline(stages=[vector_assembler, rf])

# Run the pipeline stages on the training data
model_rf = pipeline_rf.fit(train_data)

# Make predictions on the test data
predictions_rf = model_rf.transform(test_data)

# Cast predictions to 'int' to match the data type expected by Kaggle
predictions_rf = predictions_rf.withColumn("Cover_Type", predictions_rf["prediction"].cast("int"))

# Show the schema of 'predictions_rf'
predictions_rf.printSchema()

<center>
Après le test de ces 2 modèles, d'autres modèles ont également été testés, ainsi que plusieurs paramètres, pour mieux comprendre ces classifieurs
</center>
<br>
<center>
Une fois ces tests effectués, j'ai entrepris d'utiliser une cross-validation pour tester les différents paramètres identifiés précédemment
</center>

## Random Forest Classifier - Cross-Validation

In [None]:
# Cross-validated grid search over RandomForest hyper-parameters.

# Feature vector: the 10 numeric terrain columns followed by the
# Wilderness_Area1..4 and Soil_Type1..40 indicator columns.
vector_assembler = VectorAssembler(
    inputCols=[
        "Elevation", "Aspect", "Slope",
        "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    + ["Wilderness_Area%d" % i for i in range(1, 5)]
    + ["Soil_Type%d" % i for i in range(1, 41)],
    outputCol="features",
)

# Estimator with library defaults; the grid below overrides the tuned params.
rf = RandomForestClassifier(labelCol="Cover_Type", featuresCol="features")

pipeline = Pipeline(stages=[vector_assembler, rf])

# 2 x 2 x 3 = 12 parameter combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .addGrid(rf.maxDepth, [5, 10])
    .addGrid(rf.maxBins, [4, 8, 16])
    .build()
)

# Scores each combination using the evaluator's default metric.
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction")

# 10-fold cross-validation of the whole pipeline over the grid.
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)

model = crossval.fit(train_data)

## Decision Tree Classifier - Cross-Validation

In [None]:
# Cross-validated grid search over DecisionTree hyper-parameters.

# Feature vector: the 10 numeric terrain columns followed by the
# Wilderness_Area1..4 and Soil_Type1..40 indicator columns.
vector_assembler = VectorAssembler(
    inputCols=[
        "Elevation", "Aspect", "Slope",
        "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    + ["Wilderness_Area%d" % i for i in range(1, 5)]
    + ["Soil_Type%d" % i for i in range(1, 41)],
    outputCol="features",
)

# Estimator with library defaults; the grid below overrides the tuned params.
dt = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="features")

pipeline = Pipeline(stages=[vector_assembler, dt])

# Depth is fixed at 30; only maxBins varies (3 combinations).
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [30])
    .addGrid(dt.maxBins, [32, 64, 128])
    .build()
)

# Scores each combination using the evaluator's default metric.
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction")

# 10-fold cross-validation of the whole pipeline over the grid.
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)

model = crossval.fit(train_data)

<center>
Après test de plusieurs jeux de paramètres sur ces 2 modèles, c'est le <strong>DecisionTree Classifier</strong> qui a donné le meilleur score sur le Kaggle, avec les paramètres testés ci-dessus
</center>
<br>
<center>
<strong>Feature Engineering : </strong>Par manque de temps, je n'ai pas pu travailler sur les features ni essayer de modifier les variables explicatives. Cela aurait pu être une piste d'amélioration
</center>