<a href="https://colab.research.google.com/github/Benjamindavid03/MachineLearningLab/blob/main/GBT_in_pySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gradient Boosted Trees in PySpark

In [4]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz
!tar xf spark-3.0.3-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.3-bin-hadoop2.7"
!export SPARK_HOME=/content/spark-3.0.3-bin-hadoop2.7
! echo $SPARK_HOME

/content/spark-3.0.3-bin-hadoop2.7


In [6]:
# findspark is installed by the setup cell; it is used here to make the
# unpacked Spark distribution usable from this Python kernel.
import findspark
# Called without arguments, so no Spark location is passed explicitly here.
# Presumably it picks up the SPARK_HOME value exported via os.environ in the
# setup cell -- run that cell first.
findspark.init()

In [7]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
from pandas import DataFrame, Series

In [8]:
# Load the scikit-learn breast-cancer dataset into a pandas frame, move it
# into Spark, and pack all feature columns into a single vector column.
bc = load_breast_cancer()

df_bc = DataFrame(bc.data, columns=bc.feature_names)
df_bc['label'] = Series(bc.target)
print(df_bc.head())

# getOrCreate is a classmethod: it reuses an already-running context.
# `SparkContext().getOrCreate()` would invoke the constructor first, which
# raises if a context is still active (e.g. when this cell is re-run).
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

data = sqlContext.createDataFrame(df_bc)
# printSchema() prints by itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
data.printSchema()

# Plain Python strings instead of a NumPy array of numpy.str_ values.
features = [str(name) for name in bc.feature_names]

va = VectorAssembler(inputCols=features, outputCol='features')

# Keep only the assembled vector and the label for model training.
assembled = va.transform(data)
va_df = assembled.select(['features', 'label'])
va_df.show(3)

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [9]:
# Train a gradient-boosted tree classifier on a 90/10 split of `va_df`,
# report accuracy and the confusion matrix, then shut the Spark context down.

# Fixed seed so the random split (and therefore the reported metrics) can be
# reproduced across runs.
SEED = 42

# split data into train and test
(train, test) = va_df.randomSplit([0.9, 0.1], seed=SEED)

# training: the unfitted estimator gets its own name; `gbtc` ends up bound to
# the fitted model, exactly as before.
gbt_estimator = GBTClassifier(labelCol="label", maxIter=10, seed=SEED)
gbtc = gbt_estimator.fit(train)

# prediction
pred = gbtc.transform(test)
pred.show(3)

# accuracy check
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
acc = evaluator.evaluate(pred)
print("Prediction Accuracy: ", acc)

# confusion matrix: collect labels and predictions in a single Spark job and
# unwrap the Row objects into plain ints before handing them to scikit-learn.
rows = pred.select("label", "prediction").collect()
y_orig = [int(row["label"]) for row in rows]
y_pred = [int(row["prediction"]) for row in rows]

# Explicit labels keep the matrix 2x2 even if the small test split happens to
# contain only one of the two classes.
cm = confusion_matrix(y_orig, y_pred, labels=[0, 1])
print("Confusion Matrix:")
print(cm)

# Release the Spark context; the data cell recreates it on the next run.
sc.stop()


+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[8.888,14.64,58.7...|    1|[-1.3212085674231...|[0.06645791623833...|       1.0|
|[9.029,17.33,58.7...|    1|[-1.0936976871069...|[0.10088811290760...|       1.0|
|[9.504,12.44,60.3...|    1|[-1.3212085674231...|[0.06645791623833...|       1.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 3 rows

Prediction Accuracy:  0.96
Confusion Matrix:
[[19  2]
 [ 0 29]]
