# **Big Data project: using Spark ML** 

## **Import necessary libraries**

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 39 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 46.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805911 sha256=f896320c17ad86184e8351abc92785a406ccbeb6cf83ea320cb63bbba8d1ab42
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [None]:
import pyspark

# Print the installed PySpark version so the run environment is recorded in the output
print(pyspark.__version__)

3.2.0


## **Load the data**

In [None]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise create it under this app name
spark = (
    SparkSession.builder
    .appName('Bank Churn Classification')
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook renders the session object as this cell's output
spark

In [None]:
path = "/content/drive/MyDrive/ML Projects/Churn Modelling/Churn_Modelling.csv"

# Read the CSV using its first row as the header.
# No schema is supplied or inferred here, so every column is loaded as a string.
df = (
    spark.read
    .format("csv")
    .option('header', 'true')
    .load(path)
)

In [None]:
# Preview the first five rows to check that the header and columns were parsed as expected
df.show(5)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|        0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|    Boni|        699|   France|Female| 39|     1|        0|            2|        0|             0|       93826.63|

## **Exploratory Data Analysis (EDA)**

In [None]:
# Get a statistical description of the data as a pandas DataFrame.
# NOTE: every column is still a string at this point, so the min/max rows are
# lexicographic rather than numeric (e.g. RowNumber's max is reported as 9999).
df.describe().toPandas()

Unnamed: 0,summary,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,count,10000.0,10000.0,10000,10000.0,10000,10000,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
1,mean,5000.5,15690940.5694,,650.5288,,,38.9218,5.0128,76485.88928799961,1.5302,0.7055,0.5151,100090.2398809998,0.2037
2,stddev,2886.8956799071675,71936.18612274907,,96.65329873613037,,,10.487806451704587,2.892174377049684,62397.40520238599,0.5816543579989917,0.4558404644751332,0.4997969284589181,57510.49281769821,0.4027685839948606
3,min,1.0,15565701.0,Abazu,350.0,France,Female,18.0,0.0,0.0,1.0,0.0,0.0,100015.79,0.0
4,max,9999.0,15815690.0,Zuyeva,850.0,Spain,Male,92.0,9.0,99986.98,4.0,1.0,1.0,99984.86,1.0


In [None]:
# List each column's Spark data type as (name, type) pairs
df.dtypes

[('RowNumber', 'string'),
 ('CustomerId', 'string'),
 ('Surname', 'string'),
 ('CreditScore', 'string'),
 ('Geography', 'string'),
 ('Gender', 'string'),
 ('Age', 'string'),
 ('Tenure', 'string'),
 ('Balance', 'string'),
 ('NumOfProducts', 'string'),
 ('HasCrCard', 'string'),
 ('IsActiveMember', 'string'),
 ('EstimatedSalary', 'string'),
 ('Exited', 'string')]

We need to:
- Drop RowNumber, CustomerId, and Surname.
- Convert Age, Tenure, NumOfProducts, HasCrCard, IsActiveMember, and Exited from string to integer.

- Convert CreditScore, EstimatedSalary, and Balance from string to float.

In [None]:
# 1. Remove the identifier columns (row number, customer id, surname); they are not used as features.
identifier_columns = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(*identifier_columns)

df.show(5)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|        0|            1|        1|             1|      101348.88|     1|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        699|   France|Female| 39|     1|        0|            2|        0|             0|       93826.63|     0|
|        850|    Spain|Female| 43|     2|125510.82|            1|        1|             1|        79084.1|     0|
+-----------+---------+------+---+------+---------+-------------+---------+-------------

In [None]:
# 2. Cast Age, Tenure, NumOfProducts, HasCrCard, IsActiveMember and Exited from string to int.
# 3. Cast CreditScore, Balance and EstimatedSalary from string to float.
#    Geography and Gender are kept as strings.

from pyspark.sql.functions import col

# Target type for each column, listed in the output column order (None = keep unchanged)
column_types = {
    'CreditScore': 'float',
    'Geography': None,
    'Gender': None,
    'Age': 'int',
    'Tenure': 'int',
    'Balance': 'float',
    'NumOfProducts': 'int',
    'HasCrCard': 'int',
    'IsActiveMember': 'int',
    'EstimatedSalary': 'float',
    'Exited': 'int',
}

dataset = df.select([
    col(name) if dtype is None else col(name).cast(dtype)
    for name, dtype in column_types.items()
])

dataset.show(5)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|      619.0|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|      608.0|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|      502.0|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|      699.0|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|
|      850.0|    Spain|Female| 43|     2|125510.82|            1|        1|             1|        79084.1|     0|
+-----------+---------+------+---+------+---------+-------------+---------+-------------

In [None]:
# Confirm the casts took effect by listing the column types of the new DataFrame
dataset.dtypes

[('CreditScore', 'float'),
 ('Geography', 'string'),
 ('Gender', 'string'),
 ('Age', 'int'),
 ('Tenure', 'int'),
 ('Balance', 'float'),
 ('NumOfProducts', 'int'),
 ('HasCrCard', 'int'),
 ('IsActiveMember', 'int'),
 ('EstimatedSalary', 'float'),
 ('Exited', 'int')]

In [None]:
# Count the null entries in every column (one output column per input column)

from pyspark.sql.functions import isnull, when, count, col

# when(...) yields the value only for null rows, so count(...) tallies the nulls per column
null_counts = [
    count(when(isnull(name), name)).alias(name)
    for name in dataset.columns
]
dataset.select(null_counts).show()

+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure|Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|          0|        0|     0|  0|     0|      0|            0|        0|             0|              0|     0|
+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+



**We don't have any missing values in the dataset.**

We need to:
- Index the categorical columns (Geography and Gender).

In [None]:
# Replace each categorical string column with a numeric index column via StringIndexer
from pyspark.ml.feature import StringIndexer

# (source column, indexed output column); handleInvalid='keep' is passed to each indexer
for source_col, indexed_col in [('Geography', 'Country'), ('Gender', 'Sex')]:
    indexor = StringIndexer(inputCol=source_col, outputCol=indexed_col, handleInvalid='keep')
    dataset = indexor.fit(dataset).transform(dataset)

dataset.show()

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+
|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Country|Sex|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+
|      619.0|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|    0.0|1.0|
|      608.0|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|    2.0|1.0|
|      502.0|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|    0.0|1.0|
|      699.0|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|    0.0|1.0|
|      850.0|    Spain|Female| 43|     2|125510.82|            1|        1|             1|        79084.1|     0|    2

**Country:**
- France: 0
- Germany: 1
- Spain: 2

**Sex:**
- Male: 0
- Female: 1

In [None]:
# Drop the original string columns now that their indexed versions (Country, Sex) exist
dataset = dataset.drop('Geography', 'Gender')

In [None]:
# Preview the dataset after the categorical columns were replaced by their indices
dataset.show(5)

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+
|CreditScore|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Country|Sex|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+
|      619.0| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|    0.0|1.0|
|      608.0| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|    2.0|1.0|
|      502.0| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|    0.0|1.0|
|      699.0| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|    0.0|1.0|
|      850.0| 43|     2|125510.82|            1|        1|             1|        79084.1|     0|    2.0|1.0|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+
only showing top 5 

# **Full Implementation**

In [None]:
# Pack every predictor column into a single vector column named 'features'
from pyspark.ml.feature import VectorAssembler

# All columns except the label 'Exited'
required_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary",
    "Country",
    "Sex",
]

assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(dataset)

In [None]:
# Inspect the assembled data; the new 'features' column holds the packed predictor vector
transformed_data.show()

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+--------------------+
|CreditScore|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Country|Sex|            features|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+--------------------+
|      619.0| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|    0.0|1.0|[619.0,42.0,2.0,0...|
|      608.0| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|    2.0|1.0|[608.0,41.0,1.0,8...|
|      502.0| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|    0.0|1.0|[502.0,42.0,8.0,1...|
|      699.0| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|    0.0|1.0|[699.0,39.0,1.0,0...|
|      850.0| 43|     2|125510.82|            1|        1|             1|        79084.1| 

In [None]:
# Split the data 80/20 into train and test sets.
# A fixed seed keeps the split (and therefore the reported metrics) the same on re-run.
(training_data, test_data) = transformed_data.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [None]:
def CV4BestModel(model, params, train, num_folds=5, seed=42):
    """Grid-search `model` with k-fold cross-validation and return the best fitted model.

    Args:
        model: Estimator to tune (e.g. a classifier).
        params: Parameter grid, as built by ``ParamGridBuilder().build()``.
        train: Training DataFrame; must contain the label column 'Exited'.
        num_folds: Number of cross-validation folds (defaults to 5, as before).
        seed: Seed for the fold assignment so the selected model is reproducible.
            Pass None to leave the folds unseeded.

    Returns:
        The ``bestModel`` of the fitted CrossValidator.
    """
    # Candidates are scored with the multiclass evaluator's default metric.
    cv_kwargs = dict(
        estimator=model,
        estimatorParamMaps=params,
        evaluator=MulticlassClassificationEvaluator(labelCol='Exited', predictionCol='prediction'),
        numFolds=num_folds,
    )
    # Only forward the seed when one is given, so None keeps CrossValidator's own default.
    if seed is not None:
        cv_kwargs['seed'] = seed
    crossval = CrossValidator(**cv_kwargs)

    # Run cross-validation and keep the best set of parameters.
    cvModel = crossval.fit(train)

    return cvModel.bestModel

In [None]:
def make_pred(model, test):
    """Apply the fitted `model` to `test` and return the resulting DataFrame."""
    predictions = model.transform(test)
    return predictions

In [None]:
def evaluation(test, preds):
    """Compute accuracy, F1 and ROC AUC for a predictions DataFrame.

    Args:
        test: Unused; kept so existing callers do not need to change.
        preds: DataFrame produced by a fitted model's ``transform``; must contain
            'Exited', 'prediction' and either 'probability' or 'rawPrediction'.

    Returns:
        Tuple ``(accuracy, f1, auc)``.
    """
    # Rank by class probability when the model provides it. For decision trees
    # (leaf class counts) and naive Bayes (per-class log-likelihoods) the class-1
    # entry of 'rawPrediction' is not a valid ranking score and gives a misleading AUC.
    # Models without a probability column (e.g. LinearSVC) fall back to 'rawPrediction'.
    score_col = 'probability' if 'probability' in preds.columns else 'rawPrediction'

    # For the AUC, we need the Binary Evaluator
    binEvaluator = BinaryClassificationEvaluator(rawPredictionCol=score_col, labelCol='Exited')
    auc = binEvaluator.evaluate(preds, {binEvaluator.metricName: "areaUnderROC"})

    # For the F1 and Accuracy, we need the MultiClass Evaluator
    evaluator = MulticlassClassificationEvaluator(labelCol='Exited', predictionCol='prediction')
    f1 = evaluator.setMetricName("f1").evaluate(preds)
    acc = evaluator.setMetricName("accuracy").evaluate(preds)

    return acc, f1, auc

In [None]:
def DoALL(model, train, test, params):
    """Tune `model` on `train`, score the best candidate on `test`, and print the metrics.

    Runs CV4BestModel, make_pred and evaluation in sequence and prints accuracy,
    F1 and AUC. Nothing is returned.
    """
    print(f"### ----------- {model} ----------- ###")

    # Tune -> predict -> score
    best_model = CV4BestModel(model, params, train)
    predictions = make_pred(best_model, test)
    accuracy, f1_score, roc_auc = evaluation(test, predictions)

    # Same report lines, in the same order, as separate print calls
    report = [
        ("\nACCURACY: ", accuracy),
        ("\nF1-Score: ", f1_score),
        ("\nAUC: ", roc_auc),
    ]
    for heading, value in report:
        print(heading, value)
    print("\n### ############# DONE ! ############# ###\n")

## **Without Data scaling**

### **Logistic Regression**

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the 'features' vector
lr = LogisticRegression(labelCol='Exited', featuresCol='features', maxIter=500)

# Grid: 3 regularization strengths x 2 elastic-net mixing values = 6 candidates
lr_params = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.5, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 1.0])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for logistic regression
DoALL(lr, training_data, test_data, lr_params)

### ----------- LogisticRegression_4af94b67d91b ----------- ###

ACCURACY:  0.8058058058058059

F1-Score:  0.7550955025086846

AUC:  0.7458885071375282

### ############# DONE ! ############# ###



### **Decision Tree**

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Single decision tree on the 'features' vector
DT = DecisionTreeClassifier(labelCol='Exited', featuresCol='features')

# Grid: 4 tree depths x 2 impurity measures = 8 candidates
DT_params = (
    ParamGridBuilder()
    .addGrid(DT.maxDepth, [3, 5, 7, 10])
    .addGrid(DT.impurity, ['gini', 'entropy'])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for the decision tree
DoALL(DT, training_data, test_data, DT_params)

### ----------- DecisionTreeClassifier_bfef9f4ffcea ----------- ###

ACCURACY:  0.8593593593593594

F1-Score:  0.8447150037238628

AUC:  0.4981518021270757

### ############# DONE ! ############# ###



### **Random Forest**

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the 'features' vector
rf = RandomForestClassifier(labelCol='Exited', featuresCol='features')

# Grid: 4 depths x 2 impurities x 3 forest sizes x 2 subset strategies = 48 candidates
rf_params = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [3, 5, 7, 10])
    .addGrid(rf.impurity, ['gini', 'entropy'])
    .addGrid(rf.numTrees, [50, 100, 150])
    .addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt'])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for the random forest
DoALL(rf, training_data, test_data, rf_params)

### ----------- RandomForestClassifier_035e6783a438 ----------- ###

ACCURACY:  0.8578578578578578

F1-Score:  0.8429401199420573

AUC:  0.8493906542973106

### ############# DONE ! ############# ###



### **Gradient Boosting**

In [None]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees on the 'features' vector
gb = GBTClassifier(labelCol='Exited', featuresCol='features')

# Grid: 4 depths x 2 step sizes x 2 subset strategies = 16 candidates
gb_params = (
    ParamGridBuilder()
    .addGrid(gb.maxDepth, [3, 5, 7, 10])
    .addGrid(gb.stepSize, [0.1, 0.7])
    .addGrid(gb.featureSubsetStrategy, ['log2', 'sqrt'])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for gradient boosting
DoALL(gb, training_data, test_data, gb_params)

### ----------- GBTClassifier_59e0638226e0 ----------- ###

ACCURACY:  0.8568568568568569

F1-Score:  0.8451882879433641

AUC:  0.8534695679770479

### ############# DONE ! ############# ###



### **SVC**

In [None]:
from pyspark.ml.classification import LinearSVC

# Linear support vector classifier on the 'features' vector
svc = LinearSVC(labelCol='Exited', featuresCol='features', maxIter=500)

# Grid: 4 regularization strengths x 3 decision thresholds = 12 candidates
svc_params = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [0.0, 0.5, 0.01, 0.1])
    .addGrid(svc.threshold, [0.5, 0.7, 0.8])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for the linear SVC
DoALL(svc, training_data, test_data, svc_params)

### ----------- LinearSVC_595e2af5b197 ----------- ###

ACCURACY:  0.7942942942942943

F1-Score:  0.7032329400530237

AUC:  0.7197163081423456

### ############# DONE ! ############# ###



### **Naive Bayes**

In [None]:
from pyspark.ml.classification import NaiveBayes

# Multinomial naive Bayes on the 'features' vector
nb = NaiveBayes(labelCol='Exited', featuresCol='features', modelType="multinomial")

# Grid: 4 smoothing values
nb_params = (
    ParamGridBuilder()
    .addGrid(nb.smoothing, [0.0, 0.1, 0.3, 1.0])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for naive Bayes
DoALL(nb, training_data, test_data, nb_params)

### ----------- NaiveBayes_8b15da0796ad ----------- ###

ACCURACY:  0.541041041041041

F1-Score:  0.5843251862069033

AUC:  0.43790101141114635

### ############# DONE ! ############# ###



## **With Data scaling**

### **Prepare the data**

In [None]:
# Show the dataset as it stands before any scaled columns are added, for comparison
print("Before Scaling:")
dataset.show(5)

Before Scaling:
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+
|CreditScore|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Country|Sex|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+
|      619.0| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|    0.0|1.0|
|      608.0| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|    2.0|1.0|
|      502.0| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|    0.0|1.0|
|      699.0| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|    0.0|1.0|
|      850.0| 43|     2|125510.82|            1|        1|             1|        79084.1|     0|    2.0|1.0|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+
onl

In [None]:
# Scale the dataframe: add min-max scaled copies of Balance and EstimatedSalary
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# UDF that unpacks a one-element vector into a plain double, rounded to 3 decimals
unlist = udf(lambda x: round(float(list(x)[0]), 3), DoubleType())

# NOTE(review): the scaler is fitted on the full dataset, before the train/test split below.

# Loop variable is deliberately not called `col`, so it does not shadow the
# `col` function imported from pyspark.sql.functions earlier in the notebook.
for col_name in ["Balance", "EstimatedSalary"]:
    vect_col = col_name + "_Vect"
    scaled_col = col_name + "_Scaled"

    # Drop leftovers from a previous run of this cell so it can be re-executed safely
    # (dropping a column that does not exist is a no-op).
    unscaled = dataset.drop(vect_col, scaled_col)

    # MinMaxScaler works on vector columns, so wrap the column in a 1-element vector first
    assembler = VectorAssembler(inputCols=[col_name], outputCol=vect_col)
    scaler = MinMaxScaler(inputCol=vect_col, outputCol=scaled_col)
    pipeline = Pipeline(stages=[assembler, scaler])

    # Fit and apply the pipeline, unpack the scaled vector to a double, drop the helper column
    dataset = (
        pipeline.fit(unscaled).transform(unscaled)
        .withColumn(scaled_col, unlist(scaled_col))
        .drop(vect_col)
    )

print("After Scaling:")
dataset.show(5)

After Scaling:
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+--------------+----------------------+
|CreditScore|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Country|Sex|Balance_Scaled|EstimatedSalary_Scaled|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+--------------+----------------------+
|      619.0| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|    0.0|1.0|           0.0|                 0.507|
|      608.0| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|    2.0|1.0|         0.334|                 0.563|
|      502.0| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|    0.0|1.0|         0.636|                  0.57|
|      699.0| 39|     1|      0.0|            2|        0|             0|       93826.63|     0|    0.0

In [None]:
# Pack the predictors into one vector column, using the scaled Balance / EstimatedSalary.
# The output column is 'Features' (capital F), which the models below reference.
from pyspark.ml.feature import VectorAssembler

required_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance_Scaled",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary_Scaled",
    "Country",
    "Sex",
]

assembler = VectorAssembler(inputCols=required_features, outputCol='Features')
scaled_data = assembler.transform(dataset)

In [None]:
# Inspect the assembled data; 'Features' holds the vector built from the scaled columns
scaled_data.show()

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+--------------+----------------------+--------------------+
|CreditScore|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Country|Sex|Balance_Scaled|EstimatedSalary_Scaled|            Features|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+-------+---+--------------+----------------------+--------------------+
|      619.0| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|    0.0|1.0|           0.0|                 0.507|[619.0,42.0,2.0,0...|
|      608.0| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|    2.0|1.0|         0.334|                 0.563|[608.0,41.0,1.0,0...|
|      502.0| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|    0.0|1.0|         0.636|                  0.57|[502.0,42.0,8.

In [None]:
# Split the data 80/20 into train and test sets.
# A fixed seed keeps the split (and therefore the reported metrics) the same on re-run.
training_data, test_data = scaled_data.randomSplit([0.8, 0.2], seed=42)

### **Modelling**

#### **Logistic Regression**

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the 'Features' vector (scaled columns)
lr = LogisticRegression(labelCol='Exited', featuresCol='Features', maxIter=500)

# Grid: 3 regularization strengths x 2 elastic-net mixing values = 6 candidates
lr_params = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.5, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 1.0])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for logistic regression
DoALL(lr, training_data, test_data, lr_params)

### ----------- LogisticRegression_d9b9632b0ea9 ----------- ###

ACCURACY:  0.8087431693989071

F1-Score:  0.7604722146670073

AUC:  0.7559753710525088

### ############# DONE ! ############# ###



#### **Decision Tree**

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Single decision tree on the 'Features' vector (scaled columns)
DT = DecisionTreeClassifier(labelCol='Exited', featuresCol='Features')

# Grid: 4 tree depths x 2 impurity measures = 8 candidates
DT_params = (
    ParamGridBuilder()
    .addGrid(DT.maxDepth, [3, 5, 7, 10])
    .addGrid(DT.impurity, ['gini', 'entropy'])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for the decision tree
DoALL(DT, training_data, test_data, DT_params)

### ----------- DecisionTreeClassifier_7d15986fdbcf ----------- ###

ACCURACY:  0.8455042225534029

F1-Score:  0.8258389342574138

AUC:  0.5439914923785892

### ############# DONE ! ############# ###



#### **Random Forest**

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the 'Features' vector (scaled columns)
rf = RandomForestClassifier(labelCol='Exited', featuresCol='Features')

# Grid: 4 depths x 2 impurities x 3 forest sizes x 2 subset strategies = 48 candidates
rf_params = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [3, 5, 7, 10])
    .addGrid(rf.impurity, ['gini', 'entropy'])
    .addGrid(rf.numTrees, [50, 100, 150])
    .addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt'])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for the random forest
DoALL(rf, training_data, test_data, rf_params)

### ----------- RandomForestClassifier_faa00a4f817e ----------- ###

ACCURACY:  0.8544461003477397

F1-Score:  0.8350999344338438

AUC:  0.8483231354900334

### ############# DONE ! ############# ###



#### **Gradient Boosting**

In [None]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees on the 'Features' vector (scaled columns)
gb = GBTClassifier(labelCol='Exited', featuresCol='Features')

# Grid: 4 depths x 2 step sizes x 2 subset strategies = 16 candidates
gb_params = (
    ParamGridBuilder()
    .addGrid(gb.maxDepth, [3, 5, 7, 10])
    .addGrid(gb.stepSize, [0.1, 0.7])
    .addGrid(gb.featureSubsetStrategy, ['log2', 'sqrt'])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for gradient boosting
DoALL(gb, training_data, test_data, gb_params)

### ----------- GBTClassifier_9abdcb6334d9 ----------- ###

ACCURACY:  0.8574267262791853

F1-Score:  0.845302418042258

AUC:  0.8542229859901651

### ############# DONE ! ############# ###



#### **SVC**

In [None]:
from pyspark.ml.classification import LinearSVC

# Linear support vector classifier on the 'Features' vector (scaled columns)
svc = LinearSVC(labelCol='Exited', featuresCol='Features', maxIter=500)

# Grid: 4 regularization strengths x 3 decision thresholds = 12 candidates
svc_params = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [0.0, 0.5, 0.01, 0.1])
    .addGrid(svc.threshold, [0.5, 0.7, 0.8])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for the linear SVC
DoALL(svc, training_data, test_data, svc_params)

### ----------- LinearSVC_aa53ae315497 ----------- ###

ACCURACY:  0.7998012916045703

F1-Score:  0.7108363673659167

AUC:  0.6848041859963327

### ############# DONE ! ############# ###



#### **Naive Bayes**

In [None]:
from pyspark.ml.classification import NaiveBayes

# Multinomial naive Bayes on the 'Features' vector (scaled columns)
nb = NaiveBayes(labelCol='Exited', featuresCol='Features', modelType="multinomial")

# Grid: 4 smoothing values
nb_params = (
    ParamGridBuilder()
    .addGrid(nb.smoothing, [0.0, 0.1, 0.3, 1.0])
    .build()
)

In [None]:
# Cross-validate, predict on the test split and print the metrics for naive Bayes
DoALL(nb, training_data, test_data, nb_params)

### ----------- NaiveBayes_b20f0eed52ba ----------- ###

ACCURACY:  0.7575757575757576

F1-Score:  0.7545062454489442

AUC:  0.34309372254673814

### ############# DONE ! ############# ###

