# Default Parameters

In [1]:
filename = "bank-full.csv"
target_variable_name = "y"

# Load Dataset

In [2]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# The file is read as semicolon-delimited with a header row; column types are inferred from the values.
data = spark.read.csv(filename, header=True, inferSchema=True, sep=';')
# Preview the loaded rows.
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

# Length of the data

In [3]:
data.count()

45211

# Describe data

In [4]:
data.describe().toPandas()

Unnamed: 0,summary,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
1,mean,40.93621021432837,,,,,1362.2720576850766,,,,15.80641879188693,,258.1630797814691,2.763840658246887,40.19782796222158,0.5803233726305546,,
2,stddev,10.6187620409754,,,,,3044.7658291685243,,,,8.322476153044589,,257.5278122651712,3.0980208832791813,100.12874599059818,2.3034410449312164,,
3,min,18.0,admin.,divorced,primary,no,-8019.0,no,no,cellular,1.0,apr,0.0,1.0,-1.0,0.0,failure,no
4,max,95.0,unknown,single,unknown,yes,102127.0,yes,yes,unknown,31.0,sep,4918.0,63.0,871.0,275.0,unknown,yes


# Check Data types of each column

In [129]:
# NOTE(review): this cell's execution count (129) is out of sequence and its recorded output shows
# numeric codes (0.0/1.0/2.0), i.e. it was executed after `marital` had been index-encoded further
# down. At this position in the notebook `marital` is still a string column.
data.groupby('marital').count().show()

+-------+-----+
|marital|count|
+-------+-----+
|    0.0|27214|
|    1.0|12790|
|    2.0| 5207|
+-------+-----+



In [5]:
data.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [6]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [7]:
# NOTE(review): the wildcard import puts every pyspark.sql.functions name into the notebook
# namespace; the dict-style aggregation below does not use any of them.
from pyspark.sql.functions import * 
# Average balance and average age for each value of the target column.
data.groupBy(target_variable_name).agg({'balance':'avg', 'age': 'avg'}).show()

+---+------------------+------------------+
|  y|      avg(balance)|          avg(age)|
+---+------------------+------------------+
| no|1303.7149691899203| 40.83898602274435|
|yes|1804.2679145396105|41.670069956513515|
+---+------------------+------------------+



# Cardinality Check

In [8]:
from pyspark.sql.functions import approxCountDistinct, countDistinct

"""
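Note: approxCountDistinct and countDistinct both count the distinct values of a column.
approxCountDistinct returns a fast estimate, whereas countDistinct returns the exact count
at a higher computation cost, so the two results can differ slightly.

"approxCountDistinct" is useful for large datasets.
"countDistinct" is better suited to small and medium datasets.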

"""

def cardinality_calculation(df, cut_off=1):
    """Estimate the number of distinct values in every column of a Spark DataFrame.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Data whose columns are profiled.
    cut_off : int, default 1
        Columns whose estimated cardinality is less than or equal to this value are selected.

    Returns
    -------
    final_cardinality_df : pandas.DataFrame
        One row per column of ``df`` with the columns 'index' (column name) and 'Cardinality'.
    vars_selected : pandas.Series
        Names of the columns whose cardinality is <= ``cut_off``.

    Notes
    -----
    The counts are approximate, so they can deviate slightly from the exact distinct count.
    """
    # approx_count_distinct is the supported name of the deprecated approxCountDistinct alias;
    # it is imported locally so the function does not depend on the notebook's import cell.
    from pyspark.sql.functions import approx_count_distinct

    cardinality = df.select(*[approx_count_distinct(c).alias(c) for c in df.columns])

    # the Spark result is a single row: move it to pandas and turn every column into a row
    final_cardinality_df = (
        cardinality.toPandas()
        .transpose()
        .reset_index()
        .rename(columns={0: 'Cardinality'})
    )

    # select variables whose cardinality does not exceed the cut-off (1 by default)
    low_cardinality = final_cardinality_df['Cardinality'] <= cut_off
    vars_selected = final_cardinality_df['index'][low_cardinality]

    return final_cardinality_df, vars_selected

cardinality_df, cardinality_vars_selected = cardinality_calculation(data)

In [9]:
cardinality_df

Unnamed: 0,index,Cardinality
0,age,76
1,job,11
2,marital,3
3,education,4
4,default,2
5,balance,7375
6,housing,2
7,loan,2
8,contact,3
9,day,32


In [10]:
cardinality_vars_selected

Series([], Name: index, dtype: object)

# Missing value check

In [11]:
#missing values check
from pyspark.sql.functions import count, when, isnan, col

# miss_percentage is set to 80% as discussed in the book
def missing_calculation(df, miss_percentage=0.80):
    """Count missing values per column and flag the columns that are mostly missing.

    A value is missing when it is null; for float/double columns NaN is counted as well.
    Columns of any other type (including strings) are checked for null only.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Data whose columns are profiled.
    miss_percentage : float, default 0.80
        Fraction of missing rows (0-1) at or above which a column is selected.

    Returns
    -------
    final_missing_df : pandas.DataFrame
        One row per column of ``df`` with 'index' (column name), 'missing_count' and
        'missing_percentage' (missing_count divided by the row count).
    vars_selected : pandas.Series
        Names of the columns whose missing fraction is >= ``miss_percentage``.
    """
    # isnan() is only defined for floating-point input; asking Spark to apply it to other
    # column types forces a cast and raises for types such as date/timestamp.
    float_cols = {name for name, dtype in df.dtypes if dtype in ('float', 'double')}

    def _missing_condition(name):
        condition = col(name).isNull()
        if name in float_cols:
            condition = condition | isnan(col(name))
        return condition

    # when(...) without otherwise() yields null for non-matching rows, so count() counts matches only
    missing = df.select(*[count(when(_missing_condition(c), c)).alias(c) for c in df.columns])
    length_df = df.count()

    # the Spark result is a single row: move it to pandas and turn every column into a row
    final_missing_df = (
        missing.toPandas()
        .transpose()
        .reset_index()
        .rename(columns={0: 'missing_count'})
    )
    final_missing_df['missing_percentage'] = final_missing_df['missing_count'] / length_df

    # select variables whose missing fraction reaches the threshold
    mostly_missing = final_missing_df['missing_percentage'] >= miss_percentage
    vars_selected = final_missing_df['index'][mostly_missing]

    return final_missing_df, vars_selected

In [12]:
missing_df, missing_vars_selected = missing_calculation(data)

In [13]:
missing_df

Unnamed: 0,index,missing_count,missing_percentage
0,age,0,0.0
1,job,0,0.0
2,marital,0,0.0
3,education,0,0.0
4,default,0,0.0
5,balance,0,0.0
6,housing,0,0.0
7,loan,0,0.0
8,contact,0,0.0
9,day,0,0.0


In [14]:
missing_vars_selected

Series([], Name: index, dtype: object)

# Identify variable types

In [15]:
def variable_type(df):
    """Split the columns of a DataFrame into string and non-string variables.

    Parameters
    ----------
    df : object with a ``dtypes`` attribute
        ``dtypes`` is an iterable of (column_name, type_name) pairs, as provided by a
        Spark DataFrame.

    Returns
    -------
    char_vars : list of str
        Names of the columns whose type name is exactly 'string'.
    num_vars : list of str
        Names of all remaining columns, in their original order.
    """
    char_vars = []
    num_vars = []
    for name, dtype in df.dtypes:
        # Compare for equality: `dtype in ('string')` is a substring test against the text
        # 'string' (the parentheses do not create a tuple), so any fragment of that word matched.
        if dtype == 'string':
            char_vars.append(name)
        else:
            num_vars.append(name)

    return char_vars, num_vars

In [16]:
char_vars, num_vars = variable_type(data)

In [17]:
char_vars

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'y']

In [18]:
num_vars

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [19]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

def category_to_index(df, char_vars):
    """Append a numeric '<column>_index' column for every column listed in ``char_vars``.

    Returns the transformed DataFrame together with the fitted pipeline model, so the same
    encoding can be applied to other data later.
    """
    categorical_only = df.select(char_vars)

    # one indexer per categorical column; unseen labels are kept instead of raising
    stages = []
    for name in categorical_only.columns:
        stages.append(
            StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid="keep")
        )

    fitted_indexers = Pipeline(stages=stages).fit(categorical_only)
    indexed_df = fitted_indexers.transform(df)
    return indexed_df, fitted_indexers

In [20]:
data, char_labels = category_to_index(data, char_vars)

In [21]:
data.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string'),
 ('job_index', 'double'),
 ('marital_index', 'double'),
 ('education_index', 'double'),
 ('default_index', 'double'),
 ('housing_index', 'double'),
 ('loan_index', 'double'),
 ('contact_index', 'double'),
 ('month_index', 'double'),
 ('poutcome_index', 'double'),
 ('y_index', 'double')]

In [22]:
data = data.select([c for c in data.columns if c not in char_vars])

In [23]:
def rename_columns(df, char_vars):
    """Rename every '<name>_index' column back to '<name>' for the names in ``char_vars``.

    Columns that are not an indexed version of a name in ``char_vars`` keep their name.
    """
    original_name = {name + '_index': name for name in char_vars}

    renamed = []
    for current in df.columns:
        renamed.append(col(current).alias(original_name.get(current, current)))

    return df.select(renamed)

In [24]:
data = rename_columns(data, char_vars)

In [25]:
data.dtypes

[('age', 'int'),
 ('balance', 'int'),
 ('day', 'int'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('job', 'double'),
 ('marital', 'double'),
 ('education', 'double'),
 ('default', 'double'),
 ('housing', 'double'),
 ('loan', 'double'),
 ('contact', 'double'),
 ('month', 'double'),
 ('poutcome', 'double'),
 ('y', 'double')]

In [26]:
data.groupBy('y').count().show() 

+---+-----+
|  y|count|
+---+-----+
|0.0|39922|
|1.0| 5289|
+---+-----+



In [27]:
linear_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'])

In [28]:
linear_df

DataFrame[age: int, balance: int, day: int, duration: int, campaign: int, pdays: int, previous: int]

In [29]:
target_variable_name = 'balance'

# Assemble input vectors

In [30]:
from pyspark.ml.feature import VectorAssembler

#assemble individual columns to one column - 'features'
def assemble_vectors(df, features_list, target_variable_name):
    """Pack the columns in ``features_list`` into a single vector column named 'features'.

    The returned DataFrame holds the target column, the new 'features' column and the
    individual feature columns, in that order.
    """
    # a one-stage pipeline whose only job is to build the 'features' vector
    vector_stage = VectorAssembler(inputCols=features_list, outputCol='features')
    fitted_pipeline = Pipeline(stages=[vector_stage]).fit(df)

    # target first, then the assembled vector, then the raw feature columns
    columns_to_keep = [target_variable_name, 'features'] + features_list

    return fitted_pipeline.transform(df).select(columns_to_keep)

In [31]:
#exclude target variable and select all other feature vectors
features_list = linear_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)

In [32]:
features_list

['age', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [33]:
# apply the function on our dataframe
df = assemble_vectors(linear_df, features_list, target_variable_name)

In [34]:
df.show()

+-------+--------------------+---+---+--------+--------+-----+--------+
|balance|            features|age|day|duration|campaign|pdays|previous|
+-------+--------------------+---+---+--------+--------+-----+--------+
|   2143|[58.0,5.0,261.0,1...| 58|  5|     261|       1|   -1|       0|
|     29|[44.0,5.0,151.0,1...| 44|  5|     151|       1|   -1|       0|
|      2|[33.0,5.0,76.0,1....| 33|  5|      76|       1|   -1|       0|
|   1506|[47.0,5.0,92.0,1....| 47|  5|      92|       1|   -1|       0|
|      1|[33.0,5.0,198.0,1...| 33|  5|     198|       1|   -1|       0|
|    231|[35.0,5.0,139.0,1...| 35|  5|     139|       1|   -1|       0|
|    447|[28.0,5.0,217.0,1...| 28|  5|     217|       1|   -1|       0|
|      2|[42.0,5.0,380.0,1...| 42|  5|     380|       1|   -1|       0|
|    121|[58.0,5.0,50.0,1....| 58|  5|      50|       1|   -1|       0|
|    593|[43.0,5.0,55.0,1....| 43|  5|      55|       1|   -1|       0|
|    270|[41.0,5.0,222.0,1...| 41|  5|     222|       1|   -1|  

# Linear Regression

In [35]:
from pyspark.ml.regression import LinearRegression
reg = LinearRegression(featuresCol='features', labelCol='balance')
reg_model = reg.fit(df) # fit model

In [36]:
import pandas as pd

# The 'features' metadata lists the assembled columns (idx, name) under one or more group keys.
# Collect every group - not only the last one iterated - and order the rows by vector position
# so that row i lines up with coefficient i.
attr_groups = df.schema["features"].metadata["ml_attr"]["attrs"]
features_df = (
    pd.DataFrame([attr for group in attr_groups.values() for attr in group])
    .sort_values("idx")
    .reset_index(drop=True)
)

# print coefficient and intercept
print(reg_model.coefficients, reg_model.intercept)

features_df['coefficients'] = reg_model.coefficients

[28.08397290892997,3.3055463619496286,0.24882841970901756,-14.142676297161454,-0.08248810233032043,23.462992800762525] 124.92130092818479


In [37]:
features_df

Unnamed: 0,idx,name,coefficients
0,0,age,28.083973
1,1,day,3.305546
2,2,duration,0.248828
3,3,campaign,-14.142676
4,4,pdays,-0.082488
5,5,previous,23.462993


In [38]:
#prediction result
pred_result = reg_model.transform(df)

In [39]:
pred_result

DataFrame[balance: int, features: vector, age: int, day: int, duration: int, campaign: int, pdays: int, previous: int, prediction: double]

In [40]:
reg_model.summary.r2

0.01056811651155165

In [41]:
features_list

['age', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Variance Inflation factor

In [42]:
def vif_calculator(df, features_list):
    """Compute the variance inflation factor (VIF) of every feature.

    Each feature in turn is used as the label of a linear regression on all the other
    features; its VIF is 1 / (1 - R^2) of that fit.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Data containing every column named in ``features_list``.
    features_list : sequence of str
        Names of the feature columns.

    Returns
    -------
    list of float
        One VIF per feature, in the order of ``features_list``. A feature that is explained
        perfectly by the others (R^2 == 1) gets ``float('inf')``.
    """
    vif_list = []
    for target in features_list:
        # regress the current feature on all remaining ones
        predictors = list(features_list)
        predictors.remove(target)
        assembler = VectorAssembler(inputCols=predictors, outputCol='features')
        temp_df = assembler.transform(df)
        reg = LinearRegression(featuresCol='features', labelCol=target)
        r2 = reg.fit(temp_df).summary.r2
        # R^2 == 1 would divide by zero; the VIF is unbounded in that case
        vif_list.append(float('inf') if r2 >= 1 else 1 / (1 - r2))
    return vif_list

In [43]:
features_df['vif'] = vif_calculator(linear_df, features_list)

In [44]:
features_df

Unnamed: 0,idx,name,coefficients,vif
0,0,age,28.083973,1.000917
1,1,day,3.305546,1.03435
2,2,duration,0.248828,1.007627
3,3,campaign,-14.142676,1.039907
4,4,pdays,-0.082488,1.276182
5,5,previous,23.462993,1.261321


# Logistic Regression

In [45]:
target_variable_name = "y"
logistic_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y'])
#exclude target variable and select all other feature vectors
features_list = logistic_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)
# apply the function on our dataframe
df = assemble_vectors(logistic_df, features_list, target_variable_name)

In [46]:
from pyspark.ml.classification import LogisticRegression
binary_clf = LogisticRegression(featuresCol='features', labelCol='y', family='binomial')
multinomial_clf = LogisticRegression(featuresCol='features', labelCol='y', family='multinomial')
binary_clf_model = binary_clf.fit(df) # fit binary model
multinomial_clf_model = multinomial_clf.fit(df) # fit multinomial model

In [47]:
import numpy as np
np.set_printoptions(precision=3, suppress=True)
binary_clf_model.coefficients

DenseVector([0.008, 0.0, -0.0017, 0.0036, -0.128, 0.0021, 0.0859])

In [48]:
np.set_printoptions(precision=4, suppress=True)
print(multinomial_clf_model.coefficientMatrix)

DenseMatrix([[-0.004 , -0.    ,  0.0008, -0.0018,  0.064 , -0.0011, -0.043 ],
             [ 0.004 ,  0.    , -0.0008,  0.0018, -0.064 ,  0.0011,  0.043 ]])


In [49]:
binary_clf_model.intercept

-3.4699010654247706

In [50]:
print(multinomial_clf_model.interceptVector)

[1.7349520795817952,-1.7349520795817952]


# Decision Trees

In [51]:
# Same preparation as in the logistic regression section, but the assembled frame is stored in
# binary_df; the next cell builds continuous_df for the regression tree.
target_variable_name = "y"
logistic_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y'])
#exclude target variable and select all other feature vectors
features_list = logistic_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)
# apply the function on our dataframe
binary_df = assemble_vectors(logistic_df, features_list, target_variable_name)

In [52]:
target_variable_name = "balance"
linear_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'])
#exclude target variable and select all other feature vectors
features_list = linear_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)
# apply the function on our dataframe
continuous_df = assemble_vectors(linear_df, features_list, target_variable_name)

In [53]:
from pyspark.ml.classification import DecisionTreeClassifier

clf = DecisionTreeClassifier(featuresCol='features', labelCol='y', impurity='gini')
clf_model = clf.fit(binary_df)
clf2 = DecisionTreeClassifier(featuresCol='features', labelCol='y', impurity='entropy')
clf_model2 = clf2.fit(binary_df)

In [54]:
print(clf_model.featureImportances)

(7,[0,2,3,4,5],[0.061080523824779846,0.001401144004739516,0.725563927693075,0.0006452473752840746,0.2113091571021216])


In [55]:
print(clf_model2.featureImportances)

(7,[0,2,3,4,5],[0.016525034911207683,0.0008843867764161639,0.7259910167111736,0.00047544455167873026,0.25612411704952387])


In [56]:
from pyspark.ml.regression import DecisionTreeRegressor

reg = DecisionTreeRegressor(featuresCol='features', labelCol='balance', impurity='variance')
reg_model = reg.fit(continuous_df)

In [57]:
clf_model.toDebugString

'DecisionTreeClassificationModel (uid=DecisionTreeClassifier_c36bddc53faa) of depth 5 with 37 nodes\n  If (feature 3 <= 489.5)\n   If (feature 5 <= 9.5)\n    If (feature 0 <= 60.5)\n     Predict: 0.0\n    Else (feature 0 > 60.5)\n     If (feature 3 <= 129.5)\n      Predict: 0.0\n     Else (feature 3 > 129.5)\n      If (feature 3 <= 206.5)\n       Predict: 0.0\n      Else (feature 3 > 206.5)\n       Predict: 1.0\n   Else (feature 5 > 9.5)\n    If (feature 3 <= 180.5)\n     Predict: 0.0\n    Else (feature 3 > 180.5)\n     If (feature 5 <= 190.5)\n      If (feature 5 <= 96.5)\n       Predict: 1.0\n      Else (feature 5 > 96.5)\n       Predict: 0.0\n     Else (feature 5 > 190.5)\n      Predict: 0.0\n  Else (feature 3 > 489.5)\n   If (feature 3 <= 691.0)\n    If (feature 5 <= 8.5)\n     If (feature 3 <= 576.5)\n      If (feature 0 <= 60.5)\n       Predict: 0.0\n      Else (feature 0 > 60.5)\n       Predict: 1.0\n     Else (feature 3 > 576.5)\n      Predict: 0.0\n    Else (feature 5 > 8.5)\n

## String to Json parser

In [58]:
def _branch_condition(line):
    """Return the condition of an 'If (...)' / 'Else (...)' line without keyword and parentheses."""
    return ' '.join(line.split()[1:]).replace('(', '').replace(')', '')


def parse(lines):
    """Turn the If/Else/Predict lines of a tree debug string into nested dictionaries.

    Parameters
    ----------
    lines : list of str
        Whitespace-stripped lines of the tree body. The list is consumed while parsing:
        every handled line is popped from the front.

    Returns
    -------
    list of dict
        ``{'id': <condition>, 'children': [...]}`` for every If/Else branch and
        ``{'id': <line>}`` for every other line (e.g. 'Predict: 0.0').
    """
    block = []
    while lines:
        if lines[0].startswith('If'):
            condition = _branch_condition(lines.pop(0))
            block.append({'id': condition, 'children': parse(lines)})

            # The matching 'Else' normally follows the If-subtree; check that a line is left
            # so a truncated tree string does not raise IndexError.
            if lines and lines[0].startswith('Else'):
                condition = _branch_condition(lines.pop(0))
                block.append({'id': condition, 'children': parse(lines)})
        elif not lines[0].startswith(('If', 'Else')):
            # leaf line
            block.append({'id': lines.pop(0)})
        else:
            # an 'Else' at this point belongs to an 'If' handled by the caller
            break
    return block

def tree_json(tree):
    """Convert a tree debug string into a nested dictionary rooted at 'Root'.

    Parameters
    ----------
    tree : str
        Multi-line debug string of a tree model. The first line is skipped and the
        following lines are handed to ``parse``.

    Returns
    -------
    dict
        ``{'id': 'Root', 'children': [...]}`` where the children are the nodes returned
        by ``parse``.
    """
    tree_lines = []
    for line in tree.splitlines():
        stripped = line.strip()
        if not stripped:
            # the tree text ends at the first blank line
            break
        tree_lines.append(stripped)

    # tree_lines[0] is skipped; only the lines after it are parsed into nodes
    return {'id': 'Root', 'children': parse(tree_lines[1:])}

In [59]:
result = tree_json(clf_model.toDebugString)

In [89]:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StructField, StructType

# three integer columns: age, gender and the target y
cSchema = StructType([StructField(name, IntegerType()) for name in ("age", "gender", "y")])

# toy rows in (age, gender, y) order
test_list = [
    [30, 0, 1],
    [25, 1, 0],
    [45, 0, 0],
    [57, 1, 1],
    [27, 0, 1],
    [54, 1, 1],
    [35, 1, 1],
]

test_df = spark.createDataFrame(test_list, schema=cSchema)
test_df.show()

+---+------+---+
|age|gender|  y|
+---+------+---+
| 30|     0|  1|
| 25|     1|  0|
| 45|     0|  0|
| 57|     1|  1|
| 27|     0|  1|
| 54|     1|  1|
| 35|     1|  1|
+---+------+---+



In [90]:
test_df = assemble_vectors(test_df, ['age','gender'], 'y')
test_clf = DecisionTreeClassifier(featuresCol='features', labelCol='y')
test_clf_model = test_clf.fit(test_df)

In [94]:
clf_model.featureImportances

SparseVector(7, {0: 0.0611, 2: 0.0014, 3: 0.7256, 4: 0.0006, 5: 0.2113})

In [92]:
test_clf_model.toDebugString

'DecisionTreeClassificationModel (uid=DecisionTreeClassifier_2800c35732d6) of depth 3 with 7 nodes\n  If (feature 0 <= 26.0)\n   Predict: 0.0\n  Else (feature 0 > 26.0)\n   If (feature 0 <= 40.0)\n    Predict: 1.0\n   Else (feature 0 > 40.0)\n    If (feature 0 <= 49.5)\n     Predict: 0.0\n    Else (feature 0 > 49.5)\n     Predict: 1.0\n'

# Random Forest

In [96]:
from pyspark.ml.classification import RandomForestClassifier

clf = RandomForestClassifier(featuresCol='features', labelCol='y')
clf_model = clf.fit(binary_df)
print(clf_model.featureImportances)
print(clf_model.toDebugString)

(7,[0,1,2,3,4,5,6],[0.04089763741220326,0.00998065797800152,0.034217284862762806,0.6641622198323902,0.00418593339969165,0.19243787980500565,0.054118386709944885])
RandomForestClassificationModel (uid=RandomForestClassifier_8049ea3f17d3) with 20 trees
  Tree 0 (weight 1.0):
    If (feature 3 <= 490.5)
     If (feature 6 <= 0.5)
      Predict: 0.0
     Else (feature 6 > 0.5)
      If (feature 5 <= 188.5)
       If (feature 3 <= 168.5)
        Predict: 0.0
       Else (feature 3 > 168.5)
        If (feature 0 <= 30.5)
         Predict: 1.0
        Else (feature 0 > 30.5)
         Predict: 0.0
      Else (feature 5 > 188.5)
       Predict: 0.0
    Else (feature 3 > 490.5)
     If (feature 5 <= 28.0)
      If (feature 0 <= 32.5)
       If (feature 3 <= 913.0)
        Predict: 0.0
       Else (feature 3 > 913.0)
        If (feature 2 <= 2.5)
         Predict: 0.0
        Else (feature 2 > 2.5)
         Predict: 1.0
      Else (feature 0 > 32.5)
       If (feature 0 <= 59.5)
        Predict: 

In [97]:
from pyspark.ml.regression import RandomForestRegressor

reg = RandomForestRegressor(featuresCol='features', labelCol='balance')
reg_model = reg.fit(continuous_df)
print(reg_model.featureImportances)
print(reg_model.toDebugString)

(6,[0,1,2,3,4,5],[0.41892201462152007,0.21670083472491117,0.10765896383704084,0.06965690771303537,0.10482267536078857,0.08223860374270413])
RandomForestRegressionModel (uid=RandomForestRegressor_bbf6bb4d5022) with 20 trees
  Tree 0 (weight 1.0):
    If (feature 1 <= 21.5)
     If (feature 1 <= 18.5)
      If (feature 5 <= 2.5)
       If (feature 0 <= 50.5)
        If (feature 0 <= 35.5)
         Predict: 1048.641155545974
        Else (feature 0 > 35.5)
         Predict: 1275.4021535580525
       Else (feature 0 > 50.5)
        If (feature 3 <= 17.5)
         Predict: 1775.0430576631259
        Else (feature 3 > 17.5)
         Predict: 5520.434782608696
      Else (feature 5 > 2.5)
       If (feature 1 <= 1.5)
        If (feature 5 <= 3.5)
         Predict: 8022.733333333334
        Else (feature 5 > 3.5)
         Predict: 2771.757575757576
       Else (feature 1 > 1.5)
        If (feature 1 <= 6.5)
         Predict: 1141.8323782234957
        Else (feature 1 > 6.5)
         Predict: 1

# Gradient boosting

In [None]:
from pyspark.ml.classification import GBTClassifier

clf = GBTClassifier(featuresCol='features', labelCol='y')
clf_model = clf.fit(binary_df)
print(clf_model.featureImportances)
print(clf_model.toDebugString)

In [None]:
from pyspark.ml.regression import GBTRegressor

reg = GBTRegressor(featuresCol='features', labelCol='balance')
reg_model = reg.fit(continuous_df)
print(reg_model.featureImportances)
print(reg_model.toDebugString)

# Support vector machines

In [103]:
from pyspark.ml.classification import LinearSVC
np.set_printoptions(precision=3, suppress=True)
clf = LinearSVC(featuresCol='features', labelCol='y')
clf_model = clf.fit(binary_df)
print(clf_model.intercept, clf_model.coefficients)

-1.0149273280183022 [0.000187712606328283,5.2578830515471725e-09,-7.9482226780511e-05,2.0908016936147597e-05,-0.0006652180215746111,3.426437268921949e-06,0.0003418718600226912]


In [104]:
import numpy as np
np.set_printoptions(precision=3, suppress=True)
binary_clf_model.coefficients

DenseVector([0.008, 0.0, -0.0017, 0.0036, -0.128, 0.0021, 0.0859])

# Neural networks

In [121]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# `layers` lists the layer sizes from input to output: the first entry has to equal the length of
# the feature vector (taken from the data instead of being hard-coded), 4 is the hidden layer size
# and the output layer is set to 2 because of the binary target.
n_inputs = len(binary_df.first()['features'])
clf = MultilayerPerceptronClassifier(featuresCol='features', labelCol='y', layers=[n_inputs, 4, 2])
clf_model = clf.fit(binary_df)

# One vs rest classifier

In [131]:
target_variable_name = "education"
multiclass_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'job', 'education'])
features_list = multiclass_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)
# apply the function on our dataframe
multiclass_df = assemble_vectors(multiclass_df, features_list, target_variable_name)

In [135]:
from pyspark.ml.classification import RandomForestClassifier, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# generate the train/test split; the fixed seed makes the split, and therefore the reported
# error, reproducible across runs.
(train, test) = multiclass_df.randomSplit([0.7, 0.3], seed=42)
# instantiate the base classifier.
clf = RandomForestClassifier(featuresCol='features', labelCol='education')
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=clf, featuresCol='features', labelCol='education')
# train the multiclass model.
ovrModel = ovr.fit(train)
# score the model on test data.
predictions = ovrModel.transform(test)
# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy", labelCol='education')
# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.332324


# Naive Bayes classifier

In [139]:
target_variable_name = "y"
nonneg_df = data.select(['age', 'day', 'duration', 'campaign', 'previous', 'y'])
#exclude target variable and select all other feature vectors
features_list = nonneg_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)
# apply the function on our dataframe
nonneg_df = assemble_vectors(nonneg_df, features_list, target_variable_name)

In [140]:
from pyspark.ml.classification import NaiveBayes

# fit on nonneg_df, which was assembled without the balance and pdays columns
# (both contain negative values in this dataset)
clf = NaiveBayes(featuresCol='features', labelCol='y')
clf_model = clf.fit(nonneg_df)