# Default Parameters

In [1]:
filename = "bank-full.csv"
target_variable_name = "y"

# Load Dataset

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv(filename, header=True, inferSchema=True, sep=';')
df.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

# Identify variable types

In [3]:
def variable_type(df):
    
    vars_list = df.dtypes
    char_vars = []
    num_vars = []
    for i in vars_list:
        if i[1] in ('string'):
            char_vars.append(i[0])
        else:
            num_vars.append(i[0])
    
    return char_vars, num_vars

In [4]:
char_vars, num_vars = variable_type(df)

In [5]:
char_vars

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'y']

In [6]:
num_vars

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [7]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

def category_to_index(df, char_vars):
    
    char_df = df.select(char_vars)
    indexers = [StringIndexer(inputCol=c, outputCol=c+"_index", handleInvalid="keep") for c in char_df.columns]
    pipeline = Pipeline(stages=indexers)
    char_labels = pipeline.fit(char_df)
    df = char_labels.transform(df)
    return df, char_labels

In [8]:
df, char_labels = category_to_index(df, char_vars)

In [9]:
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string'),
 ('job_index', 'double'),
 ('marital_index', 'double'),
 ('education_index', 'double'),
 ('default_index', 'double'),
 ('housing_index', 'double'),
 ('loan_index', 'double'),
 ('contact_index', 'double'),
 ('month_index', 'double'),
 ('poutcome_index', 'double'),
 ('y_index', 'double')]

In [10]:
df = df.select([c for c in df.columns if c not in char_vars])

In [11]:
from pyspark.sql.functions import col

def rename_columns(df, char_vars):
    mapping = dict(zip([i + '_index' for i in char_vars], char_vars))
    df = df.select([col(c).alias(mapping.get(c, c)) for c in df.columns])
    return df

In [12]:
df = rename_columns(df, char_vars)

In [13]:
df.dtypes

[('age', 'int'),
 ('balance', 'int'),
 ('day', 'int'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('job', 'double'),
 ('marital', 'double'),
 ('education', 'double'),
 ('default', 'double'),
 ('housing', 'double'),
 ('loan', 'double'),
 ('contact', 'double'),
 ('month', 'double'),
 ('poutcome', 'double'),
 ('y', 'double')]

In [14]:
df.groupBy('y').count().show() 

+---+-----+
|  y|count|
+---+-----+
|0.0|39922|
|1.0| 5289|
+---+-----+



# Assemble input vectors

In [15]:
from pyspark.ml.feature import VectorAssembler

#assemble individual columns to one column - 'features'
def assemble_vectors(df, features_list, target_variable_name):
    stages = []
    #assemble vectors
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    stages = [assembler]
    #select all the columns + target + newly created 'features' column
    selectedCols = [target_variable_name, 'features'] + features_list
    #use pipeline to process sequentially
    pipeline = Pipeline(stages=stages)
    #assembler model
    assembleModel = pipeline.fit(df)
    #apply assembler model on data
    df = assembleModel.transform(df).select(selectedCols)

    return df

In [16]:
#exclude target variable and select all other feature vectors
features_list = df.columns
features_list.remove(target_variable_name)

In [17]:
features_list

['age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [18]:
# apply the function on our dataframe
df = assemble_vectors(df, features_list, target_variable_name)

In [19]:
df.show()

+---+--------------------+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+
|  y|            features|age|balance|day|duration|campaign|pdays|previous| job|marital|education|default|housing|loan|contact|month|poutcome|
+---+--------------------+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+
|0.0|(16,[0,1,2,3,4,5,...| 58|   2143|  5|     261|       1|   -1|       0| 1.0|    0.0|      1.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|
|0.0|(16,[0,1,2,3,4,5,...| 44|     29|  5|     151|       1|   -1|       0| 2.0|    1.0|      0.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|
|0.0|(16,[0,1,2,3,4,5,...| 33|      2|  5|      76|       1|   -1|       0| 7.0|    0.0|      0.0|    0.0|    0.0| 1.0|    1.0|  0.0|     0.0|
|0.0|(16,[0,1,2,3,4,5,...| 47|   1506|  5|      92|       1|   -1|       0| 0.0|    0.0|      3.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|

In [20]:
df.schema["features"].metadata["ml_attr"]["attrs"]

{'numeric': [{'idx': 0, 'name': 'age'},
  {'idx': 1, 'name': 'balance'},
  {'idx': 2, 'name': 'day'},
  {'idx': 3, 'name': 'duration'},
  {'idx': 4, 'name': 'campaign'},
  {'idx': 5, 'name': 'pdays'},
  {'idx': 6, 'name': 'previous'},
  {'idx': 7, 'name': 'job'},
  {'idx': 8, 'name': 'marital'},
  {'idx': 9, 'name': 'education'},
  {'idx': 10, 'name': 'default'},
  {'idx': 11, 'name': 'housing'},
  {'idx': 12, 'name': 'loan'},
  {'idx': 13, 'name': 'contact'},
  {'idx': 14, 'name': 'month'},
  {'idx': 15, 'name': 'poutcome'}]}

In [21]:
import pandas as pd
for k, v in df.schema["features"].metadata["ml_attr"]["attrs"].items():
    features_df = pd.DataFrame(v)

In [22]:
features_df

Unnamed: 0,idx,name
0,0,age
1,1,balance
2,2,day
3,3,duration
4,4,campaign
5,5,pdays
6,6,previous
7,7,job
8,8,marital
9,9,education


# Model based feature selection

# Question 1: Implement decision tree feature importance. Compare and contrast with Random Forest output.

In [23]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol='features', labelCol=target_variable_name)
dt_model = dt.fit(df)
dt_model.featureImportances

SparseVector(16, {0: 0.0072, 1: 0.0014, 3: 0.5485, 13: 0.0182, 14: 0.1159, 15: 0.3088})

In [24]:
#temporary output rf_output
dt_output = dt_model.featureImportances
features_df['Decision_Tree'] = features_df['idx'].apply(lambda x: dt_output[x] if x in dt_output.indices else 0)

In [26]:
#sort values based on descending importance feature
features_df.sort_values("Decision_Tree", ascending=False, inplace=True)

In [27]:
features_df

Unnamed: 0,idx,name,Decision_Tree
3,3,duration,0.548488
15,15,poutcome,0.308838
14,14,month,0.115929
13,13,contact,0.018157
0,0,age,0.007224
1,1,balance,0.001364
2,2,day,0.0
4,4,campaign,0.0
5,5,pdays,0.0
6,6,previous,0.0


# Question 2: Implement gradient boosted tree feature importance. Compare and contrast with Random Forest output.

In [28]:
from pyspark.ml.classification import GBTClassifier
gb = GBTClassifier(featuresCol='features', labelCol=target_variable_name)
gb_model = gb.fit(df)
gb_model.featureImportances

SparseVector(16, {0: 0.072, 1: 0.0421, 2: 0.0819, 3: 0.3081, 4: 0.0148, 5: 0.0451, 6: 0.012, 7: 0.0164, 8: 0.0113, 9: 0.004, 10: 0.0025, 11: 0.0619, 12: 0.0134, 13: 0.0923, 14: 0.1948, 15: 0.0275})

In [29]:
#temporary output rf_output
gb_output = gb_model.featureImportances
features_df['Gradient Boosting'] = features_df['idx'].apply(lambda x: gb_output[x] if x in gb_output.indices else 0)

In [31]:
#sort values based on descending importance feature
features_df.sort_values("Gradient Boosting", ascending=False, inplace=True)

In [32]:
features_df

Unnamed: 0,idx,name,Decision_Tree,Gradient Boosting
3,3,duration,0.548488,0.308067
14,14,month,0.115929,0.194815
13,13,contact,0.018157,0.092292
2,2,day,0.0,0.081915
0,0,age,0.007224,0.071962
11,11,housing,0.0,0.061876
5,5,pdays,0.0,0.045099
1,1,balance,0.001364,0.042121
15,15,poutcome,0.308838,0.027456
7,7,job,0.0,0.016429


# Question 3: Implement logistic regression feature importance. Compare and contrast with Random Forest output. (Hint: Use the coefficient of logistic regression. Note: Logistic regression produces Dense Vector instead of sparse vector)

In [33]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features', labelCol=target_variable_name)
lr_model = lr.fit(df)

In [34]:
lr_model.coefficients

DenseVector([-0.0012, 0.0, -0.0045, 0.0041, -0.0801, -0.0015, -0.0071, 0.0206, 0.1676, -0.002, -0.1827, 0.8045, -0.5544, -0.3502, 0.1533, 0.8414])

In [35]:
#temporary output rf_output
lr_output = lr_model.coefficients
#absolute value is used to convert the negative coefficients. This should be done only for feature importance.
features_df['Logistic Regression'] = features_df['idx'].apply(lambda x: abs(lr_output[x])) 

In [36]:
features_df.sort_values("Logistic Regression", ascending=False, inplace=True)

In [37]:
features_df

Unnamed: 0,idx,name,Decision_Tree,Gradient Boosting,Logistic Regression
15,15,poutcome,0.308838,0.027456,0.841399
11,11,housing,0.0,0.061876,0.804526
12,12,loan,0.0,0.013369,0.55443
13,13,contact,0.018157,0.092292,0.350192
10,10,default,0.0,0.002462,0.182729
8,8,marital,0.0,0.011289,0.167597
14,14,month,0.115929,0.194815,0.153337
4,4,campaign,0.0,0.014785,0.080138
7,7,job,0.0,0.016429,0.020619
6,6,previous,0.0,0.012037,0.007054


# Random forest addition for voting based selection

In [38]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol='features', labelCol=target_variable_name)
rf_model = rf.fit(df)
rf_model.featureImportances

SparseVector(16, {0: 0.0548, 1: 0.0072, 2: 0.0059, 3: 0.3071, 4: 0.0039, 5: 0.0204, 6: 0.0379, 7: 0.0073, 8: 0.0017, 9: 0.0005, 10: 0.0002, 11: 0.0243, 12: 0.0025, 13: 0.0179, 14: 0.1643, 15: 0.344})

In [39]:
#temporary output rf_output
rf_output = rf_model.featureImportances
features_df['Random Forest'] = features_df['idx'].apply(lambda x: rf_output[x] if x in rf_output.indices else 0)

In [40]:
#sort values based on descending importance feature
features_df.sort_values("Random Forest", ascending=False, inplace=True)

In [41]:
features_df

Unnamed: 0,idx,name,Decision_Tree,Gradient Boosting,Logistic Regression,Random Forest
15,15,poutcome,0.308838,0.027456,0.841399,0.343953
3,3,duration,0.548488,0.308067,0.004053,0.307135
14,14,month,0.115929,0.194815,0.153337,0.164314
0,0,age,0.007224,0.071962,0.001205,0.05484
6,6,previous,0.0,0.012037,0.007054,0.037937
11,11,housing,0.0,0.061876,0.804526,0.024287
5,5,pdays,0.0,0.045099,0.001516,0.020385
13,13,contact,0.018157,0.092292,0.350192,0.017885
7,7,job,0.0,0.016429,0.020619,0.007331
1,1,balance,0.001364,0.042121,1.8e-05,0.007166


# Voting based selection

In [42]:
features_df.drop('idx', axis=1, inplace=True)

In [44]:
num_top_features = 7
columns = ['Decision_Tree', 'Gradient Boosting', 'Logistic Regression', 'Random Forest']
score_table = pd.DataFrame({},[])
score_table['name'] = features_df['name']
for i in columns:
    score_table[i] = features_df['name'].isin(list(features_df.nlargest(num_top_features,i)['name'])).astype(int)

In [45]:
score_table['final_score'] = score_table.sum(axis=1)
score_table.sort_values('final_score',ascending=0)

Unnamed: 0,name,Decision_Tree,Gradient Boosting,Logistic Regression,Random Forest,final_score
14,month,1,1,1,1,4
15,poutcome,1,0,1,1,3
3,duration,1,1,0,1,3
0,age,1,1,0,1,3
11,housing,0,1,1,1,3
13,contact,1,1,1,0,3
6,previous,1,0,0,1,2
5,pdays,0,1,0,1,2
1,balance,1,0,0,0,1
2,day,0,1,0,0,1
