#### Import packages and start Spark Session

In [23]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql.functions import *
import pandas as pd 
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Imputer, StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
spark = SparkSession.builder.appName('random_forest_spark').getOrCreate()

#### Get External dataset

In [2]:
url = "https://storage.googleapis.com/bdt-spark-store/external_sources.csv"
spark.sparkContext.addFile(url)
df_ext = spark.read.csv("file:///"+SparkFiles.get("external_sources.csv"), header=True, inferSchema= True)
print((df_ext.count(), len(df_ext.columns)))

(307511, 4)


#### Get Data dataset

In [3]:
url = "https://storage.googleapis.com/bdt-spark-store/internal_data.csv"
spark.sparkContext.addFile(url)
df_data  = spark.read.csv("file:///"+SparkFiles.get("internal_data.csv"), header=True, inferSchema= True)
print((df_data.count(), len(df_data.columns)))

(307511, 119)


#### Join datasets

In [61]:
df_full = df_data.join(df_ext, on='SK_ID_CURR', how='inner')
print((df_full.count(), len(df_full.columns)))

(307511, 122)


#### Filter for columns required

In [62]:
columns_extract = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                  'DAYS_BIRTH', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE',
                  'DAYS_ID_PUBLISH', 'CODE_GENDER', 'AMT_ANNUITY',
                  'DAYS_REGISTRATION', 'AMT_GOODS_PRICE', 'AMT_CREDIT',
                  'ORGANIZATION_TYPE', 'DAYS_LAST_PHONE_CHANGE',
                  'NAME_INCOME_TYPE', 'AMT_INCOME_TOTAL', 'OWN_CAR_AGE', 'TARGET']
df = df_full[columns_extract]

#### One-hot Encoding Categorical Variables

In [63]:
categorical_variables = ['NAME_EDUCATION_TYPE', 'CODE_GENDER', 'ORGANIZATION_TYPE', 'NAME_INCOME_TYPE']
indexers = [StringIndexer(inputCol=column, outputCol=column+"-index") for column in categorical_variables]
encoder = OneHotEncoder(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
)

assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="categorical-features"
)

cat_var = assembler

pipeline = Pipeline(stages=indexers + [encoder, assembler])
df_oh = pipeline.fit(df).transform(df)

#### Split into test and train

In [64]:
train, test = df_oh.randomSplit([0.8, 0.2], seed=101)
print('Training data shape: ', (train.count(), len(train.columns)))
print('Testing data shape: ', (test.count(), len(test.columns)))

Training data shape:  (246051, 27)
Testing data shape:  (61460, 27)


#### Calculate target variable proportions in each set

In [65]:
train_prop = train.groupBy('TARGET').count()
test_prop = test.groupBy('TARGET').count()
print(train_prop.withColumn('train_proportion', train_prop['count'] / train.count()).show())
print(test_prop.withColumn('test_proportion', test_prop['count'] / test.count()).show())

+------+------+-------------------+
|TARGET| count|   train_proportion|
+------+------+-------------------+
|     1| 19861|0.08071903792303221|
|     0|226190| 0.9192809620769677|
+------+------+-------------------+

None
+------+-----+-------------------+
|TARGET|count|    test_proportion|
+------+-----+-------------------+
|     1| 4964|0.08076797917344615|
|     0|56496| 0.9192320208265539|
+------+-----+-------------------+

None


#### Impute missing values

In [66]:
imputer = Imputer()

imputer.setInputCols(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_LAST_PHONE_CHANGE', 'OWN_CAR_AGE'])

imputer.setOutputCols(['out_EXT_SOURCE_1', 'out_EXT_SOURCE_2', 'out_EXT_SOURCE_3', 'out_AMT_ANNUITY', 'out_AMT_GOODS_PRICE', 'out_DAYS_LAST_PHONE_CHANGE', 'out_OWN_CAR_AGE'])

imputer.getRelativeError()

imputer.setStrategy('median')

model = imputer.fit(train)
model.setInputCols(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_LAST_PHONE_CHANGE', 'OWN_CAR_AGE'])
train_filled = model.transform(train)

model = imputer.fit(test)
model.setInputCols(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_LAST_PHONE_CHANGE', 'OWN_CAR_AGE'])
test_filled = model.transform(test)

#### Drop redundant columns

In [67]:
columns_to_drop = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_LAST_PHONE_CHANGE', 'OWN_CAR_AGE']
train_filled_dropped = train_filled.drop(*columns_to_drop)
test_filled_dropped = test_filled.drop(*columns_to_drop)

#### Combine categorical vector with continuous variables

In [68]:
continuous_variables = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION',
                        'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'TARGET', 'out_AMT_ANNUITY',
                        'out_DAYS_LAST_PHONE_CHANGE', 'out_EXT_SOURCE_1', 'out_OWN_CAR_AGE',
                        'out_EXT_SOURCE_2', 'out_AMT_GOODS_PRICE', 'out_EXT_SOURCE_3']

assembler = VectorAssembler(inputCols=['categorical-features', *continuous_variables], outputCol='features')

con_var = assembler
train_filled_dropped_comb = assembler.transform(train_filled_dropped)
test_filled_dropped_comb = assembler.transform(test_filled_dropped)

In [69]:
feature_names = cat_var.getInputCols() + con_var.getInputCols()

#### Scale numerical variables

In [70]:
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

scaler_model = scaler.fit(train_filled_dropped_comb)
train_final = scaler_model.transform(train_filled_dropped_comb)

scaler_model = scaler.fit(test_filled_dropped_comb)
test_final = scaler_model.transform(test_filled_dropped_comb)

In [74]:
train_final.head()

Row(DAYS_BIRTH=-15272, DAYS_EMPLOYED=-337, NAME_EDUCATION_TYPE='Secondary / secondary special', DAYS_ID_PUBLISH=-3622, CODE_GENDER='M', DAYS_REGISTRATION=-437.0, AMT_CREDIT=263686.5, ORGANIZATION_TYPE='Business Entity Type 3', NAME_INCOME_TYPE='Working', AMT_INCOME_TOTAL=90000.0, TARGET=0, NAME_EDUCATION_TYPE-index=0.0, CODE_GENDER-index=1.0, ORGANIZATION_TYPE-index=0.0, NAME_INCOME_TYPE-index=0.0, NAME_EDUCATION_TYPE-index-encoded=SparseVector(4, {0: 1.0}), CODE_GENDER-index-encoded=SparseVector(2, {1: 1.0}), ORGANIZATION_TYPE-index-encoded=SparseVector(57, {0: 1.0}), NAME_INCOME_TYPE-index-encoded=SparseVector(7, {0: 1.0}), categorical-features=SparseVector(70, {0: 1.0, 5: 1.0, 6: 1.0, 63: 1.0}), out_AMT_ANNUITY=17298.0, out_DAYS_LAST_PHONE_CHANGE=-235.0, out_EXT_SOURCE_1=0.5051892165955818, out_OWN_CAR_AGE=9.0, out_EXT_SOURCE_2=0.5655961111797336, out_AMT_GOODS_PRICE=238500.0, out_EXT_SOURCE_3=0.5352762504724826, features=SparseVector(84, {0: 1.0, 5: 1.0, 6: 1.0, 63: 1.0, 70: -15272

#### Train Random Forest Classifier

In [54]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="TARGET", featuresCol="scaled_features", numTrees=100, seed=50, impurity='gini', )

model = rf.fit(train_final)

#### Make predictions

In [55]:
predictions = model.transform(test_final).select("TARGET", "prediction")

#### Model Evaluation

In [60]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="TARGET", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
# acc = evaluator.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
print("Accuracy = %g" % (accuracy))

Accuracy = 0.966743


#### Calculate feature importances

In [59]:
print(model.featureImportances)

(84,[0,1,2,3,4,5,6,7,8,10,13,14,16,19,21,22,24,26,27,29,30,31,33,35,36,37,38,40,41,42,43,45,46,47,48,49,52,53,54,55,57,59,63,64,65,66,70,71,72,73,74,75,76,77,78,79,80,81,82,83],[0.0020351836887785766,0.003414022670935533,9.196298726670885e-07,5.0116962893866196e-05,0.007395942585985196,0.0016321986267239985,3.850109945755476e-05,0.0012744426132457506,0.00027373763063220723,1.1033137315551346e-05,9.587951942054302e-05,8.592478739609784e-06,1.453948570030592e-05,4.072256739968085e-06,4.965041659466955e-06,1.1469854789092493e-05,5.890521440870412e-06,4.242745928231173e-06,2.521227758916999e-07,3.4358979065826033e-07,2.2530409655537502e-05,1.2540796557951034e-05,9.742120257585452e-05,1.0204240323891398e-06,0.00043924642955036724,5.0427921942028874e-05,6.720967785087333e-05,3.970115331517087e-06,2.759677541588403e-05,1.4763708415381342e-06,2.2361965318388316e-06,9.372916092265496e-07,2.425087300563969e-06,1.4208856961908024e-06,6.398930320568998e-06,2.406870223871948e-06,6.1775825775334696e