In [1]:
import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer,  VectorIndexer
from pyspark.ml.feature import VectorAssembler, Imputer
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [2]:
! wget https://storage.googleapis.com/bdt-spark-store/external_sources.csv -O gcs_external_sources.csv

--2021-11-26 09:50:30--  https://storage.googleapis.com/bdt-spark-store/external_sources.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.170.112, 216.58.223.144, 172.217.170.16, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.170.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15503836 (15M) [text/csv]
Saving to: ‘gcs_external_sources.csv’


2021-11-26 09:50:33 (6.49 MB/s) - ‘gcs_external_sources.csv’ saved [15503836/15503836]



In [3]:
! wget https://storage.googleapis.com/bdt-spark-store/internal_data.csv -O gcs_internal_data.csv

--2021-11-26 09:50:34--  https://storage.googleapis.com/bdt-spark-store/internal_data.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.223.144, 172.217.170.16, 172.217.170.48, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.223.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 152978396 (146M) [text/csv]
Saving to: ‘gcs_internal_data.csv’


2021-11-26 09:50:49 (10.5 MB/s) - ‘gcs_internal_data.csv’ saved [152978396/152978396]



In [4]:
import os

# NOTE(review): `memory` is not referenced by any cell visible in this notebook,
# and it disagrees with the 16g driver memory requested below — confirm intent.
memory = '8g'

# Arguments handed to spark-submit when the JVM is launched. Each fragment keeps
# its surrounding spaces so the joined string is identical to the original
# backslash-continued literal.
pyspark_submit_args = ''.join([
    ' --verbose ',
    ' --driver-cores 1 ',
    ' --driver-memory=16g ',
    ' --executor-memory 2g ',
    ' --num-executors 4 ',
    ' --executor-cores 1 ',
    ' --master local[4] ',
    ' --deploy-mode client ',
    ' pyspark-shell',
])
# Must be exported before the session (and therefore the JVM) is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

# `sc` holds a SparkSession (not a SparkContext); later cells use it as `sc.read` / `sc.conf`.
session_builder = SparkSession.builder.appName('BDT_PBA3')
sc = session_builder.getOrCreate()

In [5]:
# Load the internal application data with a header row and inferred column types.
# The download cell saves this file as gcs_internal_data.csv in the working
# directory (no data/ prefix), so read it from that same location; the previous
# 'data/...' path only worked when a copy already existed there.
internal_df = sc.read.csv('gcs_internal_data.csv', inferSchema=True, header=True)

In [6]:
# Load the external-sources extract with a header row and inferred column types.
# The download cell saves this file as gcs_external_sources.csv in the working
# directory (no data/ prefix), so read it from that same location; the previous
# 'data/...' path only worked when a copy already existed there.
external_df = sc.read.csv('gcs_external_sources.csv', inferSchema=True, header=True)

In [7]:
# NOTE(review): kept from the original cell; an equi-join on a key column should
# not need cross joins enabled — confirm whether this setting can be dropped.
sc.conf.set("spark.sql.crossJoin.enabled", "true")

# Inner-join the two sources on the applicant key. Joining by column *name*
# (rather than an equality expression between the two frames) keeps a single
# SK_ID_CURR column in the result, so later references to it are unambiguous.
full_df = internal_df.join(external_df, on='SK_ID_CURR', how='inner')

In [8]:
# Print the column names and inferred types of the joined frame.
full_df.printSchema()

root
 |-- SK_ID_CURR: integer (nullable = true)
 |-- TARGET: integer (nullable = true)
 |-- NAME_CONTRACT_TYPE: string (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: integer (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- AMT_CREDIT: double (nullable = true)
 |-- AMT_ANNUITY: double (nullable = true)
 |-- AMT_GOODS_PRICE: double (nullable = true)
 |-- NAME_TYPE_SUITE: string (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- REGION_POPULATION_RELATIVE: double (nullable = true)
 |-- DAYS_BIRTH: integer (nullable = true)
 |-- DAYS_EMPLOYED: integer (nullable = true)
 |-- DAYS_REGISTRATION: double (nullable = true)
 |-- DAYS_ID_PUBLISH: integer (nullable = true)
 |-- OWN_CAR_AG

In [9]:
# Show which label values occur in TARGET.
target_values = full_df.select('TARGET').distinct()
target_values.show()

+------+
|TARGET|
+------+
|     1|
|     0|
+------+



In [10]:
# Columns carried into modelling: the candidate predictors followed by the
# TARGET label. Order is preserved in the selected frame.
columns_extract = [
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'NAME_EDUCATION_TYPE',
    'DAYS_ID_PUBLISH',
    'CODE_GENDER',
    'AMT_ANNUITY',
    'DAYS_REGISTRATION',
    'AMT_GOODS_PRICE',
    'AMT_CREDIT',
    'ORGANIZATION_TYPE',
    'DAYS_LAST_PHONE_CHANGE',
    'NAME_INCOME_TYPE',
    'AMT_INCOME_TOTAL',
    'OWN_CAR_AGE',
    'TARGET',
]
df = full_df.select(*columns_extract)

In [11]:
# Preview the first three rows of the selected columns.
df.show(3)

+------------------+------------------+-------------------+----------+-------------+--------------------+---------------+-----------+-----------+-----------------+---------------+----------+-----------------+----------------------+----------------+----------------+-----------+------+
|      EXT_SOURCE_1|      EXT_SOURCE_2|       EXT_SOURCE_3|DAYS_BIRTH|DAYS_EMPLOYED| NAME_EDUCATION_TYPE|DAYS_ID_PUBLISH|CODE_GENDER|AMT_ANNUITY|DAYS_REGISTRATION|AMT_GOODS_PRICE|AMT_CREDIT|ORGANIZATION_TYPE|DAYS_LAST_PHONE_CHANGE|NAME_INCOME_TYPE|AMT_INCOME_TOTAL|OWN_CAR_AGE|TARGET|
+------------------+------------------+-------------------+----------+-------------+--------------------+---------------+-----------+-----------+-----------------+---------------+----------+-----------------+----------------------+----------------+----------------+-----------+------+
|0.6529430442014209|0.6984528683753916| 0.8004513396487078|    -15612|        -1106|Secondary / secon...|          -4018|          M|    38281.5|

In [12]:
# Numeric predictors; every one of them is cast to double below so that they
# share a single type (the inferred schema mixes integer and double columns).
numbers = [
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'DAYS_ID_PUBLISH',
    'AMT_ANNUITY',
    'DAYS_REGISTRATION',
    'AMT_GOODS_PRICE',
    'AMT_CREDIT',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_INCOME_TOTAL',
    'OWN_CAR_AGE',
]

# Replace each numeric column, in place and in order, with its double-typed version.
for numeric_name in numbers:
    as_double = F.col(numeric_name).cast('double')
    df = df.withColumn(numeric_name, as_double)

In [13]:
# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=101)

In [14]:
# For each split (train first, then test) show the row count per TARGET value
# and its share of the split's total row count.
for split in (train, test):
    total_rows = split.count()
    per_class = split.groupBy('TARGET').count()
    share = (F.col("count") / total_rows).alias("Prevelence")
    per_class.select("TARGET", "count", share).show()

+------+------+-------------------+
|TARGET| count|         Prevelence|
+------+------+-------------------+
|     1| 19890|0.08089640867124903|
|     0|225980|  0.919103591328751|
+------+------+-------------------+

+------+-----+-------------------+
|TARGET|count|         Prevelence|
+------+-----+-------------------+
|     1| 4935|0.08006034944274103|
|     0|56706|  0.919939650557259|
+------+-----+-------------------+



In [15]:
# Report (row count, column count) for each split.
for label, split in (('Training data shape: ', train), ('Test data shape: ', test)):
    print(label, (split.count(), len(split.columns)))

Training data shape:  (245870, 18)
Test data shape:  (61641, 18)


In [16]:
# Categorical (string) predictors.
string_columns = [
    "CODE_GENDER",
    "NAME_EDUCATION_TYPE",
    "ORGANIZATION_TYPE",
    "NAME_INCOME_TYPE",
]
# Numeric predictors.
numbers = [
    "EXT_SOURCE_1",
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'DAYS_ID_PUBLISH',
    'AMT_ANNUITY',
    'DAYS_REGISTRATION',
    'AMT_GOODS_PRICE',
    'AMT_CREDIT',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_INCOME_TOTAL',
    'OWN_CAR_AGE',
]

# Apply the same null handling to both splits: string nulls become the literal
# category 'missing'; numeric nulls become a sentinel value that the Imputer
# stage is later configured to treat as missing.
train, test = (
    split.na.fill('missing', subset=string_columns)
         .na.fill(999999999999999.99, subset=numbers)
    for split in (train, test)
)

In [17]:
# One StringIndexer per categorical column, writing IDX_<column>. Labels are
# ordered alphabetically; handleInvalid='keep' gives labels that were not seen
# during fitting an extra index instead of raising at transform time.
string_indexes = [
    StringIndexer(
        inputCol=c,
        outputCol='IDX_' + c,
        handleInvalid='keep',
        stringOrderType='alphabetAsc',
    )
    for c in string_columns
]

# Replace the numeric sentinel (the value the null-filling cell wrote into the
# numeric columns) with the column mean, writing imp_<column>.
imputer = Imputer(
    strategy='mean',
    missingValue=999999999999999.99,
    inputCols=numbers,
    outputCols=['imp_' + n for n in numbers],
)

# Assemble imputed numerics followed by indexed categoricals into one vector.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=['imp_' + n for n in numbers] + ['IDX_' + c for c in string_columns],
)

# Mark vector slots with at most 5 distinct values as categorical.
vector_indexes = VectorIndexer(
    inputCol="features",
    outputCol='VIX_features',
    handleInvalid='keep',
    maxCategories=5,
)

# Random forest on the indexed feature vector.
# seed is now fixed (same value as the train/test split) so that bootstrap and
# feature sub-sampling are reproducible across kernel restarts; previously no
# seed was passed.
# NOTE(review): maxBins=61 is presumably sized to the widest categorical
# feature — confirm against the data.
# NOTE(review): the recorded run of the fit cell lost its JVM
# (Py4JNetworkError). The cause is not visible from the notebook; if it
# recurs, numTrees=1000 and the requested driver memory are the first
# settings to revisit.
rf = RandomForestClassifier(
    featuresCol="VIX_features",
    labelCol='TARGET',
    numTrees=1000,
    impurity='entropy',
    maxBins=61,
    cacheNodeIds=True,
    seed=101,
)

In [18]:
# Stage order: string indexers, imputer, assembler, vector indexer, classifier.
# The ordered list is wrapped in an outer list; the fit cell reads element 0.
ordered_stages = [*string_indexes, imputer, assembler, vector_indexes, rf]
pipeline_stages = [ordered_stages]

In [19]:
# Build the estimator from the ordered stages and fit it on the training split.
# Note that `pipeline` ends up holding the *fitted* model, not the estimator.
estimator = Pipeline(stages=pipeline_stages[0])
pipeline = estimator.fit(train)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36371)
Traceback (most recent call last):
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-c24f641eaca3>", line 1, in <module>
    pipeline = Pipeline(stages=pipeline_stages[0]).fit(train)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/pipeline.py", line 109, in _fit
    model = stage.fit(dataset)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/wrapper.py", line 288, in _fit
    java_model = self._fit_java(dataset)
  File "/home/f5047145/anaco

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36371)
Traceback (most recent call last):
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-c24f641eaca3>", line 1, in <module>
    pipeline = Pipeline(stages=pipeline_stages[0]).fit(train)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/pipeline.py", line 109, in _fit
    model = stage.fit(dataset)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/wrapper.py", line 288, in _fit
    java_model = self._fit_java(dataset)
  File "/home/f5047145/anaco

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:36371)
Traceback (most recent call last):
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-c24f641eaca3>", line 1, in <module>
    pipeline = Pipeline(stages=pipeline_stages[0]).fit(train)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/pipeline.py", line 109, in _fit
    model = stage.fit(dataset)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/home/f5047145/anaconda3/lib/python3.7/site-packages/pyspark/ml/wrapper.py", line 288, in _fit
    java_model = self._fit_java(dataset)
  File "/home/f5047145/anaco

Py4JError: An error occurred while calling o172.fit

In [None]:
# Score the held-out split with the fitted pipeline; the evaluation and
# feature-importance cells below all read from test_scored.
test_scored = pipeline.transform(test)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy on the hard predictions (kept from the original cell).
evaluator = MulticlassClassificationEvaluator(labelCol='TARGET', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(test_scored)
print(accuracy)

# Accuracy alone is misleading here: the split summary shows roughly 92% of
# rows have TARGET = 0, so always predicting 0 already scores about 0.92.
# Area under the ROC curve uses the classifier's raw scores and does not depend
# on the class balance or on a single decision threshold.
# BinaryClassificationEvaluator is imported in the notebook's first cell.
auc_evaluator = BinaryClassificationEvaluator(labelCol='TARGET', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = auc_evaluator.evaluate(test_scored)
print('areaUnderROC:', auc)

In [None]:
import pandas as pd

# Rank features by the importances of the pipeline's last stage, using the
# attribute metadata stored on the VIX_features vector column to recover names.
featuresCol = "VIX_features"
dataset = test_scored
featureImp = pipeline.stages[-1].featureImportances

# The metadata groups attribute entries by kind; flatten all groups into one list.
attr_groups = dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]
list_extract = []
for group_name in attr_groups:
    list_extract.extend(attr_groups[group_name])

# One row per vector slot; 'idx' is the slot position used to look up its importance.
varlist = pd.DataFrame(list_extract)
varlist['score'] = varlist['idx'].apply(lambda position: featureImp[position])
varlist.sort_values(by='score', ascending=False)

In [None]:
import pandas as pd

# Same ranking as for VIX_features, but reading attribute metadata from the
# assembled "features" column instead.
featuresCol = "features"
dataset = test_scored
featureImp = pipeline.stages[-1].featureImportances

# Flatten every attribute group of the column's metadata into a single list of entries.
column_attrs = dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]
list_extract = [entry for group in column_attrs.values() for entry in group]

# Attach the importance stored at each entry's vector position and sort descending.
varlist = pd.DataFrame(list_extract)
varlist['score'] = varlist['idx'].apply(lambda slot: featureImp[slot])
varlist.sort_values(by='score', ascending=False)

In [None]:
# Import only the name this cell uses instead of `from handyspark import *`,
# which pulled every public handyspark name into the notebook namespace.
# NOTE(review): toHandy() is assumed to be attached to Spark DataFrames when the
# handyspark package is imported, which this import still triggers — confirm.
from handyspark import BinaryClassificationMetrics

# Wrap the scored test set as a HandyFrame and cache it for the metric computations.
test_hdf = test_scored.toHandy()
test_hdf.cache()

# Confusion matrix on the test set at a 0.5 probability threshold.
test_bmc = BinaryClassificationMetrics(test_hdf, scoreCol='probability', labelCol='TARGET')
test_bmc.print_confusion_matrix(0.5)