In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=89da142953e24304a035cb0fb5fa593cfc9d6ceb69b64b7a70eaa9509a586204
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

! wget https://storage.googleapis.com/bdt-spark-store/external_sources.csv -O gcs_external_sources.csv
! wget https://storage.googleapis.com/bdt-spark-store/internal_data.csv -O gcs_internal_data.csv
spark = SparkSession.builder.appName("Credit").getOrCreate()
df_data = spark.read.csv("gcs_internal_data.csv", header=True, inferSchema=True)
df_ext = spark.read.csv("gcs_external_sources.csv", header=True, inferSchema=True)

# Keep only the rows whose SK_ID_CURR is present in both inputs.
df_full = df_data.join(df_ext, 'SK_ID_CURR', 'inner')

# Columns used for modelling: the features plus the TARGET label (last entry).
columns_extract = [
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'DAYS_BIRTH', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE',
    'DAYS_ID_PUBLISH', 'CODE_GENDER', 'AMT_ANNUITY',
    'DAYS_REGISTRATION', 'AMT_GOODS_PRICE', 'AMT_CREDIT',
    'ORGANIZATION_TYPE', 'DAYS_LAST_PHONE_CHANGE',
    'NAME_INCOME_TYPE', 'AMT_INCOME_TOTAL', 'OWN_CAR_AGE',
    'TARGET',
]

df = df_full.select(*columns_extract)

# Seeded 80/20 random split into training and hold-out sets.
seed = 101
splits = df.randomSplit(weights=[0.8, 0.2], seed=seed)
train, test = splits

# String-valued columns that must be turned into numeric vectors before assembly.
categorical_columns = ['NAME_EDUCATION_TYPE', 'CODE_GENDER', 'ORGANIZATION_TYPE', 'NAME_INCOME_TYPE']

# One StringIndexer per categorical column. handleInvalid="keep" routes values
# that were not seen while fitting into an extra bucket instead of raising.
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid="keep") for col in categorical_columns]

# Fit each indexer on the TRAINING split only and apply that same fitted model
# to both splits. Re-fitting on the test split would assign different indices
# to the same label (and leak test information into preprocessing), so the
# model would be evaluated on features that do not match what it was trained on.
for indexer in indexers:
    indexer_model = indexer.fit(train)
    train = indexer_model.transform(train)
    test = indexer_model.transform(test)

# One OneHotEncoder per indexed column; dropLast=False keeps a slot for every category.
encoder = [OneHotEncoder(inputCol=col + "_index", outputCol=col + "_encoded", dropLast=False) for col in categorical_columns]

# Same rule as above: the encoder is fitted on train only, so the one-hot
# vectors have the same width and category layout in both splits.
for enc in encoder:
    encoder_model = enc.fit(train)
    train = encoder_model.transform(train)
    test = encoder_model.transform(test)

# Numeric columns are fed to the model as-is ...
numeric_columns = [
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH',
    'AMT_ANNUITY', 'DAYS_REGISTRATION', 'AMT_GOODS_PRICE', 'AMT_CREDIT',
    'DAYS_LAST_PHONE_CHANGE', 'AMT_INCOME_TOTAL', 'OWN_CAR_AGE',
]
# ... followed by the one-hot encoded version of each categorical column.
encoded_columns = [name + "_encoded" for name in categorical_columns]
input_cols = numeric_columns + encoded_columns

# Pack all inputs into a single "features" vector column.
# handleInvalid="skip" drops rows that contain a null/NaN in any input column.
assembler = VectorAssembler(inputCols=input_cols, outputCol="features", handleInvalid="skip")

# Only the label and the assembled vector are needed from here on.
train = assembler.transform(train).select("TARGET", "features")
test = assembler.transform(test).select("TARGET", "features")

# Train a 100-tree random forest on the assembled training features (seeded).
random_forest = RandomForestClassifier(numTrees=100, featuresCol="features", labelCol="TARGET", seed=50)
model = random_forest.fit(train)

# Score the hold-out split; transform() adds the "prediction" column read below.
predictions = model.transform(test)

# Share of hold-out rows whose predicted label equals TARGET, printed as a percentage.
evaluator = MulticlassClassificationEvaluator(labelCol="TARGET", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", (accuracy*100))

# Persist the fitted model. Plain model.save() raises when the target path
# already exists, which breaks every re-run of the notebook; overwrite() replaces
# a previously saved model instead.
model.write().overwrite().save("random_forest_model")

# Release the Spark session now that training and evaluation are finished.
spark.stop()

--2023-11-06 21:11:18--  https://storage.googleapis.com/bdt-spark-store/external_sources.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 209.85.146.207, 209.85.147.207, 142.250.125.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|209.85.146.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15503836 (15M) [text/csv]
Saving to: ‘gcs_external_sources.csv’


2023-11-06 21:11:19 (14.0 MB/s) - ‘gcs_external_sources.csv’ saved [15503836/15503836]

--2023-11-06 21:11:19--  https://storage.googleapis.com/bdt-spark-store/internal_data.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 209.85.146.207, 209.85.147.207, 142.250.125.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|209.85.146.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 152978396 (146M) [text/csv]
Saving to: ‘gcs_internal_data.csv’


2023-11-06 21:11:25 (29.3 MB/s) - ‘gcs_internal_data.csv’ saved [152