In [6]:
!pip install kedro
!pip install pyspark

!kedro new

from kedro.pipeline import node, Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def run_pyspark_pipeline():
    spark = SparkSession.builder.appName("Credit").getOrCreate()
    df_data = spark.read.csv("gcs_internal_data.csv", header=True, inferSchema=True)
    df_ext = spark.read.csv("gcs_external_sources.csv", header=True, inferSchema=True)

    df_full = df_data.join(df_ext, on='SK_ID_CURR', how='inner')

    columns_extract = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                      'DAYS_BIRTH', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE',
                      'DAYS_ID_PUBLISH', 'CODE_GENDER', 'AMT_ANNUITY',
                      'DAYS_REGISTRATION', 'AMT_GOODS_PRICE', 'AMT_CREDIT',
                      'ORGANIZATION_TYPE', 'DAYS_LAST_PHONE_CHANGE',
                      'NAME_INCOME_TYPE', 'AMT_INCOME_TOTAL', 'OWN_CAR_AGE', 'TARGET']

    df = df_full.select(columns_extract)

    seed = 101
    train, test = df.randomSplit([0.8, 0.2], seed=seed)

    categorical_columns = ['NAME_EDUCATION_TYPE', 'CODE_GENDER', 'ORGANIZATION_TYPE', 'NAME_INCOME_TYPE']

    indexers = [StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid="keep") for col in categorical_columns]

    for indexer in indexers:
        train = indexer.fit(train).transform(train)
        test = indexer.fit(test).transform(test)

    encoder = [OneHotEncoder(inputCol=col + "_index", outputCol=col + "_encoded", dropLast=False) for col in categorical_columns]

    for enc in encoder:
        train = enc.fit(train).transform(train)
        test = enc.fit(test).transform(test)

    input_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                  'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH',
                  'AMT_ANNUITY', 'DAYS_REGISTRATION', 'AMT_GOODS_PRICE', 'AMT_CREDIT',
                  'DAYS_LAST_PHONE_CHANGE', 'AMT_INCOME_TOTAL', 'OWN_CAR_AGE'] + \
                 [col + "_encoded" for col in categorical_columns]

    assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
    train = assembler.setParams(handleInvalid="skip").transform(train)
    test = assembler.setParams(handleInvalid="skip").transform(test)

    train = train.select("TARGET", "features")
    test = test.select("TARGET", "features")

    random_forest = RandomForestClassifier(numTrees=100, featuresCol="features", labelCol="TARGET", seed=50)
    model = random_forest.fit(train)
    predictions = model.transform(test)

    evaluator = MulticlassClassificationEvaluator(labelCol="TARGET", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy:", (accuracy*100))

    model.save("random_forest_model")

    spark.stop()

pipeline = Pipeline([
    node(run_pyspark_pipeline, inputs=["dummy_input"], outputs=["dummy_output"], name="run_pyspark_pipeline"),
])

from kedro.framework.context import KedroContext
context = KedroContext()
context.run(pipeline)


[32mFrom Kedro 0.19.0, the command `kedro new` will come with the option of interactively selecting add-ons for your project such as linting, testing, custom logging, and more. The selected add-ons will add the basic setup for the utilities selected to your projects.[0m

[1mProject Name
Please enter a human readable name for your new project.
Spaces, hyphens, and underscores are allowed.
 [1;36m(New Kedro Project)[0m: PBA3Q5

The project name 'PBA3Q5' has been applied to: 
- The project title in /content/pba3q5/README.md 
- The folder created for your project in /content/pba3q5 
- The project's python package in /content/pba3q5/src/pba3q5[0m

A best-practice setup includes initialising git and creating a virtual environment before running 'pip install -r src/requirements.txt' to install project-specific dependencies. Refer to the Kedro documentation: https://kedro.readthedocs.io/[0m
[32m
Change directory to the project generated in /content/pba3q5 by entering 'cd /content/pba3q

TypeError: ignored