In [None]:
import os

# Set up service principal access to Azure (used by datascience_core).
# The credentials are read from the Databricks secret scope at run time.
# Never paste literal client ids, secrets or tenant ids into this notebook,
# not even in commented-out code.
SCOPE = "data_science"
sp_client_id = dbutils.secrets.get(scope=SCOPE, key="sp_client_id")
sp_tenant_id = dbutils.secrets.get(scope=SCOPE, key="sp_tenant_id")
sp_secret_key = dbutils.secrets.get(scope=SCOPE, key="sp_secret_key")

# Publish the credentials through the process environment. Setting os.environ
# is sufficient: a shell `export` issued from a `!` line only affects that
# line's own short-lived shell and would also place the secret on a command
# line, so no `!export` lines are used here.
os.environ["AZURE_CLIENT_ID"] = sp_client_id
os.environ["AZURE_CLIENT_SECRET"] = sp_secret_key
os.environ["AZURE_TENANT_ID"] = sp_tenant_id

storage_account = "ds247dldev"
directory = "raw/CRA"

# OAuth (client credentials) settings for the storage account's
# dfs.core.windows.net endpoint. Each entry becomes the Spark conf key
# "fs.azure.account.<option>.<storage_account>.dfs.core.windows.net".
_abfs_oauth_settings = {
    "auth.type": "OAuth",
    "oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "oauth2.client.id": sp_client_id,
    "oauth2.client.secret": sp_secret_key,
    "oauth2.client.endpoint": "https://login.microsoftonline.com/"
    + sp_tenant_id
    + "/oauth2/token",
}
for _option, _value in _abfs_oauth_settings.items():
    spark.conf.set(
        "fs.azure.account." + _option + "." + storage_account + ".dfs.core.windows.net",
        _value,
    )

In [None]:
import pandas as pd
from datascience_core.data_retrieval import ProjectDatasetManager
import functools
from pyspark.sql.functions import lit
from functools import reduce
from pyspark.sql import DataFrame

In [None]:
# Install fsspec at run time.
# NOTE(review): `!pip` runs pip through a shell, which is not guaranteed to be
# the notebook kernel's environment; `%pip install` targets the kernel.
# On Databricks a `%pip` command presumably resets the notebook's Python state,
# so switching would also mean moving this cell to the top of the notebook,
# before the credential/config and import cells — confirm before changing.
# NOTE(review): the version is unpinned, so reruns may pick up a different
# fsspec release.
!pip install fsspec

In [None]:
# Load the new affiliate-suppression export and rename `LoanApplicationId` to `ApplicationId`, the key shared with the retro file

In [None]:
# Location of the affiliate-suppression export inside the `projects` container.
suppression_export_uri = (
    f"abfss://projects@{storage_account}.dfs.core.windows.net"
    "/datascience_affiliate_suppression/20230516_DS_Export_6months.csv"
)

# List the path and read the first listed entry as a CSV with a header row.
suppression_listing = dbutils.fs.ls(suppression_export_uri)
suppression_raw = spark.read.option("header", True).csv(suppression_listing[0].path)

# Rename the id column to `ApplicationId`.
Afilliate_supression_files = suppression_raw.withColumnRenamed(
    "LoanApplicationId", "ApplicationId"
)

In [None]:
# Load the retro (CRA) file and rename `App.ApplicationId` to `ApplicationId`, the key shared with the new file

In [None]:
# Read the CSV data at the `CRA` path of the `raw` container (header row on),
# then rename the id column to `ApplicationId`.
cra_uri = f"abfss://raw@{storage_account}.dfs.core.windows.net/CRA"
cra_reader = spark.read.option("header", True)
df_all_epochs = cra_reader.csv(cra_uri).withColumnRenamed(
    "App.ApplicationId", "ApplicationId"
)

In [None]:
# join the two files together

In [None]:
# Inner join on ApplicationId: only rows whose id appears in both inputs remain.
df_complete = df_all_epochs.join(Afilliate_supression_files, "ApplicationId", "inner")

# Convert the joined Spark DataFrame into a pandas DataFrame.
# NOTE(review): toPandas() collects the full result onto the driver — confirm
# the joined data is small enough for that.
df_out = df_complete.toPandas()

In [None]:
# save the file to the project

In [None]:
# Name and description under which the joined pandas frame is registered.
dataset_name = "affiliate_suppression_May-23_6months_eda_with_retro"
dataset_description = (
    "affiliate_suppression training data with retro "
    "to enable scorecard and prime prediction assistance"
)

# Register `df_out` with the "affiliate_suppression" project, not as a pickle.
manager = ProjectDatasetManager("affiliate_suppression")
manager.register_dataset(
    dataset_name, df_out, dataset_description, register_as_pickle=False
)