# Cloud Motivation

Let's take a quick look at the case we're about to simulate in the cloud. The data we look at here is already known to us. In reality, however, we would of course not know the target variable.

Let's say the target variable specifies a churn probability. Depending on the churn probability predicted by the model, an incentive is to be stored for each customer.

For this we will:
* Read three data sources.
* Join them.
* Encode the categorical variable *region*.
* Load and apply the model.
* Apply the business logic based on the prediction of the model. 

## Utils & Local Spark

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import DataFrame
from pyspark.ml.base import Transformer
import pyspark.sql.functions as F
from pyspark.ml.functions import vector_to_array


# Input columns assembled into the model's feature vector: the numeric columns
# n1..n3 plus the n4_* indicator columns created by one_hot_encode_region.
FEATURES = ['n1', 'n2', 'n3', 'n4_1', 'n4_2', 'n4_3']
# Name of the label column used when training the classifier.
TARGET = 'target'


def merge_dataframes(df_customer_usage: DataFrame,
                     df_customer_master: DataFrame,
                     df_demographics: DataFrame) -> DataFrame:
    """ Combine the three input dataframes into a single one.

    The customer master data is joined with the usage data on ``customer_id``;
    the result is then joined with the demographics on ``postal_code``.
    No ``how`` argument is passed, so Spark's default join type applies.
    """
    return (
        df_customer_master
        .join(df_customer_usage, on="customer_id")
        .join(df_demographics, on="postal_code")
    )


def one_hot_encode_region(df: DataFrame) -> DataFrame:
    """ Add one 0/1 indicator column per known value of ``region``.

    ``n4_1`` flags "City", ``n4_2`` flags "Country" and ``n4_3`` flags
    "Intermediate Area". Rows whose region matches none of these get 0 in
    all three columns. The ``region`` column itself is kept.
    """
    region_flags = (
        ("n4_1", "City"),
        ("n4_2", "Country"),
        ("n4_3", "Intermediate Area"),
    )
    for flag_column, region_name in region_flags:
        is_region = F.col("region") == F.lit(region_name)
        df = df.withColumn(flag_column, F.when(is_region, 1).otherwise(0))
    return df


def preprocess_data(df_customer_usage: DataFrame,
                    df_customer_master: DataFrame,
                    df_demographics: DataFrame) -> DataFrame:
    """ Run the full preprocessing: merge the sources, then encode ``region``.

    Returns the joined dataframe extended by the indicator columns that
    ``one_hot_encode_region`` adds.
    """
    merged = merge_dataframes(
        df_customer_usage=df_customer_usage,
        df_customer_master=df_customer_master,
        df_demographics=df_demographics,
    )
    return one_hot_encode_region(merged)


def train_random_forest_classifier(df: DataFrame) -> Transformer:
    """ Fit a random forest classifier on the ``FEATURES`` columns of ``df``.

    The feature columns are assembled into a single ``features`` vector and
    the ``TARGET`` column is used as label. The forest uses 100 trees of
    depth 2 and a fixed seed. Returns the fitted model.
    """
    feature_assembler = VectorAssembler(inputCols=FEATURES, outputCol='features')
    assembled = feature_assembler.transform(df)
    training_set = assembled.select('features', TARGET)

    forest = RandomForestClassifier(
        featuresCol='features',
        labelCol=TARGET,
        numTrees=100,
        maxDepth=2,
        seed=42,
    )
    return forest.fit(training_set)


def apply_proba(clf: Transformer, df: DataFrame) -> DataFrame:
    """ Score ``df`` with ``clf`` and expose one probability as a plain column.

    The ``FEATURES`` columns are assembled into a ``features`` vector and the
    transformer is applied. The transformer predicts a probability for each
    class, so ``probability`` is a vector of length 2 of type VectorUDT, which
    Spark SQL array functions cannot index directly. The vector is therefore
    converted to an array of doubles (``probability_vec``), and its second
    entry is stored as ``proba``.
    """
    with_features = VectorAssembler(inputCols=FEATURES, outputCol='features').transform(df)
    scored = clf.transform(with_features)

    as_array = vector_to_array(F.col("probability"))
    # element_at uses 1-based indices, so 2 selects the second entry.
    second_entry = F.element_at(F.col("probability_vec"), 2)

    return (
        scored
        .withColumn("probability_vec", as_array)
        .withColumn("proba", second_entry)
    )

In [19]:
# Get the active Spark session, or create one with the app name "BusinessCase".
spark_session = SparkSession.builder.appName("BusinessCase").getOrCreate()

In [20]:
# Enable eager evaluation so that a DataFrame evaluated as the last expression
# of a cell is rendered as a table instead of only showing its schema.
spark_session.conf.set("spark.sql.repl.eagerEval.enabled", True)

## Load data

In [33]:
# Read the three parquet sources. Judging by its path, the customer master
# data is the variant without the target column.
df_customer_master = spark_session.read.parquet("data/customer_master_data_wo_target")
df_customer_usage = spark_session.read.parquet("data/customer_usage_data")
df_demographics = spark_session.read.parquet("data/demographics")

In [34]:
df_customer_master

customer_id,postal_code
4974467801682041986,1970
-858641559787057159,1645
7875860926956384571,7684
1718663060827327339,3355
-8637874552457225727,4344
-3177612884997717707,3784
8497217710787736490,5538
4292528309378731798,1749
3264293750177743940,7445
3343463163369573891,6479


In [35]:
df_customer_usage

customer_id,n1,n2,n3
4974467801682041986,1.0389522417417338,0.1630470071682174,-0.0716189312696505
-858641559787057159,-0.2659940172448812,0.750317496703657,-0.8372525523443803
7875860926956384571,1.3367555579432162,0.262499759615919,1.542938718088682
1718663060827327339,0.5151084439689695,-0.7996961849347815,2.175468067045149
-8637874552457225727,1.6956467653837362,-0.945521641217198,-0.8639522191312855
-3177612884997717707,0.9220421897516056,1.550548930005398,0.6383819989459883
8497217710787736490,0.0581497703921297,1.1856903392169684,0.8213706052182598
4292528309378731798,0.6513284963527268,0.3168165631959955,0.6160836933665816
3264293750177743940,0.1847085424540692,-0.2802940363082347,-0.7622539473498987
3343463163369573891,-1.8851460177226464,-0.2248136997426025,-0.6219523743600517


In [36]:
df_demographics

postal_code,region
1,City
2,Intermediate Area
3,Country
4,Country
5,Country
6,City
7,Country
8,City
9,Intermediate Area
10,City


## Preprocess

* Join
* One Hot Encode *region*

In [37]:
# Join the three sources and add the region indicator columns.
df = preprocess_data(df_customer_usage, df_customer_master, df_demographics)

In [38]:
df

postal_code,customer_id,n1,n2,n3,region,n4_1,n4_2,n4_3
1970,4974467801682041986,1.0389522417417338,0.1630470071682174,-0.0716189312696505,Intermediate Area,0,0,1
1645,-858641559787057159,-0.2659940172448812,0.750317496703657,-0.8372525523443803,Country,0,1,0
7684,7875860926956384571,1.3367555579432162,0.262499759615919,1.542938718088682,City,1,0,0
3355,1718663060827327339,0.5151084439689695,-0.7996961849347815,2.175468067045149,Country,0,1,0
4344,-8637874552457225727,1.6956467653837362,-0.945521641217198,-0.8639522191312855,City,1,0,0
3784,-3177612884997717707,0.9220421897516056,1.550548930005398,0.6383819989459883,Country,0,1,0
5538,8497217710787736490,0.0581497703921297,1.1856903392169684,0.8213706052182598,City,1,0,0
1749,4292528309378731798,0.6513284963527268,0.3168165631959955,0.6160836933665816,City,1,0,0
7445,3264293750177743940,0.1847085424540692,-0.2802940363082347,-0.7622539473498987,Intermediate Area,0,0,1
6479,3343463163369573891,-1.8851460177226464,-0.2248136997426025,-0.6219523743600517,City,1,0,0


## Apply Model

In [39]:
# Load the stored random forest model from disk and score the preprocessed
# data; apply_proba adds the `proba` column used by the business logic below.
model = RandomForestClassificationModel.load("data/rf_classifier_model")
df = apply_proba(model, df)

In [40]:
df

postal_code,customer_id,n1,n2,n3,region,n4_1,n4_2,n4_3,features,rawPrediction,probability,prediction,probability_vec,proba
1970,4974467801682041986,1.0389522417417338,0.1630470071682174,-0.0716189312696505,Intermediate Area,0,0,1,[1.03895224174173...,[82.3269009255007...,[0.82326900925500...,0.0,[0.82326900925500...,0.1767309907449923
1645,-858641559787057159,-0.2659940172448812,0.750317496703657,-0.8372525523443803,Country,0,1,0,[-0.2659940172448...,[81.8584167550731...,[0.81858416755073...,0.0,[0.81858416755073...,0.181415832449269
7684,7875860926956384571,1.3367555579432162,0.262499759615919,1.542938718088682,City,1,0,0,[1.33675555794321...,[70.1208298460789...,[0.70120829846078...,0.0,[0.70120829846078...,0.2987917015392107
3355,1718663060827327339,0.5151084439689695,-0.7996961849347815,2.175468067045149,Country,0,1,0,[0.51510844396896...,[55.7731212691989...,[0.55773121269198...,0.0,[0.55773121269198...,0.4422687873080104
4344,-8637874552457225727,1.6956467653837362,-0.945521641217198,-0.8639522191312855,City,1,0,0,[1.69564676538373...,[80.8287308350542...,[0.80828730835054...,0.0,[0.80828730835054...,0.1917126916494575
3784,-3177612884997717707,0.9220421897516056,1.550548930005398,0.6383819989459883,Country,0,1,0,[0.92204218975160...,[77.2927938999676...,[0.77292793899967...,0.0,[0.77292793899967...,0.2270720610003229
5538,8497217710787736490,0.0581497703921297,1.1856903392169684,0.8213706052182598,City,1,0,0,[0.05814977039212...,[73.5885434938090...,[0.73588543493809...,0.0,[0.73588543493809...,0.2641145650619096
1749,4292528309378731798,0.6513284963527268,0.3168165631959955,0.6160836933665816,City,1,0,0,[0.65132849635272...,[75.7775554502451...,[0.75777555450245...,0.0,[0.75777555450245...,0.2422244454975486
7445,3264293750177743940,0.1847085424540692,-0.2802940363082347,-0.7622539473498987,Intermediate Area,0,0,1,[0.18470854245406...,[81.6230454739033...,[0.81623045473903...,0.0,[0.81623045473903...,0.1837695452609662
6479,3343463163369573891,-1.8851460177226464,-0.2248136997426025,-0.6219523743600517,City,1,0,0,[-1.8851460177226...,[76.7469001663272...,[0.76746900166327...,0.0,[0.76746900166327...,0.232530998336728


## Business Layer

* Apply business logic based on the predicted probabilities.

In [41]:
def apply_business_logic(df: DataFrame) -> DataFrame:
    """ Derive the ``incentive`` column from the ``proba`` column.

    Every row starts with "fallback". The thresholds are then applied in
    ascending order, each one overwriting the previous value when ``proba``
    reaches it, so a row ends up with the label of the highest threshold it
    meets ("v01" from 0.1 up to "v05" from 0.5).
    """
    incentive_tiers = (
        (0.1, "v01"),
        (0.2, "v02"),
        (0.3, "v03"),
        (0.4, "v04"),
        (0.5, "v05"),
    )

    df = df.withColumn("incentive", F.lit("fallback"))
    for threshold, label in incentive_tiers:
        reaches_tier = F.col("proba") >= F.lit(threshold)
        df = df.withColumn(
            "incentive",
            F.when(reaches_tier, label).otherwise(F.col("incentive")),
        )
    return df

In [42]:
# Add the `incentive` column derived from each row's `proba` value.
df = apply_business_logic(df)

In [43]:
df

postal_code,customer_id,n1,n2,n3,region,n4_1,n4_2,n4_3,features,rawPrediction,probability,prediction,probability_vec,proba,incentive
1970,4974467801682041986,1.0389522417417338,0.1630470071682174,-0.0716189312696505,Intermediate Area,0,0,1,[1.03895224174173...,[82.3269009255007...,[0.82326900925500...,0.0,[0.82326900925500...,0.1767309907449923,v01
1645,-858641559787057159,-0.2659940172448812,0.750317496703657,-0.8372525523443803,Country,0,1,0,[-0.2659940172448...,[81.8584167550731...,[0.81858416755073...,0.0,[0.81858416755073...,0.181415832449269,v01
7684,7875860926956384571,1.3367555579432162,0.262499759615919,1.542938718088682,City,1,0,0,[1.33675555794321...,[70.1208298460789...,[0.70120829846078...,0.0,[0.70120829846078...,0.2987917015392107,v02
3355,1718663060827327339,0.5151084439689695,-0.7996961849347815,2.175468067045149,Country,0,1,0,[0.51510844396896...,[55.7731212691989...,[0.55773121269198...,0.0,[0.55773121269198...,0.4422687873080104,v04
4344,-8637874552457225727,1.6956467653837362,-0.945521641217198,-0.8639522191312855,City,1,0,0,[1.69564676538373...,[80.8287308350542...,[0.80828730835054...,0.0,[0.80828730835054...,0.1917126916494575,v01
3784,-3177612884997717707,0.9220421897516056,1.550548930005398,0.6383819989459883,Country,0,1,0,[0.92204218975160...,[77.2927938999676...,[0.77292793899967...,0.0,[0.77292793899967...,0.2270720610003229,v02
5538,8497217710787736490,0.0581497703921297,1.1856903392169684,0.8213706052182598,City,1,0,0,[0.05814977039212...,[73.5885434938090...,[0.73588543493809...,0.0,[0.73588543493809...,0.2641145650619096,v02
1749,4292528309378731798,0.6513284963527268,0.3168165631959955,0.6160836933665816,City,1,0,0,[0.65132849635272...,[75.7775554502451...,[0.75777555450245...,0.0,[0.75777555450245...,0.2422244454975486,v02
7445,3264293750177743940,0.1847085424540692,-0.2802940363082347,-0.7622539473498987,Intermediate Area,0,0,1,[0.18470854245406...,[81.6230454739033...,[0.81623045473903...,0.0,[0.81623045473903...,0.1837695452609662,v01
6479,3343463163369573891,-1.8851460177226464,-0.2248136997426025,-0.6219523743600517,City,1,0,0,[-1.8851460177226...,[76.7469001663272...,[0.76746900166327...,0.0,[0.76746900166327...,0.232530998336728,v02


# Let's move to the cloud.

So let's do this, but on a real cluster.

<div style="text-align:center">
<img src="img/s3_glue.drawio.png">
</div>

In [44]:
# Shut down the Spark session now that the local run is finished.
spark_session.stop()