## Absenteeism at Work

Dataset Source: https://www.kaggle.com/datasets/kewagbln/absenteeism-at-work-uci-ml-repositiory

##### Import Necessary Libraries

In [0]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, DoubleType, FloatType, ArrayType

from pyspark.ml import Pipeline

from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

##### Versions of Libraries, Modules, Frameworks Used in This Project

In [0]:
print("Apache Spark version:", spark.version)

Apache Spark version: 3.3.1


#### Create Functions Used Throughout This Project

##### Create Function to Ingest Dataset

In [0]:
def ingest_dataset(file_location: str, \
                   schema: StructType \
                  ) -> pyspark.sql.dataframe.DataFrame:
    '''
    This function returns a dataset aligned with the schema
    '''
    file_type = "csv"
    infer_schema = "false"
    first_row_is_header = "true"
    delimiter = ";"
    
    df = spark.read.format(file_type) \
      .option("inferSchema", infer_schema) \
      .option("header", first_row_is_header) \
      .option("sep", delimiter) \
      .schema(schema)\
      .load(file_location)
    
    return df

##### Prepare Dataset for Analysis

In [0]:
def prepare_data(dataset: pyspark.sql.dataframe.DataFrame, \
                         categorical_columns: [str], \
                         numerical_columns: [str], \
                         label = "label" \
                ) -> {}:
    '''
    This function creates the pipeline stages for classification analysis.
    '''
    from pyspark.ml.feature import IndexToString, OneHotEncoder, StringIndexer, VectorAssembler
    # Define variables used within function
    stages = []
        
    # Prepare categorical features
    for cat in categorical_columns:
        stringIndexer = StringIndexer(inputCol = cat, outputCol = cat + "_index")
        encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],\
                                outputCols=[cat + "_class_vec"])
        stages += [stringIndexer]
        stages += [encoder]
    
    # Use the vector assembler
    assembler_inputs = [c + "_class_vec" for c in categorical_columns] + numerical_columns
    assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
    stages += [assembler]
    
    # using the pipeline created above, prepare the data for the model
    data_pipe = Pipeline().setStages(stages)
    data_fitted = data_pipe.fit(dataset)
    prepped_ds = data_fitted.transform(dataset)
    
    return prepped_ds

##### Create Metrics Evaluation Function

In [0]:
def regression_evaluation_spark(dataset: pyspark.sql.dataframe.DataFrame, \
                                metrics: [str], \
                                model_name: str, \
                                label_col: str = "label", \
                                predictionCol: str = "prediction" \
                               ) -> None:
    '''
    Calculate & display metrics for a multiclass classification analysis.
    '''
    print("+---------------------------------------------+")
    print("|  " + model_name.center(41) + "  |")
    print("+---------------------------------------------+")
    print("|   %s  |  %s   |" % ("Metric".rjust(20), "Value".ljust(14)))
    print("+---------------------------------------------+")
    for x in metrics:
        evaluator = RegressionEvaluator(labelCol=label_col, \
                                                      predictionCol=predictionCol, \
                                                      metricName=x) 
        score = evaluator.evaluate(dataset)
        print("|   %s  |  %s   |" % (x.rjust(20), str(round(score, 6)).ljust(14)))
        print("+---------------------------------------------+")

#### Ingest & Preprocess Dataset

##### Ingest Dataset

In [0]:
orig_schema = StructType([
    StructField("id", StringType(), True),
    StructField("reason_for_absence", IntegerType(), True),
    StructField("month_of_absence", IntegerType(), True),
    StructField("day_of_week", IntegerType(), True),
    StructField("seasons", IntegerType(), True),
    StructField("transportation_expense", IntegerType(), True),
    StructField("home_to_work_distance", IntegerType(), True),
    StructField("service_time", IntegerType(), True),
    StructField("age", IntegerType(), True),
    StructField("daily_workload", DoubleType(), True),
    StructField("hit_target", IntegerType(), True),
    StructField("disciplinary_failure", IntegerType(), True),
    StructField("edu", IntegerType(), True),
    StructField("number_of_kids", IntegerType(), True),
    StructField("social_drinker", IntegerType(), True),
    StructField("social_smoker", IntegerType(), True),
    StructField("num_of_pets", IntegerType(), True),
    StructField("weight", IntegerType(), True),
    StructField("height", IntegerType(), True),
    StructField("bmi", IntegerType(), True),
    StructField("label", IntegerType(), True)
])

data_file = "/FileStore/tables/Absenteeism_at_work.csv"

df = ingest_dataset(data_file, orig_schema)

df = df.drop("id")

display(df)

reason_for_absence,month_of_absence,day_of_week,seasons,transportation_expense,home_to_work_distance,service_time,age,daily_workload,hit_target,disciplinary_failure,edu,number_of_kids,social_drinker,social_smoker,num_of_pets,weight,height,bmi,label
26,7,3,1,289,36,13,33,239.554,97,0,1,2,1,0,1,90,172,30,4
0,7,3,1,118,13,18,50,239.554,97,1,1,1,1,0,0,98,178,31,0
23,7,4,1,179,51,18,38,239.554,97,0,1,0,1,0,0,89,170,31,2
7,7,5,1,279,5,14,39,239.554,97,0,1,2,1,1,0,68,168,24,4
23,7,5,1,289,36,13,33,239.554,97,0,1,2,1,0,1,90,172,30,2
23,7,6,1,179,51,18,38,239.554,97,0,1,0,1,0,0,89,170,31,2
22,7,6,1,361,52,3,28,239.554,97,0,1,1,1,0,4,80,172,27,8
23,7,6,1,260,50,11,36,239.554,97,0,1,4,1,0,0,65,168,23,4
19,7,2,1,155,12,14,34,239.554,97,0,1,2,1,0,0,95,196,25,40
22,7,2,1,235,11,14,37,239.554,97,0,3,1,0,0,1,88,172,29,8


##### Categorize Features for Pipeline

In [0]:
# Numerical Features
numerical = ["transportation_expense", "home_to_work_distance", "service_time", "age", "daily_workload", "hit_target", "weight", "height", "bmi"]

# Categorical Features
categories = [x for x in df.columns if x not in numerical]

##### Prepare Dataset for Gradient Boosted Tree Regressor Algorithm

In [0]:
prepped_ds = prepare_data(df, \
                         categories, \
                         numerical, \
                         label = "label")

##### Split Training Dataset into Training & Testing Datasets

In [0]:
train_ds, test_ds = prepped_ds.randomSplit(weights=[0.80, 0.20], seed=42)

train_ds = train_ds.persist()
test_ds = test_ds.persist()

print(f"There are {train_ds.count()} samples in the training dataset.")
print(f"There are {test_ds.count()} samples in the testing dataset.")

There are 622 samples in the training dataset.
There are 118 samples in the testing dataset.


##### Instantiate Instance of Gradient Boosted Tree Regressor Algorithm

In [0]:
gbtr = GBTRegressor(lossType='squared', seed=42, maxIter=100, stepSize=0.01, maxDepth=8)

##### Fit/Train Model Using Training Dataset

In [0]:
gbtr_model = gbtr.fit(train_ds)

##### Generate Predictions Using Testing Dataset

In [0]:
predictions = gbtr_model.transform(test_ds)

##### Prepare Predictions For Metrics Evaluation Function

In [0]:
preds = predictions.select(F.col('label'), F.col('prediction'))

preds = preds.persist()

metrics_to_eval = ["rmse", "mse", "mae"]

regression_evaluation_spark(preds, \
                                metrics_to_eval, \
                                "Absenteeism @ Work")

+---------------------------------------------+
|              Absenteeism @ Work             |
+---------------------------------------------+
|                 Metric  |  Value            |
+---------------------------------------------+
|                   rmse  |  0.788754         |
+---------------------------------------------+
|                    mse  |  0.622133         |
+---------------------------------------------+
|                    mae  |  0.456806         |
+---------------------------------------------+


##### End Spark Session

In [0]:
train_ds = train_ds.unpersist()
test_ds = test_ds.unpersist()
preds = preds.unpersist()

spark.stop()

### Notes & Other Takeaways From This Project
****
- Even though the model has minimized the loss to under an hour (30-48 minutes), I would not run this project in production yet. My biggest concern is overfitting. This is because the sample size for this dataset is smaller and I (admittedly) tuned the parameters a bit aggressively. 
****
- Unfortunately, the GBT Regressor in PySpark does not support a training summary (like Logistic Regression). If I had the information and metrics from the training summary and could see that the training and testing metrics were similar, it would inspire more confidence in the current iteration of this project!
****