In [0]:
# %pip install --quiet databricks-sdk==0.40.0 databricks-feature-engineering==0.8.0 mlflow==2.22.0
# dbutils.library.restartPython()

In [0]:
%run ../_resources/00-setup $reset_all_data=false

In [0]:
import seaborn as sns
import numpy as np
import pandas as pd
# import os
# import requests
# import json


## Data exploration and analysis

Let's review our dataset and start analyzing the data we have to predict which turbines are damaged.

In [0]:
def plot(sensor_report, sensor="sensor_B", max_points=500):
  """Plot the earliest readings of one sensor for a turbine with the given sensor report.

  Picks one turbine from `turbine_training_dataset` whose `abnormal_sensor`
  equals `sensor_report`, loads up to `max_points` of its rows from
  `sensor_bronze` ordered by `timestamp`, and shows a line chart of `sensor`.

  Args:
    sensor_report: value matched against `abnormal_sensor` (e.g. 'ok' or 'sensor_B').
    sensor: name of the `sensor_bronze` column drawn on the y axis.
    max_points: maximum number of rows (ordered by timestamp) to plot.

  Raises:
    IndexError: if no turbine has `abnormal_sensor` equal to `sensor_report`.
  """
  # No ordering is applied here, so which matching turbine gets picked is arbitrary.
  # first() returns None on an empty result instead of failing with a message-less IndexError.
  sample_row = (spark.table('turbine_training_dataset')
                     .where(f"abnormal_sensor = '{sensor_report}'")
                     .first())
  if sample_row is None:
    raise IndexError(f"No turbine found with abnormal_sensor = '{sensor_report}'")
  turbine_id = sample_row['turbine_id']

  # Explore the readings of that turbine with pandas on spark.
  readings = (spark.table('sensor_bronze')
                   .where(f"turbine_id == '{turbine_id}'")
                   .orderBy('timestamp')
                   .limit(max_points)
                   .pandas_api())
  readings.plot(x="timestamp", y=[sensor], kind="line", title=f'Sensor report: {sensor_report}').show()
# Plot sensor_B readings for one turbine whose abnormal_sensor value is 'ok'.
plot('ok')

In [0]:
# Plot sensor_B readings for one turbine whose abnormal_sensor value is 'sensor_B'.
plot('sensor_B')

As these graphs show, there is a clear anomaly in the readings we get from sensor B. Let's continue our exploration and use the std (standard deviation) we computed in our main feature table.


In [0]:
# Load the turbine training dataset and add a boolean `damaged` column:
# True for every row whose abnormal_sensor is anything other than 'ok'.
turbine_dataset = (
    spark.table('turbine_training_dataset')
         .withColumn('damaged', col('abnormal_sensor') != 'ok')
)
display(turbine_dataset)

In [0]:

# Pair plot of two sensor standard deviations and the average energy, coloured by the `damaged` flag.
# - Sample ~1% of the rows with a fixed seed so a re-run should draw the same rows (for unchanged input data).
# - Select the plotted columns in Spark, before toPandas(), so only those columns are collected to the driver.
pair_plot_pdf = (
    turbine_dataset
      .sample(fraction=0.01, seed=42)
      .select('std_sensor_A', 'std_sensor_E', 'damaged', 'avg_energy')
      .toPandas()
)
g = sns.PairGrid(pair_plot_pdf, diag_sharey=False, hue="damaged")
# Lower triangle and diagonal: density estimates; upper triangle: scatter with regression fit.
g.map_lower(sns.kdeplot).map_diag(sns.kdeplot, lw=3).map_upper(sns.regplot).add_legend()

### Further data analysis and preparation using pandas API

Because our Data Scientist team is familiar with Pandas, we'll use `pandas on spark` to scale `pandas` code. The Pandas instructions will be converted in the spark engine under the hood and distributed at scale.

Typically, a Data Science project would involve more advanced preparation and likely require extra data prep steps, including more complex feature preparation. We'll keep it simple for this demo.

*Note: Starting from `spark 3.2`, koalas is built in and we can get a pandas-on-Spark DataFrame using `pandas_api()`.*

In [0]:
 # Convert to pandas (koalas)
dataset = turbine_dataset.pandas_api()

# Columns kept as ML model features.
# Note: the percentiles_sensor_A/B/C.. features are left out to keep the demo simple.
columns = [
    "turbine_id", "hourly_timestamp", "avg_energy",
    "std_sensor_A", "std_sensor_B", "std_sensor_C",
    "std_sensor_D", "std_sensor_E", "std_sensor_F",
    "location", "model", "state",
    "abnormal_sensor",
]

# Keep only those columns, then discard every row that has a missing value in any of them.
dataset = dataset[columns].dropna()
display(dataset)

In [0]:
# Fully-qualified name of the feature table, shared by the DROP and the write
# so both always target the same table regardless of the session's current catalog/schema.
feature_table = f'{catalog}.{db}.turbine_hourly_features'

# saveAsTable below uses the default save mode, which errors if the table already exists,
# so remove any previous version first to keep this cell re-runnable.
spark.sql(f'drop table if exists {feature_table}')

# Keep a single row per (turbine_id, hourly_timestamp) before persisting.
dataset.drop_duplicates(subset=['turbine_id', 'hourly_timestamp']).to_spark().write.saveAsTable(feature_table)

In [0]:
# Read the saved feature table back and display it to check what was written.
df = spark.table('{}.{}.turbine_hourly_features'.format(catalog, db))
display(df)