In [55]:
# Make IPython display the value of every top-level expression in a cell, not just
# the last one, so results can be inspected without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [56]:
# Project Hadron
from ds_discovery import SyntheticBuilder, Transition, Wrangle, Controller, Commons

In [57]:
# Data Science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Transition

In [79]:
# Create the Transition component for the "telco_churn" task from environment settings.
# NOTE(review): has_contract=False presumably allows creation when no saved
# contract exists yet - confirm against the ds_discovery documentation.
tr = Transition.from_env("telco_churn", has_contract=False)
# Source: the Kaggle Telco Customer Churn CSV, read from S3.
tr.set_source_uri('s3://lk-datasets/kaggle/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Called with no arguments, so the persist location is whatever the library defaults to.
tr.set_persist()
tr.set_description("Telco Churn Dataset")

In [80]:
# Load the raw source data; the canonical report that follows lists 21 attributes and 7043 rows.
df = tr.load_source_canonical()

In [81]:
# Per-attribute summary of the raw data: dtype, % null, % dominant value, count,
# unique count and sample observations.
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attributes (21),dType,%_Null,%_Dom,Count,Unique,Observations
0,Churn,object,0.0,0.735,7043,2,Sample: No | Yes
1,Contract,object,0.0,0.55,7043,3,Sample: Month-to-month | Two year | One year
2,Dependents,object,0.0,0.7,7043,2,Sample: No | Yes
3,DeviceProtection,object,0.0,0.439,7043,3,Sample: No | Yes | No internet service
4,InternetService,object,0.0,0.44,7043,3,Sample: Fiber optic | DSL | No
5,MonthlyCharges,float64,0.0,0.009,7043,1585,max=118.75 | min=18.25 | mean=64.76 | dominant=20.05
6,MultipleLines,object,0.0,0.481,7043,3,Sample: No | Yes | No phone service
7,OnlineBackup,object,0.0,0.438,7043,3,Sample: No | Yes | No internet service
8,OnlineSecurity,object,0.0,0.497,7043,3,Sample: No | Yes | No internet service
9,PaperlessBilling,object,0.0,0.592,7043,2,Sample: Yes | No


In [82]:
# Tidy headers and types, based on what the canonical report showed.
# The three lower-case source headers are renamed to the capitalised style of the rest.
header_renames = {
    'customerID': 'CustomerID',
    'gender': 'Gender',
    'tenure': 'Tenure',
}
df = tr.tools.auto_clean_header(df, rename_map=header_renames)
# TotalCharges is cast to float explicitly.
# NOTE(review): presumably it loads as text - confirm in the full report.
df = tr.tools.to_float_type(df, headers='TotalCharges')

In [84]:
# Convert Yes/No columns to booleans to guarantee a 0/1 distribution.
# A column qualifies when it has exactly two distinct values and one of them is 'Yes'.
yes_no_headers = [
    header for header in df.columns
    if df[header].nunique() == 2 and 'Yes' in df[header].unique().tolist()
]
df = tr.tools.to_bool_type(df, headers=yes_no_headers, bool_map={'Yes': True})

In [85]:
# Run the Transition component's pipeline.
# NOTE(review): presumably this replays the tools calls recorded above against the
# source and writes the result to the persist location - confirm.
tr.run_component_pipeline()

-----------------
### Synthetic

In [86]:
# Create the SyntheticBuilder component under the same "telco_churn" task name as the Transition.
builder = SyntheticBuilder.from_env("telco_churn", has_contract=False)

In [87]:
# Output: the builder's persist location, using the library defaults.
builder.set_persist()

# Input: register the Transition component's persisted (cleaned) data as 'clean_sample'.
clean_sample_uri = tr.from_env("telco_churn").get_persist_contract().uri
builder.add_connector_uri('clean_sample', uri=clean_sample_uri)

# Load the cleaned data through that connector.
sample = builder.load_canonical('clean_sample')

In [88]:
# Features: every column except the target and the CustomerID reference column.
X = sample.drop(columns=['Churn', 'CustomerID'])
# Target: the churn flag.
y = sample['Churn']

In [122]:
# Synthesise `size` rows from the cleaned sample.
size = 1000
# Each feature column is requested with an empty options dict.
feature_spec = {header: {} for header in X.columns}

# Churn is the target, so it is listed first, ahead of the feature columns.
df = builder.tools.model_analysis(
    size,
    other='clean_sample',
    columns_list=[{'Churn': {}}, feature_spec],
    column_name='analysis',
)
df.shape

(1000, 20)

In [123]:
# Rebuild CustomerID as a unique reference: `size` numbers between 1000000 and 9999999.
# NOTE(review): at_most=1 presumably limits each value to one occurrence, which is
# what makes the ids unique - confirm against the SyntheticBuilder documentation.
df['CustomerID'] = builder.tools.get_number(from_value=1000000, to_value=9999999, at_most=1, size=size, column_name='CustomerID')

In [124]:
# Run the builder pipeline asking for 10,000 rows: here `canonical` is a row count,
# not a DataFrame. The Wrangle report further down shows a count of 10000.
builder.run_component_pipeline(canonical=10000)

## Feature engineering

In [125]:
# Create the Wrangle (feature engineering) component under the same "telco_churn" task name.
wr = Wrangle.from_env("telco_churn", has_contract=False)

In [126]:
# Chain the components: Wrangle reads what the SyntheticBuilder persisted.
synthetic_uri = builder.from_env("telco_churn").get_persist_contract().uri
wr.set_source_uri(uri=synthetic_uri)
# Output goes to the library's default persist location.
wr.set_persist()

In [127]:
# Load the Wrangle source, i.e. the SyntheticBuilder's persisted output.
# Note that this rebinds `df`, which previously held the in-notebook synthetic sample.
df = wr.load_source_canonical()

In [128]:
# Target: the churn flag.
y = df['Churn']
# Predictors: everything except the target and the CustomerID reference column.
X = df.drop(columns=['Churn', 'CustomerID'])

In [129]:
# Peek at the target; the output below shows it carried as bool.
y.head()

0    False
1    False
2    False
3    False
4     True
Name: Churn, dtype: bool

In [139]:
# One-hot encode every object-typed predictor, dropping the first level of each
# so that k categories become k-1 indicator columns.
# Warning: this increases the dimensionality of the frame.
# NOTE(review): the outputs shown in this notebook list the predictors as bool and
# contain no encoded columns, so this filter may be selecting nothing - confirm.
categorical_headers = Commons.filter_headers(X, dtype='object')
df = wr.tools.model_encode_one_hot(
    df,
    headers=categorical_headers,
    drop_first=True,
    column_name='one_hot_k-1',
)

In [140]:
# Inspect the first rows after the encoding step.
df.head()

Unnamed: 0,CustomerID,Churn,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,...,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,1615599,False,False,False,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
1,9308799,False,False,False,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
2,2783349,False,False,False,False,True,False,False,True,False,...,True,True,True,True,True,False,False,False,False,False
3,7512399,False,False,False,False,True,False,True,True,False,...,True,True,True,True,True,False,False,False,False,False
4,5753349,True,False,False,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False


In [142]:
# Normalise each float predictor with bounds (-1, 1), writing the result back
# over the original column.
float_headers = Commons.filter_headers(X, dtype='float')
for c in float_headers:
    df[c] = wr.tools.correlate_numbers(
        df,
        header=c,
        normalize=(-1.0, 1.0),
        column_name=f'{c}_norm',
    )

In [132]:
# Per-attribute summary of the wrangled frame, in the same layout as the Transition report.
wr.canonical_report(df, stylise=False)

Unnamed: 0,Attributes (21),dType,%_Null,%_Dom,Count,Unique,Observations
0,Churn,bool,0.0,0.732,10000,2,False | True
1,Contract,bool,0.0,1.0,10000,1,False
2,CustomerID,int64,0.0,0.0,10000,10000,"max=9999549 | min=1000449 | mean=5480465.67 | dominant=[1000449, 1000899]"
3,Dependents,bool,0.0,0.7,10000,2,False | True
4,DeviceProtection,bool,0.0,0.656,10000,2,False | True
5,Gender,bool,0.0,1.0,10000,1,False
6,InternetService,bool,0.0,1.0,10000,1,False
7,MonthlyCharges,bool,0.0,1.0,10000,1,False
8,MultipleLines,bool,0.0,0.582,10000,2,False | True
9,OnlineBackup,bool,0.0,0.655,10000,2,False | True


## Controller

In [133]:
# Create the Controller that runs the registered components; unlike the other
# components, no task name is passed here.
controller = Controller.from_env(has_contract=False)

In [134]:
# register the components
controller.intent_model.transition(canonical=0, task_name="telco_churn", intent_level='telco_churn_tr')
controller.intent_model.synthetic_builder(canonical=0, task_name="telco_churn", intent_level='telco_churn_sb')
controller.intent_model.wrangle(canonical=0, task_name="telco_churn", intent_level='telco_churn_wr')

In [135]:
# List the registered tasks: level, order, component, task, parameters and creator.
controller.report_tasks()

Unnamed: 0,level,order,component,task,parameters,creator
0,telco_churn_sb,0,SyntheticBuilder,'telco_churn',[],lkrishna
1,telco_churn_tr,0,Transition,'telco_churn',[],lkrishna
2,telco_churn_wr,0,Wrangle,'telco_churn',[],lkrishna


In [136]:
# run the controller and load the run report
controller.run_controller(run_cycle_report='cycle_report.csv')
controller.load_canonical(connector_name='run_cycle_report')

Unnamed: 0,time,text
0,2022-12-14 15:18:37.133788,start run-cycle 0
1,2022-12-14 15:18:37.134279,start task cycle 0
2,2022-12-14 15:18:37.135034,running telco_churn_tr
3,2022-12-14 15:18:39.129411,"canonical shape is (7043, 21)"
4,2022-12-14 15:18:39.130494,running telco_churn_sb
5,2022-12-14 15:18:39.181718,"canonical shape is (1000, 21)"
6,2022-12-14 15:18:39.182548,running telco_churn_wr
7,2022-12-14 15:18:39.188143,"canonical shape is (1000, 21)"
8,2022-12-14 15:18:39.188711,tasks complete
9,2022-12-14 15:18:39.189168,end of report


In [116]:
# Show where the Wrangle component persists its output.
# NOTE(review): has_contract is not passed here, so this presumably relies on the
# contract created earlier in the notebook already existing - confirm.
Wrangle.from_env("telco_churn").get_persist_contract().uri

'./hadron/data/hadron_wrangle_telco_churn_primary_persist.pickle'