In [3]:
import pandas as pd
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
import configparser
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [13]:
def init_config():
    config = configparser.ConfigParser()
    config.read("../config.ini")
    return config

def download_blob(config):
    """Downloads a blob from the bucket."""
    
    # Initialize a storage client
    path = config['gcp']['raw_data']
    df = pd.read_csv(path)
    return df

config = init_config()
df = download_blob(config)

In [3]:
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    df[column] = df[column].str.replace(' ', '_').str.lower()

In [4]:
numerical_columns = df.select_dtypes(include=['int64'])

In [5]:
def create_and_replace_bins(df, columns, num_bins=5):
    binned_df = df.copy()
    for col in columns:
        col_range = df[col].max() - df[col].min()
        if col_range > 5:
            # Calculate quantile-based bins
            _, bins = pd.qcut(df[col], q=num_bins, duplicates='drop', retbins=True)
            # Adjust bins to integer values
            bins = np.floor(bins).astype(int)
            bins[-1] = bins[-1] + 1  # Ensure the last bin is inclusive
            # Create labels based on bin edges
            labels = [f'{bins[i]}<=x<{bins[i+1]}' for i in range(len(bins)-1)]
            # Apply binning and replace original column
            binned_df[col] = pd.cut(df[col], bins=bins, labels=labels, right=False, include_lowest=True)
    return binned_df

# Apply dynamic binning and replace columns with integer bins where range is > 5
df = create_and_replace_bins(df, numerical_columns, num_bins=5)

In [6]:
def get_binned_value_counts(df, columns):
    value_counts_dict = {}
    for col in columns:
        value_counts_dict[col] = df[col].value_counts().to_dict()
    return value_counts_dict

# Get value counts for each binned column
get_binned_value_counts(df, numerical_columns)


{'Age': {'29<=x<34': 316,
  '45<=x<61': 314,
  '38<=x<45': 308,
  '34<=x<38': 274,
  '18<=x<29': 258},
 'DailyRate': {'391<=x<656': 296,
  '1224<=x<1500': 296,
  '942<=x<1224': 294,
  '102<=x<391': 292,
  '656<=x<942': 292},
 'DistanceFromHome': {'2<=x<5': 359,
  '9<=x<17': 318,
  '17<=x<30': 297,
  '5<=x<9': 288,
  '1<=x<2': 208},
 'Education': {3: 572, 4: 398, 2: 282, 1: 170, 5: 48},
 'EmployeeCount': {1: 1470},
 'EmployeeNumber': {'1654<=x<2069': 295,
  '1<=x<402': 294,
  '402<=x<814': 294,
  '1235<=x<1654': 294,
  '814<=x<1235': 293},
 'EnvironmentSatisfaction': {3: 453, 4: 446, 2: 287, 1: 284},
 'HourlyRate': {'73<=x<87': 310,
  '45<=x<59': 301,
  '87<=x<101': 300,
  '30<=x<45': 282,
  '59<=x<73': 277},
 'JobInvolvement': {3: 868, 2: 375, 4: 144, 1: 83},
 'JobLevel': {1: 543, 2: 534, 3: 218, 4: 106, 5: 69},
 'JobSatisfaction': {4: 459, 3: 442, 1: 289, 2: 280},
 'MonthlyIncome': {'2695<=x<4228': 295,
  '5743<=x<9860': 295,
  '9860<=x<20000': 294,
  '1009<=x<2695': 293,
  '4228<=x<5

In [7]:
df['Attrition'] = df['Attrition'].map(lambda x : 0 if x == 'no' else 1) 


In [8]:
df.select_dtypes(include=['object'])

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,travel_rarely,sales,life_sciences,female,sales_executive,single,y,yes
1,travel_frequently,research_&_development,life_sciences,male,research_scientist,married,y,no
2,travel_rarely,research_&_development,other,male,laboratory_technician,single,y,yes
3,travel_frequently,research_&_development,life_sciences,female,research_scientist,married,y,yes
4,travel_rarely,research_&_development,medical,male,laboratory_technician,married,y,no
...,...,...,...,...,...,...,...,...
1465,travel_frequently,research_&_development,medical,male,laboratory_technician,married,y,no
1466,travel_rarely,research_&_development,medical,male,healthcare_representative,married,y,no
1467,travel_rarely,research_&_development,life_sciences,male,manufacturing_director,married,y,yes
1468,travel_frequently,sales,medical,male,sales_executive,married,y,no


In [9]:
#I would get rid of Marital Status and Gender -- whether or not keeping these featurs is based on the context 
#In this case you want to ensure that decisions are not influenced by gender or marital status, hence promoting fairness and ethics 
# you need to check potential correlation with other features and the target variable

# Encoding categorical variables
df['Gender'] = df['Gender'].map({'male': 1, 'female': 0})
df['MaritalStatus'] = df['MaritalStatus'].map({'single': 0, 'married': 1, 'divorced': 2})

# Contingency tables
contingency_table_gender = pd.crosstab(df['Gender'], df['Attrition'])
contingency_table_marital = pd.crosstab(df['MaritalStatus'], df['Attrition'])

print("Contingency Table - Gender vs Target:")
print(contingency_table_gender)

print("\nContingency Table - Marital Status vs Target:")
print(contingency_table_marital)

# Chi-Square tests
chi2_gender, p_gender, dof_gender, ex_gender = chi2_contingency(contingency_table_gender)
chi2_marital, p_marital, dof_marital, ex_marital = chi2_contingency(contingency_table_marital)

print(f"\nChi-Square Test between gender and target: chi2 = {chi2_gender}, p-value = {p_gender}")
print(f"Chi-Square Test between marital_status and target: chi2 = {chi2_marital}, p-value = {p_marital}")

# Interpretation
alpha = 0.05
print("\nInterpretation:")
if p_gender < alpha:
    print("There is a significant association between gender and target (p < 0.05).")
else:
    print("There is no significant association between gender and target (p >= 0.05).")

if p_marital < alpha:
    print("There is a significant association between marital status and target (p < 0.05).")
else:
    print("There is no significant association between marital status and target (p >= 0.05).")

Contingency Table - Gender vs Target:
Attrition    0    1
Gender             
0          501   87
1          732  150

Contingency Table - Marital Status vs Target:
Attrition        0    1
MaritalStatus          
0              350  120
1              589   84
2              294   33

Chi-Square Test between gender and target: chi2 = 1.1169671241970975, p-value = 0.29057244902890855
Chi-Square Test between marital_status and target: chi2 = 46.163676540848705, p-value = 9.45551106034083e-11

Interpretation:
There is no significant association between gender and target (p >= 0.05).
There is a significant association between marital status and target (p < 0.05).


In [10]:
df.drop(columns=['Gender','StandardHours', 'Over18', 'EmployeeCount'], inplace=True)

In [11]:
# Business Travel

df['BusinessTravel'].value_counts()
# in this case we will use label encoding -- you should check the reason !! we could have choosen one-hot but we didn't ...
df['BusinessTravel'] = df['BusinessTravel'].apply(lambda x: 0 if x =='Non-Travel' else (1 if x == 'Travel_Rarely' else 2))

# Department

df['Department'].value_counts()
department_dummies = pd.get_dummies(df['Department'], prefix='Department', dtype=float) # why now we used one-hot encoding and not label encoding?
df.drop(columns=['Department'], inplace=True)
df = pd.concat([df,department_dummies], axis=1)

# Education Field and JobRole

for field in ['JobRole', 'EducationField', 'Age']:

    df[field].value_counts()
    lb = preprocessing.LabelBinarizer()
    new_data = lb.fit_transform(df[field])
    binary_df = pd.DataFrame(new_data, columns=lb.classes_)
    df.drop(columns=[field], inplace=True)
    pd.concat([df,binary_df], axis=1)


# OverTime

df['OverTime'] = df['OverTime'].map({'Yes': 1, 'No': 0})

In [12]:
df.head()

Unnamed: 0,Attrition,BusinessTravel,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Department_human_resources,Department_research_&_development,Department_sales
0,1,2,942<=x<1224,1<=x<2,2,1<=x<402,2,87<=x<101,3,2,...,8<=x<10,0<=x<2,1,5<=x<7,4<=x<7,0<=x<1,4<=x<7,0.0,0.0,1.0
1,0,2,102<=x<391,5<=x<9,1,1<=x<402,3,59<=x<73,2,2,...,10<=x<17,3<=x<4,3,10<=x<41,7<=x<19,1<=x<4,7<=x<18,0.0,1.0,0.0
2,1,2,1224<=x<1500,2<=x<5,2,1<=x<402,4,87<=x<101,2,1,...,5<=x<8,3<=x<4,3,0<=x<2,0<=x<1,0<=x<1,0<=x<1,0.0,1.0,0.0
3,0,2,1224<=x<1500,2<=x<5,4,1<=x<402,4,45<=x<59,3,1,...,8<=x<10,3<=x<4,3,7<=x<10,7<=x<19,1<=x<4,0<=x<1,0.0,1.0,0.0
4,0,2,391<=x<656,2<=x<5,1,1<=x<402,1,30<=x<45,3,1,...,5<=x<8,3<=x<4,3,2<=x<5,2<=x<4,1<=x<4,2<=x<4,0.0,1.0,0.0


In [15]:
df.to_csv(config['gcp']['cloud_storage'], index=False)

In [4]:
def init_config():
    config = configparser.ConfigParser()
    config.read("../config.ini")
    return config

config = init_config()

In [6]:
# Will show you how to set up automl 

# First create a dataset 

import os

from google.cloud import aiplatform

PROJECT_ID = config['gcp']['project']
REGION = config['gcp']['region']

aiplatform.init(project=PROJECT_ID, location=REGION)

dataset = aiplatform.TabularDataset.create(
    display_name="HR Analytics",
    gcs_source=config['gcp']['cloud_storage'],
)

label_column = "Attrition"

print(dataset.resource_name)
     

Creating TabularDataset
Create TabularDataset backing LRO: projects/928543623034/locations/europe-west2/datasets/2285277743727771648/operations/1902197447250149376
TabularDataset created. Resource name: projects/928543623034/locations/europe-west2/datasets/2285277743727771648
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/928543623034/locations/europe-west2/datasets/2285277743727771648')
projects/928543623034/locations/europe-west2/datasets/2285277743727771648


In [7]:
job = aiplatform.AutoMLTabularTrainingJob(
  display_name="train-automl-hr-analytics",
  optimization_prediction_type="classification",
  optimization_objective="maximize-au-prc",
)

model = job.run(
    dataset=dataset,
    target_column=label_column,
    training_fraction_split=0.6,
    validation_fraction_split=0.2,
    test_fraction_split=0.2,
    budget_milli_node_hours=1000,
    model_display_name="test",
    disable_early_stopping=False,
)

No column transformations provided, so now retrieving columns from dataset in order to set default column transformations.
The column transformation of type 'auto' was set for the following columns: ['NumCompaniesWorked', 'YearsAtCompany', 'Department', 'MonthlyRate', 'Over18', 'EnvironmentSatisfaction', 'TrainingTimesLastYear', 'StandardHours', 'Age', 'WorkLifeBalance', 'MonthlyIncome', 'TotalWorkingYears', 'Education', 'EducationField', 'JobInvolvement', 'EmployeeCount', 'OverTime', 'EmployeeNumber', 'RelationshipSatisfaction', 'MaritalStatus', 'Gender', 'YearsInCurrentRole', 'DistanceFromHome', 'JobRole', 'PercentSalaryHike', 'YearsWithCurrManager', 'JobSatisfaction', 'DailyRate', 'StockOptionLevel', 'YearsSinceLastPromotion', 'PerformanceRating', 'HourlyRate', 'BusinessTravel', 'JobLevel'].
View Training:
https://console.cloud.google.com/ai/platform/locations/europe-west2/training/437982485465268224?project=928543623034
AutoMLTabularTrainingJob projects/928543623034/locations/europ

In [8]:
endpoint = model.deploy(
    machine_type='n1-standard-4'
)

Creating Endpoint
Create Endpoint backing LRO: projects/928543623034/locations/europe-west2/endpoints/1914913848981192704/operations/6745818846487117824
Endpoint created. Resource name: projects/928543623034/locations/europe-west2/endpoints/1914913848981192704
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/928543623034/locations/europe-west2/endpoints/1914913848981192704')
Deploying model to Endpoint : projects/928543623034/locations/europe-west2/endpoints/1914913848981192704
Deploy Endpoint model backing LRO: projects/928543623034/locations/europe-west2/endpoints/1914913848981192704/operations/4439975837273423872
Endpoint model deployed. Resource name: projects/928543623034/locations/europe-west2/endpoints/1914913848981192704


In [15]:
instances =[
    {
        'Age': '35',
        'Attrition': 'No',
        'BusinessTravel': 'Travel_Rarely',
        'DailyRate': '1102',
        'Department': 'Sales',
        'DistanceFromHome': '1',
        'Education': '2',
        'EducationField': 'Life Sciences',
        'EmployeeCount': '1',
        'EmployeeNumber': '1',
        'EnvironmentSatisfaction': '3',
        'Gender': 'Male',
        'HourlyRate': '94',
        'JobInvolvement': '3',
        'JobLevel': '2',
        'JobRole': 'Sales Executive',
        'JobSatisfaction': '4',
        'MaritalStatus': 'Married',
        'MonthlyIncome': '5000',
        'MonthlyRate': '20000',
        'NumCompaniesWorked': '2',
        'Over18': 'Y',
        'OverTime': 'Yes',
        'PercentSalaryHike': '15',
        'PerformanceRating': '3',
        'RelationshipSatisfaction': '2',
        'StandardHours': '80',
        'StockOptionLevel': '1',
        'TotalWorkingYears': '10',
        'TrainingTimesLastYear': '3',
        'WorkLifeBalance': '3',
        'YearsAtCompany': '5',
        'YearsInCurrentRole': '4',
        'YearsSinceLastPromotion': '2',
        'YearsWithCurrManager': '3'
    },
    # Add more instances as needed
]
predictions = endpoint.predict(instances=instances)

In [16]:
predictions

Prediction(predictions=[{'classes': ['No', 'Yes'], 'scores': [0.9832810163497925, 0.01671898551285267]}], deployed_model_id='4374178312688238592', metadata=None, model_version_id='1', model_resource_name='projects/928543623034/locations/europe-west2/models/1256002918734102528', explanations=None)

In [None]:
model_evaluation = model.list_model_evaluations()[0]
