In [204]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [205]:
df = pd.read_csv('./data/preprocessed_train_fin_two.csv')
df.head()

Unnamed: 0,bant_submit,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,AS,0.066667,32160,End-Customer,Enterprise,0,0,0,0,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,AS,0.066667,23122,End-Customer,Enterprise,12,0,0,0,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,AS,0.088889,1755,End-Customer,Enterprise,144,0,0,0,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,AS,0.088889,4919,End-Customer,Enterprise,0,0,0,0,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,AS,0.088889,17126,Specifier/ Influencer,Enterprise,0,0,0,0,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


In [206]:
print(len(df.columns))
df.columns

27


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'enterprise', 'historical_existing_cnt',
       'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
       'customer_job', 'lead_desc_length', 'product_category',
       'product_subcategory', 'product_modelname', 'customer_country.1',
       'customer_position', 'response_corporate', 'expected_timeline',
       'ver_cus', 'ver_pro', 'ver_win_rate_x', 'ver_win_ratio_per_bu',
       'business_area', 'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

In [207]:
# categorical columns

for col in df.columns:
    if df[col].dtype != 'int64' and df[col].dtype != 'float64':
        print(col)

business_unit
customer_type
enterprise
customer_job
product_category
product_subcategory
product_modelname
customer_country.1
customer_position
response_corporate
expected_timeline
business_area
business_subarea
is_converted


In [208]:
#numerical columns

for col in df.columns:
    if df[col].dtype == 'int64' or df[col].dtype == 'float64':
        print(col)

bant_submit
com_reg_ver_win_rate
customer_idx
historical_existing_cnt
id_strategic_ver
it_strategic_ver
idit_strategic_ver
lead_desc_length
ver_cus
ver_pro
ver_win_rate_x
ver_win_ratio_per_bu
lead_owner


## customer_country.1

### 5 fold Target Encoding

In [209]:
len(df['customer_country.1'].unique())

405

In [210]:
from sklearn.model_selection import StratifiedKFold

# Ensure 'is_converted' is binary for stratification, convert if necessary
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype(int)

# Prepare the StratifiedKFold object
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['customer_country.1_te'] = np.nan

for train_index, test_index in skf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('customer_country.1')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'customer_country.1_te'] = X_test['customer_country.1'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['customer_country.1_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['customer_country.1', 'customer_country.1_te']].head())

  customer_country.1  customer_country.1_te
0        Philippines               0.093638
1        Philippines               0.091194
2              India               0.071457
3              India               0.070357
4              India               0.071825


In [211]:
# replace the original column with the target encoded column, but the order matters

df['customer_country.1'] = df['customer_country.1_te']
df.drop('customer_country.1_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

27


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'enterprise', 'historical_existing_cnt',
       'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
       'customer_job', 'lead_desc_length', 'product_category',
       'product_subcategory', 'product_modelname', 'customer_country.1',
       'customer_position', 'response_corporate', 'expected_timeline',
       'ver_cus', 'ver_pro', 'ver_win_rate_x', 'ver_win_ratio_per_bu',
       'business_area', 'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

## business_unit

### 5fold te

In [212]:
df.business_unit.unique()

array(['AS', 'ID', 'IT', 'Solution', 'CM'], dtype=object)

In [213]:
# Apply one-hot encoding to 'business_unit'
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'business_area'
df['business_unit_te'] = np.nan

for train_idx, val_idx in kf.split(df):
    # Split the data into training and validation sets
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
    
    # Compute the target mean for each category in 'business_area' on the training data
    means = X_train.groupby('business_unit')['is_converted'].mean()
    
    # Map the computed means to the validation set
    df.loc[val_idx, 'business_unit_te'] = X_val['business_unit'].map(means)

# Fill missing values for any 'business_area' not seen in the training folds with the global mean
global_mean = df['is_converted'].mean()
df['business_unit_te'].fillna(global_mean, inplace=True)

# Verify the changes
print(df[['business_unit', 'business_unit_te']].head())

  business_unit  business_unit_te
0            AS          0.060187
1            AS          0.060187
2            AS          0.060283
3            AS          0.059962
4            AS          0.060107


In [214]:
df['business_unit'] = df['business_unit_te']
df.drop('business_unit_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

27


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'enterprise', 'historical_existing_cnt',
       'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
       'customer_job', 'lead_desc_length', 'product_category',
       'product_subcategory', 'product_modelname', 'customer_country.1',
       'customer_position', 'response_corporate', 'expected_timeline',
       'ver_cus', 'ver_pro', 'ver_win_rate_x', 'ver_win_ratio_per_bu',
       'business_area', 'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

## Customer_Type

### 5-fold Target Encoding

In [215]:
print(df.customer_type.unique())
len(df.customer_type.unique())

['End-Customer' 'Specifier/ Influencer' 'Service Partner'
 'Channel Partner' 'Unknown' 'Corporate' 'End Customer'
 'Specifier / Influencer' 'Solution Eco-Partner' 'Distributor' 'Other'
 'System Integrator' 'Consultant' 'Installer' 'Homeowner' 'Others'
 'Technician' 'Installer/Contractor' 'Engineer' 'Manager / Director'
 'Developer' 'Etc.' 'Architect/Consultant' 'End-user' 'HVAC Engineer'
 'Reseller' 'Software/Solution Provider' 'Software / Solution Provider'
 'Dealer/Distributor' 'Technical Assistant' 'Commercial end-user'
 'Interior Designer' 'Home Owner' 'Administrator']


34

In [216]:
from sklearn.model_selection import KFold
import numpy as np

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'customer_type'
df['customer_type_te'] = np.nan

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Compute the target mean for each category in 'customer_type' on the training data
    means = df_train.groupby('customer_type')['is_converted'].mean()
    # Map the computed means to the validation set
    df.loc[val_idx, 'customer_type_te'] = df_val['customer_type'].map(means)

# Fill missing values for any 'customer_type' not seen in the training folds
global_mean = df['is_converted'].mean()
df['customer_type_te'].fillna(global_mean, inplace=True)

In [217]:
# replace the original column with the target encoded column, but the order matters

df['customer_type'] = df['customer_type_te']
df.drop('customer_type_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

27


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'enterprise', 'historical_existing_cnt',
       'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
       'customer_job', 'lead_desc_length', 'product_category',
       'product_subcategory', 'product_modelname', 'customer_country.1',
       'customer_position', 'response_corporate', 'expected_timeline',
       'ver_cus', 'ver_pro', 'ver_win_rate_x', 'ver_win_ratio_per_bu',
       'business_area', 'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

## Enterprise

### One-Hot Encoding

In [218]:
print(df.enterprise.unique())
len(df.enterprise.unique())

['Enterprise' 'SMB']


2

In [219]:
df_one_hot = pd.get_dummies(df, columns=['enterprise'], prefix='enterprise', drop_first=False)

# The drop_first=True option drops the first category column, avoiding multicollinearity
# 'prefix' is used to name the new column(s) based on the original column name for clarity

# # Display the first few rows to verify the new one-hot encoded column
# print(df_one_hot.head())

In [220]:
df = df_one_hot
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## customer_job

### 5fold Target Encoding

In [221]:
print(df.customer_job.unique())
len(df.customer_job.unique())

['purchasing' 'media and communication' 'engineering' 'entrepreneurship'
 'consulting' 'program and project management' 'sales' 'other'
 'operations' nan 'administrative' 'business development'
 'information technology' 'accounting' 'education' 'healthcare services'
 'human resources' 'support' 'finance' 'marketing' 'legal'
 'quality assurance' 'arts and design' 'real estate'
 'military and protective services' 'research' 'product management'
 'bidder' 'community and social services' 'research/install'
 'project manager' 'technical' 'details send' 'architect' 'curation'
 'developer/property' 'radiology professional' 'graphic/color art'
 'medical imaging specialist' 'medical solution provider' 'manager'
 'owner' 'designer' 'recommend' 'consultant / purchaser'
 'system installer' 'technical advisor, reseller' 'director it'
 'execution' 'owning company' 'president for sennco'
 'artist, lead on equipment selection' 'specifier / integrator'
 'systems designer' 'public bidder' 'technology de

561

In [222]:
# Initialize KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'customer_type'
df['customer_job_te'] = np.nan

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Compute the target mean for each category in 'customer_type' on the training data
    means = df_train.groupby('customer_job')['is_converted'].mean()
    # Map the computed means to the validation set
    df.loc[val_idx, 'customer_job_te'] = df_val['customer_job'].map(means)

# Fill missing values for any 'customer_type' not seen in the training folds
global_mean = df['is_converted'].mean()
df['customer_job_te'].fillna(global_mean, inplace=True)

In [223]:
df['customer_job_te'].head()

0    0.143617
1    0.076423
2    0.093429
3    0.088910
4    0.098375
Name: customer_job_te, dtype: float64

In [224]:
df['customer_job'] = df['customer_job_te']
df.drop('customer_job_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## product_category

### 5fold TE

In [225]:
print(df.product_category.unique())
len(df.product_category.unique())

['multi-split' 'single-split' 'vrf' 'chiller' 'etc.' 'rac'
 'teto ou cassete inverter' nan 'software solution' 'all lg vrf systems'
 'ventilation' 'a thermodynamic water heater'
 'residential air conditioner' 'outros' 'heating' 'multi v5 vrf'
 'split tunggal' 'multi inverter' '×\x97×\x99×\x9e×\x95×\x9d'
 'standard signage' 'high brightness signage' 'interactive signage'
 'video wall signage' 'led signage' 'hotel tv' 'signage care solution'
 'oled signage' 'idb' 'video wall' 'one:quick series' 'commercial tv'
 'lg one:quick' 'special signage' 'other' 'accessories'
 'commercial display' 'interactive digital board' 'standard' 'uhd signage'
 'smart tv signage' 'lg one:quick series' 'pro:centric' 'ur640' 'ur640s'
 'webos' 'signage care solutions' 'aio | one quick' 'high brightness'
 'hospital tv' 'digital signage' 'monitor' 'pc' 'laptop' 'projector'
 'cloud device' 'medical display' 'washing machine,dryer' 'solar,chiller'
 'system ac,rac' 'monitor signage,commercial tv,monior/monitor tv'
 '

358

In [226]:
df.product_category.value_counts()
# print unique values and their value counts when the value count is more than 1000
df.product_category.value_counts()[df.product_category.value_counts() > 1000]

product_category
interactive digital board    6090
vrf                          5238
multi-split                  3548
video wall signage           2002
etc.                         1954
led signage                  1850
interactive signage          1829
single-split                 1524
rac                          1427
oled signage                 1279
hotel tv                     1083
chiller                      1002
Name: count, dtype: int64

In [227]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'product_category'
df['product_category_te'] = np.nan

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Compute the target mean for each category in 'product_category' on the training data
    means = df_train.groupby('product_category')['is_converted'].mean()
    # Map the computed means to the validation set
    df.loc[val_idx, 'product_category_te'] = df_val['product_category'].map(means)

# Fill missing values for any 'product_category' not seen in the training folds
global_mean = df['is_converted'].mean()
df['product_category_te'].fillna(global_mean, inplace=True)

In [228]:
df.product_category.head()

0     multi-split
1     multi-split
2    single-split
3             vrf
4     multi-split
Name: product_category, dtype: object

In [229]:
df['product_category'] = df['product_category_te']
df.drop('product_category_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## product_subcategory

### 5fold te encoding

In [230]:
print(df.product_subcategory.unique())
len(df.product_subcategory.unique())

[nan 'Others' 'TR3DJ Series' 'Interactive Digital Board' 'TR3BF Series'
 'Transparent LED Film' 'One:Quick Flex' 'Transparent OLED Signage'
 '49" 500 nits FHD Slim Bezel Video Wall' 'Diagnostic Monitors'
 'All Medical Displays']


11

In [231]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'product_category'
df['product_subcategory_te'] = np.nan

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Compute the target mean for each category in 'product_category' on the training data
    means = df_train.groupby('product_subcategory')['is_converted'].mean()
    # Map the computed means to the validation set
    df.loc[val_idx, 'product_subcategory_te'] = df_val['product_subcategory'].map(means)

# Fill missing values for any 'product_category' not seen in the training folds
global_mean = df['is_converted'].mean()
df['product_subcategory_te'].fillna(global_mean, inplace=True)

In [232]:
df['product_subcategory'] = df['product_subcategory_te']
df.drop('product_subcategory_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## product_modelname

### 5fold te encoding

In [233]:
print(df.product_modelname.unique())
len(df.product_modelname.unique())

[nan 'Others' '55VSM5J' '75TC3D' '43HT3WJ' '55CT5WJ'
 'UltraFine Ergo(32UN880)' 'Ergo Dual(27QP88D)' '28MQ780'
 'All Medical Displays' 'Diagnostic Monitors']


11

In [234]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'product_category'
df['product_modelname_te'] = np.nan

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Compute the target mean for each category in 'product_category' on the training data
    means = df_train.groupby('product_modelname')['is_converted'].mean()
    # Map the computed means to the validation set
    df.loc[val_idx, 'product_modelname_te'] = df_val['product_modelname'].map(means)

# Fill missing values for any 'product_category' not seen in the training folds
global_mean = df['is_converted'].mean()
df['product_modelname_te'].fillna(global_mean, inplace=True)

In [235]:
df['product_modelname'] = df['product_modelname_te']
df.drop('product_modelname_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## customer_position

### 5fold te

In [236]:
print(df.customer_position.unique())
len(df.customer_position.unique())

['entry level' 'ceo/founder' 'partner' 'manager' 'vice president'
 'associate/analyst' 'c-level executive' 'none' 'director' 'other'
 'intern' 'trainee' 'others'
 'this is a consume display requirement for home purpose.' 'vicepresident'
 'consulting' 'entrylevel' 'c-levelexecutive' 'unpaid' 'physics teacher'
 'assistant professor' 'av management' 'maths lecturer' 'founder'
 'engineering' 'installer' 'homeowner' 'consultant' 'commercial end-user'
 'employee' 'bulgaria' 'lider de desarrollo' 'administrative'
 'entrepreneurship' 'decision-influencer' 'decision maker' 'customer'
 'not applicable' 'decision-maker' 'no influence' 'commercial consultant'
 'science teacher' 'decision influencer' 'technical'
 'architecture/consult' 'architect/consultant' 'exhibition' 'hospital'
 'end-user' 'government' 'manufacturer' 'software /solution provider'
 'system integrator' 'medical device manufacturer' 'distributor'
 'business unit director' 'business development' 'operations' 'vp' 'cargo'
 'guest fa

117

In [237]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

df['customer_position_te'] = np.nan

for train_idx, val_idx in kf.split(df):
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
    means = X_train.groupby('customer_position')['is_converted'].mean()
    df.loc[val_idx, 'customer_position_te'] = X_val['customer_position'].map(means)

global_mean = df['is_converted'].mean()
df['customer_position_te'].fillna(global_mean, inplace=True)

In [238]:
df['customer_position'] = df['customer_position_te']
df.drop('customer_position_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## response_corporate

## 10fold te

In [239]:
print(df.response_corporate.unique())
len(df.response_corporate.unique())

['LGEPH' 'LGEIL' 'LGEAF' 'LGESJ' 'LGESL' 'LGESP' 'LGEGF' 'LGESA' 'LGEUS'
 'LGECB' 'LGEMS' 'LGEEG' 'LGEEF' 'LGEAP' 'LGEIN' 'LGEUK' 'LGEKR' 'LGEPS'
 'LGECI' 'LGECL' 'LGETK' 'LGELF' 'LGEPT' 'LGEPR' 'LGEDG' 'LGERO' 'LGEMK'
 'LGEPL' 'LGECZ' 'LGEES' 'LGEAR' 'LGEHK' 'LGEML' 'LGEJP' 'LGEHS' 'LGEAS'
 'LGEYK' 'LGEIS' 'LGEBN' 'LGEFS' 'LGESW' 'LGEMC' 'LGEAG' 'LGEEB' 'LGETH'
 'LGEVH' 'LGECH' 'LGELA' 'LGETT' 'LGERA' 'LGEUR' 'LGEIR' 'LGEBT']


53

In [240]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'response_corporate'
df['response_corporate_te'] = np.nan

for train_idx, val_idx in kf.split(df):
    # Split the data into training and validation sets
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
    
    # Compute the target mean for each category in 'response_corporate' on the training data
    means = X_train.groupby('response_corporate')['is_converted'].mean()
    
    # Map the computed means to the validation set
    df.loc[val_idx, 'response_corporate_te'] = X_val['response_corporate'].map(means)

# Fill missing values for any 'response_corporate' not seen in the training folds with the global mean
global_mean = df['is_converted'].mean()
df['response_corporate_te'].fillna(global_mean, inplace=True)

# Verify the changes
print(df[['response_corporate', 'response_corporate_te']].head())

  response_corporate  response_corporate_te
0              LGEPH               0.091503
1              LGEPH               0.091503
2              LGEIL               0.071439
3              LGEIL               0.071289
4              LGEIL               0.071841


In [241]:
df['response_corporate'] = df['response_corporate_te']
df.drop('response_corporate_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## expected_timeline

## 5fold te

In [242]:
print(df.expected_timeline.unique())
len(df.expected_timeline.unique())

['less than 3 months' 'Others' '3 months ~ 6 months' '9 months ~ 1 year'
 'more than a year' '6 months ~ 9 months' 'etc.']


7

In [243]:
# Assuming df is your DataFrame and 'is_converted' is the target variable
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'expected_timeline'
df['expected_timeline_te'] = np.nan

for train_idx, val_idx in kf.split(df):
    # Split the data
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
    
    # Compute the target mean for 'expected_timeline' on the training data
    means = X_train.groupby('expected_timeline')['is_converted'].mean()
    
    # Map the computed means to the validation set
    df.loc[val_idx, 'expected_timeline_te'] = X_val['expected_timeline'].map(means)

# Fill missing values for any 'expected_timeline' not seen in the training folds with the global mean
global_mean = df['is_converted'].mean()
df['expected_timeline_te'].fillna(global_mean, inplace=True)

# Verify the changes
print(df[['expected_timeline', 'expected_timeline_te']].head())

    expected_timeline  expected_timeline_te
0  less than 3 months              0.102528
1  less than 3 months              0.102528
2  less than 3 months              0.101502
3  less than 3 months              0.101749
4  less than 3 months              0.103633


In [244]:
df['expected_timeline'] = df['expected_timeline_te']
df.drop('expected_timeline_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## business_area

### 5fold te

In [245]:
print(df.business_area.unique())
len(df.business_area.unique())

['corporate / office' nan 'education' 'hotel & accommodation'
 'hospital & health care' 'special purpose' 'residential (home)'
 'government department' 'retail' 'factory'
 'power plant / renewable energy' 'transportation' 'public facility']


13

In [246]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'business_area'
df['business_area_te'] = np.nan

for train_idx, val_idx in kf.split(df):
    # Split the data into training and validation sets
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
    
    # Compute the target mean for each category in 'business_area' on the training data
    means = X_train.groupby('business_area')['is_converted'].mean()
    
    # Map the computed means to the validation set
    df.loc[val_idx, 'business_area_te'] = X_val['business_area'].map(means)

# Fill missing values for any 'business_area' not seen in the training folds with the global mean
global_mean = df['is_converted'].mean()
df['business_area_te'].fillna(global_mean, inplace=True)

# Verify the changes
print(df[['business_area', 'business_area_te']].head())

        business_area  business_area_te
0  corporate / office          0.064119
1  corporate / office          0.064119
2  corporate / office          0.067235
3  corporate / office          0.064915
4  corporate / office          0.064232


In [247]:
df['business_area'] = df['business_area_te']
df.drop('business_area_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## business_subarea

In [248]:
print(df.business_subarea.unique())
len(df.business_subarea.unique())

['Engineering' 'Advertising' 'Construction' 'IT/Software' nan
 'Manufacturing' 'Energy' 'Developer/Property' 'Entertainment'
 'Agriculture' 'Pharmaceutical' 'Others' 'Banking' 'Consulting'
 'Healthcare' 'Finance' 'Broadcasting & Media' 'Distribution Center'
 'Law Firm' 'Logistics' 'Telecommunication' 'Aerospace' 'Network/Cabling'
 'Insurance' 'Institute & Academy' 'Hotel' 'K12 Kindergarten & Schools'
 'Hospital' 'Mixed-use (Multi Complex)'
 'Telecom Base Station / Data, Call' 'Botanical Garden / Green House'
 'Higher Education (College & University)' 'Clinic' 'General Hospital'
 'Fitness' 'LTC(Long-Term Care)' 'Villa / Single-Family Home' 'Apartment'
 'Townhouse' 'Officetel' 'General Government Office' 'Military'
 'Police / Fire station' 'Welfare Facilities' 'Other Stores'
 'Luxury(Watch/Jewelry Shop)' 'Electronics & Telco' 'Sports Entertainment'
 'CVS (Convenience Store)' 'Car Dealership' 'Fashion'
 'QSR(Quick Service Restaurant)' 'Hyper Market & Grocery' 'Shopping Mall'
 'Cosmetics' 

87

In [249]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'business_area'
df['business_subarea_te'] = np.nan

for train_idx, val_idx in kf.split(df):
    # Split the data into training and validation sets
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
    
    # Compute the target mean for each category in 'business_area' on the training data
    means = X_train.groupby('business_subarea')['is_converted'].mean()
    
    # Map the computed means to the validation set
    df.loc[val_idx, 'business_subarea_te'] = X_val['business_subarea'].map(means)

# Fill missing values for any 'business_area' not seen in the training folds with the global mean
global_mean = df['is_converted'].mean()
df['business_subarea_te'].fillna(global_mean, inplace=True)

# Verify the changes
print(df[['business_subarea', 'business_subarea_te']].head())

  business_subarea  business_subarea_te
0      Engineering             0.123188
1      Advertising             0.084211
2     Construction             0.118421
3      IT/Software             0.132075
4              NaN             0.082825


In [250]:
df['business_subarea'] = df['business_subarea_te']
df.drop('business_subarea_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

28


Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## checking

In [251]:
# Select columns that are not of numeric types (int or float)
non_numeric_columns = df.select_dtypes(exclude=['int64', 'float64']).columns

# Check if there are any non-numeric columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns found: {list(non_numeric_columns)}")
else:
    print("All columns are of int or float type.")

Non-numeric columns found: ['is_converted', 'enterprise_Enterprise', 'enterprise_SMB']


In [252]:
df.to_csv('encoded_train.csv', index=False)