In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import KFold


In [2]:
# Load the preprocessed training table and preview its first rows.
TRAIN_CSV = 'preprocessed_train_final.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

Unnamed: 0,bant_submit,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,idit_strategic_ver,customer_job,lead_desc_length,...,product_modelname,customer_country.1,customer_position,response_corporate,ver_cus,ver_pro,ver_win_rate_x,business_area,lead_owner,is_converted
0,1.0,AS,0.066667,32160,End Customer,Enterprise,0.0,0.0,purchasing,50.0,...,,Philippines,entry_level,LGEPH,1,0,0.003079,corporate / office,0,True
1,1.0,AS,0.066667,23122,End Customer,Enterprise,12.0,0.0,media and communication,100.0,...,,Philippines,c_level,LGEPH,1,0,0.003079,corporate / office,1,True
2,1.0,AS,0.088889,1755,End Customer,Enterprise,144.0,0.0,engineering,50.0,...,,India,c_level,LGEIL,1,0,0.003079,corporate / office,2,True
3,1.0,AS,0.088889,4919,End Customer,Enterprise,0.0,0.0,entrepreneurship,50.0,...,,India,c_level,LGEIL,1,0,0.003079,corporate / office,3,True
4,1.0,AS,0.088889,17126,Specifier / Influencer,Enterprise,0.0,0.0,consulting,100.0,...,,India,c_level,LGEIL,0,0,0.003079,corporate / office,4,True


In [305]:
# Column count first, then the column labels themselves.
print(df.shape[1])
df.columns

30


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

In [306]:
# Columns to treat as categorical: every column whose dtype is neither int64 nor float64.
# Note that this test also lets through any other dtype (e.g. bool), not only strings.

for col_name, col_dtype in df.dtypes.items():
    if not (col_dtype == 'int64' or col_dtype == 'float64'):
        print(col_name)

customer_country
business_unit
customer_type
enterprise
customer_job
inquiry_type
product_category
product_subcategory
product_modelname
customer_country.1
customer_position
response_corporate
expected_timeline
business_area
business_subarea
is_converted


In [307]:
# Numerical columns: every column stored as int64 or float64.

for col_name, col_dtype in df.dtypes.items():
    if not (col_dtype != 'int64' and col_dtype != 'float64'):
        print(col_name)

id
bant_submit
com_reg_ver_win_rate
customer_idx
historical_existing_cnt
id_strategic_ver
it_strategic_ver
idit_strategic_ver
lead_desc_length
ver_cus
ver_pro
ver_win_rate_x
ver_win_ratio_per_bu
lead_owner


## customer_country.1

### 10-fold Target Encoding

In [308]:
# Number of distinct values in 'customer_country.1', counting NaN as a value.
df['customer_country.1'].nunique(dropna=False)

48

In [309]:
# Out-of-fold target encoding of 'customer_country.1': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.

# Cast the target to int64 so that its group means are plain proportions.
df['is_converted'] = df['is_converted'].astype('int64')

# Shuffled K-fold. KFold does not stratify and does not use the target to build the folds,
# so no target is passed to split() below.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature, filled fold by fold
df['customer_country.1_te'] = np.nan
te_pos = df.columns.get_loc('customer_country.1_te')

for train_index, test_index in kf.split(df):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]

    # Target mean for each category, excluding the current fold
    means = X_train.groupby('customer_country.1')['is_converted'].mean()

    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[test_index] is only correct while df keeps its default 0..n-1 index.
    df.iloc[test_index, te_pos] = X_test['customer_country.1'].map(means).to_numpy()

# Rows whose category is NaN, or absent from their training part, have no mean yet;
# give them the global mean of 'is_converted'.
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['customer_country.1_te'] = df['customer_country.1_te'].fillna(global_mean)

# Show the first few rows to verify
print(df[['customer_country.1', 'customer_country.1_te']].head())

  customer_country.1  customer_country.1_te
0             Brazil               0.256163
1      United States               0.382659
2                NaN               0.221969
3      United States               0.387571
4             Brazil               0.256959


In [310]:
# Overwrite the raw column with its encoding. Assigning to the existing name keeps the
# column in its original position; pop() removes the temporary '_te' column.

df['customer_country.1'] = df.pop('customer_country.1_te')
print(df.shape[1])
df.columns

30


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

## customer_country

In [311]:
# Number of distinct values in 'customer_country', counting NaN as a value.
df['customer_country'].nunique(dropna=False)

48

In [312]:
# Out-of-fold target encoding of 'customer_country': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.

# Shuffled K-fold. KFold does not stratify and does not use the target to build the folds,
# so no target is passed to split() below.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature, filled fold by fold
df['customer_country_te'] = np.nan
te_pos = df.columns.get_loc('customer_country_te')

for train_index, test_index in kf.split(df):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]

    # Target mean for each category in 'customer_country', excluding the current fold
    means = X_train.groupby('customer_country')['is_converted'].mean()

    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[test_index] is only correct while df keeps its default 0..n-1 index.
    df.iloc[test_index, te_pos] = X_test['customer_country'].map(means).to_numpy()

# Rows whose category is NaN, or absent from their training part, have no mean yet;
# give them the global mean of 'is_converted'.
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['customer_country_te'] = df['customer_country_te'].fillna(global_mean)

# Show the first few rows to verify
print(df[['customer_country', 'customer_country_te']].head())

  customer_country  customer_country_te
0           Brazil             0.256163
1    United States             0.382659
2              NaN             0.221969
3    United States             0.387571
4           Brazil             0.256959


In [313]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['customer_country'] = df.pop('customer_country_te')
print(df.shape[1])
df.columns

30


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

## inquiry_type

In [314]:
# Distinct raw labels of 'inquiry_type' (NaN included).
df.loc[:, 'inquiry_type'].unique()

array([nan, 'Quotation or Purchase Consultation', 'Product Information',
       'Technical Consultation', 'Others', 'Request for Partnership',
       'Usage or Technical Consultation', 'Services',
       'Customer Suggestions', 'Request a Demo', 'Trainings', 'Other',
       'OEM/ODM Request', 'Media Inquiry', 'other_',
       'Request for Distributorship'], dtype=object)

In [315]:
# Out-of-fold target encoding of 'inquiry_type': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.

# Shuffled K-fold. KFold does not stratify and does not use the target to build the folds,
# so no target is passed to split() below.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature, filled fold by fold
df['inquiry_type_te'] = np.nan
te_pos = df.columns.get_loc('inquiry_type_te')

for train_index, test_index in kf.split(df):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]

    # Target mean for each category in 'inquiry_type', excluding the current fold
    means = X_train.groupby('inquiry_type')['is_converted'].mean()

    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[test_index] is only correct while df keeps its default 0..n-1 index.
    df.iloc[test_index, te_pos] = X_test['inquiry_type'].map(means).to_numpy()

# Rows whose category is NaN, or absent from their training part, have no mean yet;
# give them the global mean of 'is_converted'.
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['inquiry_type_te'] = df['inquiry_type_te'].fillna(global_mean)

# Show the first few rows to verify
print(df[['inquiry_type', 'inquiry_type_te']].head())

                         inquiry_type  inquiry_type_te
0                                 NaN         0.221969
1  Quotation or Purchase Consultation         0.257106
2  Quotation or Purchase Consultation         0.260160
3  Quotation or Purchase Consultation         0.259188
4  Quotation or Purchase Consultation         0.253841


In [316]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['inquiry_type'] = df.pop('inquiry_type_te')
print(df.shape[1])
df.columns

30


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

## business_unit

### 5fold te

In [317]:
# Distinct raw labels of 'business_unit'.
df['business_unit'].unique()

array(['ID', 'IT', 'AS'], dtype=object)

In [318]:


# Out-of-fold target encoding of 'business_unit': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Placeholder for the target-encoded 'business_unit', filled fold by fold
df['business_unit_te'] = np.nan
te_pos = df.columns.get_loc('business_unit_te')

for train_idx, val_idx in kf.split(df):
    # Split the data into training and validation sets
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]

    # Target mean for each category in 'business_unit' on the training data
    means = X_train.groupby('business_unit')['is_converted'].mean()

    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[val_idx] is only correct while df keeps its default 0..n-1 index.
    df.iloc[val_idx, te_pos] = X_val['business_unit'].map(means).to_numpy()

# Rows whose 'business_unit' is NaN, or absent from their training part, get the global mean
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['business_unit_te'] = df['business_unit_te'].fillna(global_mean)

# Verify the changes
print(df[['business_unit', 'business_unit_te']].head())

  business_unit  business_unit_te
0            ID          0.231104
1            IT          0.487981
2            ID          0.240128
3            ID          0.234889
4            ID          0.234889


In [319]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['business_unit'] = df.pop('business_unit_te')
print(df.shape[1])
df.columns

30


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

## Customer_Type

### 5-fold Target Encoding

In [320]:
# Distinct raw labels of 'customer_type', followed by how many there are.
customer_type_levels = df['customer_type'].unique()
print(customer_type_levels)
len(customer_type_levels)

['End Customer' 'Specifier/ Influencer' 'Unknown' 'Service Partner'
 'Channel Partner' 'Solution Eco-Partner' 'Developer' 'End-Customer']


8

In [321]:
# Out-of-fold target encoding of 'customer_type': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.
# NOTE(review): categories are matched by exact label, so differently spelled labels
# are encoded as separate categories; confirm whether labels should be normalized first.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Placeholder for the target-encoded 'customer_type', filled fold by fold
df['customer_type_te'] = np.nan
te_pos = df.columns.get_loc('customer_type_te')

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Target mean for each category in 'customer_type' on the training data
    means = df_train.groupby('customer_type')['is_converted'].mean()
    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[val_idx] is only correct while df keeps its default 0..n-1 index.
    df.iloc[val_idx, te_pos] = df_val['customer_type'].map(means).to_numpy()

# Rows whose 'customer_type' is NaN, or absent from their training part, get the global mean
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['customer_type_te'] = df['customer_type_te'].fillna(global_mean)

In [322]:
# Overwrite the raw column with its encoding. Assigning to the existing name keeps the
# column in its original position; pop() removes the temporary '_te' column.

df['customer_type'] = df.pop('customer_type_te')
print(df.shape[1])
df.columns

30


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted'],
      dtype='object')

## Enterprise

### One-Hot Encoding

In [323]:
# Inspect 'enterprise': its distinct values, how many there are, and the first rows.
enterprise_levels = df['enterprise'].unique()
print(enterprise_levels)
# Printed explicitly: a bare expression in the middle of a cell is evaluated but never
# displayed, so the count was silently lost before.
print(len(enterprise_levels))

df['enterprise'].head()

['Enterprise' 'SMB']


0    Enterprise
1           SMB
2           SMB
3    Enterprise
4    Enterprise
Name: enterprise, dtype: object

In [324]:
# One-hot encode 'enterprise' into a new frame, leaving df itself untouched.
# drop_first=False keeps one indicator column per category; 'prefix' names the new
# columns after the original column.
df_one_hot = pd.get_dummies(
    df,
    columns=['enterprise'],
    prefix='enterprise',
    drop_first=False,
)

# Display the first few rows of each indicator column to verify the encoding
for dummy_col in ('enterprise_Enterprise', 'enterprise_SMB'):
    print(df_one_hot[dummy_col].head())

0     True
1    False
2    False
3     True
4     True
Name: enterprise_Enterprise, dtype: bool
0    False
1     True
2     True
3    False
4    False
Name: enterprise_SMB, dtype: bool


In [325]:
# Continue with the one-hot encoded frame as the working table.
df = df_one_hot
print(df.shape[1])
df.columns

31


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## customer_job

### 10-fold Target Encoding

In [326]:
# Distinct raw labels of 'customer_job' (NaN included), followed by how many there are.
customer_job_levels = df['customer_job'].unique()
print(customer_job_levels)
len(customer_job_levels)

['consulting' nan 'information technology' 'sales' 'engineering'
 'marketing' 'purchasing' 'others' 'operations' 'support' 'finance'
 'business development' 'administrative' 'education' 'product management'
 'program and project management' 'entrepreneurship'
 'community and social services' 'media and communication'
 'arts and design' 'healthcare services' 'accounting'
 'medical imaging specialist' 'clinical specialist' 'curation' 'other'
 '3d/vfx art' 'legal' 'radiology professional'
 'military and protective services' 'real estate' 'human resources'
 'healthcare' 'electronics & telco' 'quality assurance' 'pathologist'
 'research' 'surgery professional' 'film production' 'k12 school'
 'media and communications' 'medical solution provider']


42

In [327]:
# Out-of-fold target encoding of 'customer_job': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target-encoded 'customer_job', filled fold by fold
df['customer_job_te'] = np.nan
te_pos = df.columns.get_loc('customer_job_te')

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Target mean for each category in 'customer_job' on the training data
    means = df_train.groupby('customer_job')['is_converted'].mean()
    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[val_idx] is only correct while df keeps its default 0..n-1 index.
    df.iloc[val_idx, te_pos] = df_val['customer_job'].map(means).to_numpy()

# Rows whose 'customer_job' is NaN, or absent from their training part, get the global mean
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['customer_job_te'] = df['customer_job_te'].fillna(global_mean)

In [328]:
# First five encoded values of 'customer_job'.
df['customer_job_te'].iloc[:5]

0    0.255034
1    0.221969
2    0.271429
3    0.297297
4    0.183486
Name: customer_job_te, dtype: float64

In [329]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['customer_job'] = df.pop('customer_job_te')
print(df.shape[1])
df.columns

31


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## product_category

### 5fold TE

In [330]:
# Distinct raw labels of 'product_category' (NaN included), followed by how many there are.
product_category_levels = df['product_category'].unique()
print(product_category_levels)
len(product_category_levels)

[nan 'hospital tv' 'led signage' 'cloud device' 'standard signage'
 'interactive signage' 'multi-split' 'video wall signage' 'vrf' 'monitor'
 'etc.' 'heating' 'ventilation' 'control' 'single-split'
 'high brightness signage' 'chiller' 'oled signage' 'special signage'
 'hotel tv' 'projector' 'medical display' 'signage care solution' 'ess'
 'laptop' 'interactive digital board' 'pro:centric' 'software solution'
 'others' 'one:quick series' 'webos' 'notebook' 'accessories' 'other']


34

In [331]:
# Show only the product categories that occur more than 1000 times.
# The counts are computed once and reused for both the values and the filter mask.
category_counts = df['product_category'].value_counts()
category_counts[category_counts > 1000]

Series([], Name: count, dtype: int64)

In [332]:
# Out-of-fold target encoding of 'product_category': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Placeholder for the target-encoded 'product_category', filled fold by fold
df['product_category_te'] = np.nan
te_pos = df.columns.get_loc('product_category_te')

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Target mean for each category in 'product_category' on the training data
    means = df_train.groupby('product_category')['is_converted'].mean()
    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[val_idx] is only correct while df keeps its default 0..n-1 index.
    df.iloc[val_idx, te_pos] = df_val['product_category'].map(means).to_numpy()

# Rows whose 'product_category' is NaN, or absent from their training part, get the global mean
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['product_category_te'] = df['product_category_te'].fillna(global_mean)

In [333]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['product_category'] = df.pop('product_category_te')
print(df.shape[1])
print(df.columns)
# Preview the encoded values
df['product_category'].head()

31
Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')


0    0.221969
1    0.221969
2    0.750000
3    0.221969
4    0.309091
Name: product_category, dtype: float64

## product_subcategory

### 5fold te encoding

In [334]:
# Distinct raw labels of 'product_subcategory' (NaN included), followed by how many there are.
product_subcategory_levels = df['product_subcategory'].unique()
print(product_subcategory_levels)
len(product_subcategory_levels)

[nan 'Others' 'Digital Signage or Commercial TVs' 'TR3DJ Series'
 'Transparent OLED Signage' 'Interactive Digital Board'
 'Curvable OLED Signage' 'Transparent LED Film' 'High Brightness Series'
 '55" 700 nits FHD 0.44mm Even Bezel Video Wall'
 '55" 500 nits FHD 0.44mm Even Bezel Video Wall']


11

In [335]:
# Out-of-fold target encoding of 'product_subcategory': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Placeholder for the target-encoded 'product_subcategory', filled fold by fold
df['product_subcategory_te'] = np.nan
te_pos = df.columns.get_loc('product_subcategory_te')

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Target mean for each category in 'product_subcategory' on the training data
    means = df_train.groupby('product_subcategory')['is_converted'].mean()
    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[val_idx] is only correct while df keeps its default 0..n-1 index.
    df.iloc[val_idx, te_pos] = df_val['product_subcategory'].map(means).to_numpy()

# Rows whose 'product_subcategory' is NaN, or absent from their training part, get the global mean
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['product_subcategory_te'] = df['product_subcategory_te'].fillna(global_mean)

In [336]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['product_subcategory'] = df.pop('product_subcategory_te')
print(df.shape[1])
df.columns

31


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## product_modelname

### 5fold te encoding

In [337]:
# Distinct raw labels of 'product_modelname' (NaN included), followed by how many there are.
product_modelname_levels = df['product_modelname'].unique()
print(product_modelname_levels)
len(product_modelname_levels)

[nan 'Others' '55VH7J-H' '65TR3DJ' '55EW5TK-A' '55EF5F-L' 'LAT140'
 'LWBC039' '55VSH7J' '75TR3DJ' '55VSM5J']


11

In [338]:
# Out-of-fold target encoding of 'product_modelname': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Placeholder for the target-encoded 'product_modelname', filled fold by fold
df['product_modelname_te'] = np.nan
te_pos = df.columns.get_loc('product_modelname_te')

# Apply K-fold target encoding
for train_idx, val_idx in kf.split(df):
    # Split the data
    df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
    # Target mean for each category in 'product_modelname' on the training data
    means = df_train.groupby('product_modelname')['is_converted'].mean()
    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[val_idx] is only correct while df keeps its default 0..n-1 index.
    df.iloc[val_idx, te_pos] = df_val['product_modelname'].map(means).to_numpy()

# Rows whose 'product_modelname' is NaN, or absent from their training part, get the global mean
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['product_modelname_te'] = df['product_modelname_te'].fillna(global_mean)

In [339]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['product_modelname'] = df.pop('product_modelname_te')
print(df.shape[1])
df.columns

31


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## customer_position

### 5fold te

In [340]:
# Distinct raw labels of 'customer_position', followed by how many there are.
customer_position_levels = df['customer_position'].unique()
print(customer_position_levels)
len(customer_position_levels)

['none' 'manager' 'others' 'associate/analyst' 'director' 'entry level'
 'ceo/founder' 'partner' 'c-level executive' 'trainee' 'vice president'
 'intern' 'educator' 'other' 'customer'
 'academic coordinator/ post graduate teacher (accountancy, business studies)/ tgt (ict)'
 'assistant professor' 'math and physics teacher' 'pgt chemistry'
 'director cum faculty at gaining apex coaching centre' 'professor'
 'co-founder' 'asst prof.']


23

In [341]:
# Out-of-fold target encoding of 'customer_position': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Placeholder for the target-encoded 'customer_position', filled fold by fold
df['customer_position_te'] = np.nan
te_pos = df.columns.get_loc('customer_position_te')

for train_idx, val_idx in kf.split(df):
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
    # Target mean for each category in 'customer_position' on the training data
    means = X_train.groupby('customer_position')['is_converted'].mean()
    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[val_idx] is only correct while df keeps its default 0..n-1 index.
    df.iloc[val_idx, te_pos] = X_val['customer_position'].map(means).to_numpy()

# Rows whose 'customer_position' is NaN, or absent from their training part, get the global mean
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['customer_position_te'] = df['customer_position_te'].fillna(global_mean)

In [342]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['customer_position'] = df.pop('customer_position_te')
print(df.shape[1])
df.columns

31


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## response_corporate

### 5-fold Target Encoding

In [343]:
# Distinct raw labels of 'response_corporate', followed by how many there are.
response_corporate_levels = df['response_corporate'].unique()
print(response_corporate_levels)
len(response_corporate_levels)

['LGESP' 'LGEUS' 'LGEGF' 'LGEDG' 'LGEEG' 'LGEFS' 'LGEIN' 'LGEIL' 'LGEIS'
 'LGEAR' 'LGETK' 'LGESJ' 'LGEPH' 'LGEPR' 'LGEVH' 'LGEAP' 'LGECL' 'LGECB'
 'LGEHK' 'LGEMS' 'LGETH' 'LGEMK' 'LGEBN' 'LGEPS' 'LGEUK' 'LGESA' 'LGESL'
 'LGEYK' 'LGEAG' 'LGEEF' 'LGECH' 'LGECI' 'LGEAF' 'LGEPL' 'LGEES' 'LGEHS'
 'LGEAS' 'LGELF' 'LGERO' 'LGEPT' 'LGEML']


41

In [344]:
# Out-of-fold target encoding of 'response_corporate': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Placeholder for the target-encoded 'response_corporate', filled fold by fold
df['response_corporate_te'] = np.nan
te_pos = df.columns.get_loc('response_corporate_te')

for train_idx, val_idx in kf.split(df):
    # Split the data into training and validation sets
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]

    # Target mean for each category in 'response_corporate' on the training data
    means = X_train.groupby('response_corporate')['is_converted'].mean()

    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[val_idx] is only correct while df keeps its default 0..n-1 index.
    df.iloc[val_idx, te_pos] = X_val['response_corporate'].map(means).to_numpy()

# Rows whose 'response_corporate' is NaN, or absent from their training part, get the global mean
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['response_corporate_te'] = df['response_corporate_te'].fillna(global_mean)

# Verify the changes
print(df[['response_corporate', 'response_corporate_te']].head())

  response_corporate  response_corporate_te
0              LGESP               0.251225
1              LGEUS               0.388191
2              LGEGF               0.139344
3              LGEUS               0.388265
4              LGESP               0.257282


In [345]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['response_corporate'] = df.pop('response_corporate_te')
print(df.shape[1])
df.columns

31


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## expected_timeline

### 5-fold Target Encoding

In [346]:
# Distinct raw labels of 'expected_timeline', followed by how many there are.
expected_timeline_levels = df['expected_timeline'].unique()
print(expected_timeline_levels)
len(expected_timeline_levels)

['etc' 'less than 3 months' 'more than a year' '6 months ~ 9 months'
 '3 months ~ 6 months' '9 months ~ 1 year']


6

In [347]:
# Assuming df is your DataFrame and 'is_converted' is the target variable
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a new column for the target-encoded 'expected_timeline'
df['expected_timeline_te'] = np.nan

for train_idx, val_idx in kf.split(df):
    # Split the data
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
    
    # Compute the target mean for 'expected_timeline' on the training data
    means = X_train.groupby('expected_timeline')['is_converted'].mean()
    
    # Map the computed means to the validation set
    df.loc[val_idx, 'expected_timeline_te'] = X_val['expected_timeline'].map(means)

# Fill missing values for any 'expected_timeline' not seen in the training folds with the global mean
global_mean = df['is_converted'].mean()
df['expected_timeline_te'].fillna(global_mean, inplace=True)

# Verify the changes
print(df[['expected_timeline', 'expected_timeline_te']].head())

    expected_timeline  expected_timeline_te
0                 etc              0.221014
1                 etc              0.217617
2  less than 3 months              0.228758
3    more than a year              0.210280
4  less than 3 months              0.228096


In [348]:
# Overwrite the raw column with its encoding; pop() removes the temporary '_te' column.
df['expected_timeline'] = df.pop('expected_timeline_te')
print(df.shape[1])
df.columns

31


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## business_area

### 5fold te

In [349]:
# Distinct raw labels of 'business_area' (NaN included), followed by how many there are.
business_area_levels = df['business_area'].unique()
print(business_area_levels)
len(business_area_levels)

['retail' 'transportation' 'hospital & health care' 'corporate / office'
 nan 'residential (home)' 'education' 'factory' 'special purpose'
 'hotel & accommodation' 'public facility' 'government department'
 'power plant / renewable energy']


13

In [350]:
# Out-of-fold target encoding of 'business_area': each row is encoded with the mean of
# 'is_converted' for its category, computed only from the rows outside that row's fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Placeholder for the target-encoded 'business_area', filled fold by fold
df['business_area_te'] = np.nan
te_pos = df.columns.get_loc('business_area_te')

for train_idx, val_idx in kf.split(df):
    # Split the data into training and validation sets
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]

    # Target mean for each category in 'business_area' on the training data
    means = X_train.groupby('business_area')['is_converted'].mean()

    # KFold yields row positions, not index labels, so write back with .iloc;
    # .loc[val_idx] is only correct while df keeps its default 0..n-1 index.
    df.iloc[val_idx, te_pos] = X_val['business_area'].map(means).to_numpy()

# Rows whose 'business_area' is NaN, or absent from their training part, get the global mean
global_mean = df['is_converted'].mean()
# Assign the result back: fillna(inplace=True) on df[...] is chained assignment, which
# warns on recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['business_area_te'] = df['business_area_te'].fillna(global_mean)

# Verify the changes
print(df[['business_area', 'business_area_te']].head())

            business_area  business_area_te
0                  retail          0.248996
1          transportation          0.532544
2  hospital & health care          0.339623
3                  retail          0.261224
4      corporate / office          0.331897


In [351]:
# Replace the categorical business_area column with its encoded counterpart,
# then remove the temporary *_te column from the frame.
df['business_area'] = df['business_area_te']
del df['business_area_te']

# Sanity check: number of columns, then their names
print(df.shape[1])
df.columns

31


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## business_subarea

In [352]:
# Distinct values of business_subarea (NaN included) and their count
business_subarea_values = df['business_subarea'].unique()
print(business_subarea_values)
len(business_subarea_values)

['Electronics & Telco' 'Others' 'General Hospital' nan 'Engineering'
 'Banking' 'Hospital' 'K12 Kindergarten & Schools' 'Construction'
 'Advertising' 'Higher Education (College & University)'
 'Manufacturing Factory / Plant' 'Other Stores' 'Fashion'
 'Telecommunication' 'Mixed-use (Multi Complex)' 'Institute & Academy'
 'Fitness' 'IT/Software' 'Restaurant' 'Cosmetics' 'Healthcare' 'Hotel'
 'Military' 'Clinic' 'Manufacturing' 'Apartment' 'Townhouse' 'Resort'
 'Finance' 'Religious Facility' 'Broadcasting & Media' 'Holdings'
 'Villa / Single-Family Home' 'Power Plant' 'Developer/Property'
 'Renewable Energy' 'Entertainment' 'Sports Entertainment'
 'Telecom Base Station / Data, Call' 'Hyper Market & Grocery'
 'Bus Terminal' 'Luxury(Watch/Jewelry Shop)' 'Logistics' 'Gas Station'
 'Energy' 'General Government Office' 'Outdoor Advertisement' 'Investment'
 'Shopping Mall' 'Energy Storage & Saving' 'F&B(Food and Beverage)'
 'Exhibition / Convention Center' 'Consulting' 'Insurance'
 'QSR(Quick S

81

In [353]:
# Out-of-fold target encoding for 'business_subarea': each row receives the
# mean of 'is_converted' for its category, computed only on the other four folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# kf.split() yields *positional* indices. Collect the encoded values in a
# standalone Series and write them by position (.iloc) so the result is
# correct even when df does not carry a default 0..n-1 index.
business_subarea_te = pd.Series(np.nan, index=df.index, dtype='float64')

for train_idx, val_idx in kf.split(df):
    # Split the data into training and validation rows for this fold
    X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]

    # Target mean per 'business_subarea' category, from the training rows only
    means = X_train.groupby('business_subarea')['is_converted'].mean()

    # Map the fold means onto the validation rows; NaN categories and
    # categories absent from the training rows stay NaN at this point
    business_subarea_te.iloc[val_idx] = (
        X_val['business_subarea'].map(means).to_numpy(dtype='float64')
    )

# Rows still NaN fall back to the mean of 'is_converted' over all rows.
# The filled Series is assigned back to df explicitly: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# pandas warns about and which does not update df under Copy-on-Write.
global_mean = df['is_converted'].mean()
df['business_subarea_te'] = business_subarea_te.fillna(global_mean)

# Verify the changes
print(df[['business_subarea', 'business_subarea_te']].head())

      business_subarea  business_subarea_te
0  Electronics & Telco             0.192308
1               Others             0.441065
2     General Hospital             0.200000
3                  NaN             0.221969
4          Engineering             0.275862


In [354]:
# Swap the encoded values into business_subarea; pop() hands back the helper
# column and deletes it from df at the same time.
df['business_subarea'] = df.pop('business_subarea_te')

# Column count and names after the swap
print(df.shape[1])
df.columns

31


Index(['id', 'bant_submit', 'customer_country', 'business_unit',
       'com_reg_ver_win_rate', 'customer_idx', 'customer_type',
       'historical_existing_cnt', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

## Checking that all columns are numeric

In [355]:
# Columns whose dtype is anything other than int64, int32 or float64
non_numeric_columns = df.select_dtypes(exclude=['int64', 'int32', 'float64']).columns

# Print the offending columns if any exist, otherwise a confirmation message
print(
    f"Non-numeric columns found: {list(non_numeric_columns)}"
    if len(non_numeric_columns) > 0
    else "All columns are of int or float type."
)

Non-numeric columns found: ['enterprise_Enterprise', 'enterprise_SMB']


In [356]:
# NOTE: a bare expression that is not the last statement of a cell is
# evaluated but not rendered, so this preview produces no output.
df['enterprise_Enterprise'].head()

# Cast both enterprise dummy columns to int64
for dummy_col in ('enterprise_Enterprise', 'enterprise_SMB'):
    df[dummy_col] = df[dummy_col].astype('int64')

# Only the final expression below (enterprise_SMB) appears as the cell output
df['enterprise_Enterprise'].head()
df['enterprise_SMB'].head()

0    0
1    1
2    1
3    0
4    0
Name: enterprise_SMB, dtype: int64

In [357]:
# Re-run the dtype check after the int64 casts
non_numeric_columns = df.select_dtypes(exclude=['int64', 'int32', 'float64']).columns

# An empty Index means every column is int64, int32 or float64
if non_numeric_columns.empty:
    print("All columns are of int or float type.")
else:
    print(f"Non-numeric columns found: {list(non_numeric_columns)}")

All columns are of int or float type.


In [358]:
# Write the fully encoded frame to disk, leaving the row index out of the file
df.to_csv(path_or_buf='encoded_submission_raw.csv', index=False)