In [343]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

In [344]:
df = pd.read_csv('preprocessed_train_final.csv')
df.head()

Unnamed: 0,id,bant_submit,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,idit_strategic_ver,customer_job,...,product_modelname,customer_country.1,customer_position,response_corporate,ver_cus,ver_pro,ver_win_rate_x,business_area,lead_owner,is_converted
0,19844,0.0,ID,0.073248,47466,End Customer,Enterprise,53.0,0.0,consulting,...,,Brazil,none,LGESP,1,0,0.001183,retail,278,True
1,9738,0.25,IT,,5405,End Customer,SMB,0.0,0.0,,...,,United States,none,LGEUS,0,0,1.3e-05,transportation,437,True
2,8491,1.0,ID,,13597,Specifier / Influencer,SMB,0.0,0.0,information technology,...,Filled,,first_management_level,LGEGF,0,0,6e-05,hospital & health care,874,True
3,19895,0.5,ID,0.118644,17204,Unknown,Enterprise,0.0,0.0,sales,...,,United States,none,LGEUS,0,0,0.001183,retail,194,False
4,10465,1.0,ID,0.074949,2329,End Customer,Enterprise,2.0,1.0,engineering,...,Filled,Brazil,other,LGESP,1,1,0.003079,corporate / office,167,False


In [345]:
original_len = len(df.columns)
print(original_len)
# print columns giving numbers as index
for i in range(len(df.columns)):
    print(i+1, df.columns[i])

24
1 id
2 bant_submit
3 business_unit
4 com_reg_ver_win_rate
5 customer_idx
6 customer_type
7 enterprise
8 historical_existing_cnt
9 idit_strategic_ver
10 customer_job
11 lead_desc_length
12 inquiry_type
13 product_category
14 product_subcategory
15 product_modelname
16 customer_country.1
17 customer_position
18 response_corporate
19 ver_cus
20 ver_pro
21 ver_win_rate_x
22 business_area
23 lead_owner
24 is_converted


## 1. bant_submit -> no encoding

## 2. business_unit -> target encoding

In [346]:
df.business_unit.unique()

array(['ID', 'IT', 'AS'], dtype=object)

In [347]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['business_unit_te'] = np.nan

for train_index, test_index in kf.split(df, df['business_unit']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('business_unit')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'business_unit_te'] = X_test['business_unit'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['business_unit_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['business_unit', 'business_unit_te']].head())

  business_unit  business_unit_te
0            ID          0.233859
1            IT          0.486022
2            ID          0.239336
3            ID          0.236767
4            ID          0.233902


In [348]:
df['business_unit'] = df['business_unit_te']
df.drop('business_unit_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

24


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'inquiry_type', 'product_category',
       'product_subcategory', 'product_modelname', 'customer_country.1',
       'customer_position', 'response_corporate', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'business_area', 'lead_owner', 'is_converted'],
      dtype='object')

## 3. com_reg_ver_win_rate -> no encoding (imputation later)

## 4. customer_idx -> changed to category -> target encoded

In [349]:
df['customer_idx'] = df['customer_idx'].astype('category')

In [350]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['customer_idx_te'] = np.nan

for train_index, test_index in kf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('customer_idx')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'customer_idx_te'] = X_test['customer_idx'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['customer_idx_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['customer_idx', 'customer_idx_te']].head())

  customer_idx  customer_idx_te
0        47466         0.059190
1         5405         0.000000
2        13597         1.000000
3        17204         0.221969
4         2329         0.000000


In [351]:
df['customer_idx'] = df['customer_idx_te']
df.drop('customer_idx_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

24


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'inquiry_type', 'product_category',
       'product_subcategory', 'product_modelname', 'customer_country.1',
       'customer_position', 'response_corporate', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'business_area', 'lead_owner', 'is_converted'],
      dtype='object')

## 5 customer_type -> target encoding

In [352]:
len(df.customer_type.unique())

7

In [353]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['customer_type_te'] = np.nan

for train_index, test_index in kf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('customer_type')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'customer_type_te'] = X_test['customer_type'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['customer_type_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['customer_type', 'customer_type_te']].head())

            customer_type  customer_type_te
0            End Customer          0.256537
1            End Customer          0.256329
2  Specifier / Influencer          0.290960
3                 Unknown          0.053476
4            End Customer          0.255088


In [354]:
df['customer_type'] = df['customer_type_te']
df.drop('customer_type_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

24


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'enterprise',
       'historical_existing_cnt', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'inquiry_type', 'product_category',
       'product_subcategory', 'product_modelname', 'customer_country.1',
       'customer_position', 'response_corporate', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'business_area', 'lead_owner', 'is_converted'],
      dtype='object')

## 6 enterprise -> OH

In [355]:
df_one_hot = pd.get_dummies(df, columns=['enterprise'], prefix='enterprise', drop_first=False)
df = df_one_hot
added_cols = len(df.columns)-original_len+1
print(added_cols)
print(df.columns)

2
Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'ver_cus', 'ver_pro', 'ver_win_rate_x',
       'business_area', 'lead_owner', 'is_converted', 'enterprise_Enterprise',
       'enterprise_SMB'],
      dtype='object')


## 7 historical_existing_cnt -> no encoding

## 8 idit_strategic_ver -> no encoding

## 9 customer_job -> target encoding

In [356]:
len(df.customer_job.unique())

30

In [357]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['customer_job_te'] = np.nan

for train_index, test_index in kf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('customer_job')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'customer_job_te'] = X_test['customer_job'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['customer_job_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['customer_job', 'customer_job_te']].head())

             customer_job  customer_job_te
0              consulting         0.255034
1                     NaN         0.221969
2  information technology         0.271429
3                   sales         0.297297
4             engineering         0.183486


In [358]:
df['customer_job'] = df['customer_job_te']
df.drop('customer_job_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

25


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'ver_cus', 'ver_pro', 'ver_win_rate_x',
       'business_area', 'lead_owner', 'is_converted', 'enterprise_Enterprise',
       'enterprise_SMB'],
      dtype='object')

## 10 lead_desc_length -> no encoding

## 11 inquiry_type -> target encoding

In [359]:
len(df.inquiry_type.unique())

9

In [360]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['inquiry_type_te'] = np.nan

for train_index, test_index in kf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('inquiry_type')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'inquiry_type_te'] = X_test['inquiry_type'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['inquiry_type_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['inquiry_type', 'inquiry_type_te']].head())

                         inquiry_type  inquiry_type_te
0                                 NaN         0.221969
1  Quotation or purchase consultation         0.257106
2  Quotation or purchase consultation         0.260160
3  Quotation or purchase consultation         0.259188
4  Quotation or purchase consultation         0.253841


In [361]:
df['inquiry_type'] = df['inquiry_type_te']
df.drop('inquiry_type_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

25


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'ver_cus', 'ver_pro', 'ver_win_rate_x',
       'business_area', 'lead_owner', 'is_converted', 'enterprise_Enterprise',
       'enterprise_SMB'],
      dtype='object')

## 12 product_category ->

In [362]:
len(df.product_category.unique())

17

In [363]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['product_category_te'] = np.nan

for train_index, test_index in kf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('product_category')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'product_category_te'] = X_test['product_category'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['product_category_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['product_category', 'product_category_te']].head())

  product_category  product_category_te
0              NaN             0.221969
1              NaN             0.221969
2               tv             0.467742
3              NaN             0.221969
4          signage             0.239645


In [364]:
df['product_category'] = df['product_category_te']
df.drop('product_category_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

25


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'ver_cus', 'ver_pro', 'ver_win_rate_x',
       'business_area', 'lead_owner', 'is_converted', 'enterprise_Enterprise',
       'enterprise_SMB'],
      dtype='object')

## 13 product_subcategory -> OH

In [365]:
len(df.product_subcategory.unique())

2

In [366]:
df_one_hot = pd.get_dummies(df, columns=['product_subcategory'], prefix='product_subcategory', drop_first=False)
df = df_one_hot
added_cols = len(df.columns)-original_len+1
print(added_cols)
print(df.columns)

2
Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'product_modelname',
       'customer_country.1', 'customer_position', 'response_corporate',
       'ver_cus', 'ver_pro', 'ver_win_rate_x', 'business_area', 'lead_owner',
       'is_converted', 'enterprise_Enterprise', 'enterprise_SMB',
       'product_subcategory_Filled'],
      dtype='object')


## 14 product_modelname -> OH

In [367]:
len(df.product_modelname.unique())

2

In [368]:
df_one_hot = pd.get_dummies(df, columns=['product_modelname'], prefix='product_modelname', drop_first=False)
df = df_one_hot
added_cols = len(df.columns)-original_len+1
print(added_cols)
print(df.columns)

2
Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'customer_country.1',
       'customer_position', 'response_corporate', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'business_area', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB', 'product_subcategory_Filled',
       'product_modelname_Filled'],
      dtype='object')


## 15 customer_country.1 -> target encoding

In [369]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['customer_country.1_te'] = np.nan

for train_index, test_index in kf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('customer_country.1')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'customer_country.1_te'] = X_test['customer_country.1'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['customer_country.1_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['customer_country.1', 'customer_country.1_te']].head())

  customer_country.1  customer_country.1_te
0             Brazil               0.256163
1      United States               0.382659
2                NaN               0.221969
3      United States               0.387571
4             Brazil               0.256959


In [370]:
df['customer_country.1'] = df['customer_country.1_te']
df.drop('customer_country.1_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

25


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'customer_country.1',
       'customer_position', 'response_corporate', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'business_area', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB', 'product_subcategory_Filled',
       'product_modelname_Filled'],
      dtype='object')

## 16 customer_position -> ordinal encoding

In [371]:
print(len(df.customer_position.unique()))
df.customer_position.unique()

9


array(['none', 'first_management_level', 'other', 'intermediate_level',
       'middle_management_level', 'entry_level', 'c_level', 'end_user',
       'pgt chemistry'], dtype=object)

In [372]:
ordinal_mapping = {
    'none': 0,
    'other': 1,
    'entry_level': 2,
    'intermediate_level': 3,
    'first_management_level': 4,
    'middle_management_level': 5,
    'end_user': 6,
    'c_level': 7
}

# Apply the mapping to the column
df['customer_position_encoded'] = df['customer_position'].map(ordinal_mapping)

# Verify the changes by displaying the first few rows
print(df[['customer_position', 'customer_position_encoded']].head())

        customer_position  customer_position_encoded
0                    none                        0.0
1                    none                        0.0
2  first_management_level                        4.0
3                    none                        0.0
4                   other                        1.0


In [373]:
df['customer_position'] = df['customer_position_encoded']
df.drop('customer_position_encoded', axis=1, inplace=True)
print(len(df.columns))
df.columns

25


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'customer_country.1',
       'customer_position', 'response_corporate', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'business_area', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB', 'product_subcategory_Filled',
       'product_modelname_Filled'],
      dtype='object')

## 17 response_corporate -> target encoding

In [374]:
print(len(df.response_corporate.unique()))

41


In [375]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['response_corporate_te'] = np.nan

for train_index, test_index in kf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('response_corporate')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'response_corporate_te'] = X_test['response_corporate'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['response_corporate_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['response_corporate', 'response_corporate_te']].head())

  response_corporate  response_corporate_te
0              LGESP               0.256163
1              LGEUS               0.381594
2              LGEGF               0.135338
3              LGEUS               0.387486
4              LGESP               0.256959


In [376]:
df['response_corporate'] = df['response_corporate_te']
df.drop('response_corporate_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

25


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'customer_country.1',
       'customer_position', 'response_corporate', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'business_area', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB', 'product_subcategory_Filled',
       'product_modelname_Filled'],
      dtype='object')

## 18 ver_cus -> no encoding

## 19 ver_pro -> no encoding

## 20 ver_win_rate_x -> Imputation needed

## 21 business_area -> target encoding

In [377]:
print(len(df.business_area.unique()))

13


In [378]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['business_area_te'] = np.nan

for train_index, test_index in kf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('business_area')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'business_area_te'] = X_test['business_area'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['business_area_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['business_area', 'business_area_te']].head())

            business_area  business_area_te
0                  retail          0.265018
1          transportation          0.535620
2  hospital & health care          0.355372
3                  retail          0.276364
4      corporate / office          0.324952


In [379]:
df['business_area'] = df['business_area_te']
df.drop('business_area_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

25


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'customer_country.1',
       'customer_position', 'response_corporate', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'business_area', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB', 'product_subcategory_Filled',
       'product_modelname_Filled'],
      dtype='object')

## 22 lead_owner -> category -> target encoding

In [380]:
df['lead_owner'] = df['lead_owner'].astype('category')
df['lead_owner'].head()

0    278
1    437
2    874
3    194
4    167
Name: lead_owner, dtype: category
Categories (440, int64): [0, 1, 2, 3, ..., 1062, 1063, 1084, 1108]

In [381]:
# Convert boolean True/False to 1/0 directly
df['is_converted'] = df['is_converted'].astype('int64')

# Prepare the KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Placeholder for the target encoded feature
df['lead_owner_te'] = np.nan

for train_index, test_index in kf.split(df, df['is_converted']):
    # Split the data
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    
    # Compute target mean for each category in 'customer_country.1' excluding the current fold
    means = X_train.groupby('lead_owner')['is_converted'].mean()
    
    # Map the computed means to the test portion of the fold
    df.loc[test_index, 'lead_owner_te'] = X_test['lead_owner'].map(means)

# For any missing values in the target encoded feature (e.g., categories present only in test folds),
# fill with the global mean of 'is_converted'
global_mean = df['is_converted'].mean()
df['lead_owner_te'].fillna(global_mean, inplace=True)

# Show the first few rows to verify
print(df[['lead_owner', 'lead_owner_te']].head())

  lead_owner  lead_owner_te
0        278       0.555556
1        437       0.720588
2        874       0.200000
3        194       0.041322
4        167       0.428571


In [382]:
df['lead_owner'] = df['lead_owner_te']
df.drop('lead_owner_te', axis=1, inplace=True)
print(len(df.columns))
df.columns

25


Index(['id', 'bant_submit', 'business_unit', 'com_reg_ver_win_rate',
       'customer_idx', 'customer_type', 'historical_existing_cnt',
       'idit_strategic_ver', 'customer_job', 'lead_desc_length',
       'inquiry_type', 'product_category', 'customer_country.1',
       'customer_position', 'response_corporate', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'business_area', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB', 'product_subcategory_Filled',
       'product_modelname_Filled'],
      dtype='object')

## Iterative Imputation

In [383]:
# Initialize the IterativeImputer
iterative_imputer = IterativeImputer(max_iter=100, random_state=0)

# Apply the imputer to the entire DataFrame
# Note: Ensure 'df' contains only numerical data at this point
imputed_data = iterative_imputer.fit_transform(df)

# Convert the imputed data back into a DataFrame
# This step is necessary to retain your column names and DataFrame structure
df_imputed = pd.DataFrame(imputed_data, columns=df.columns)

# Display the first few rows of the imputed DataFrame
print(df_imputed.head())

        id  bant_submit  business_unit  com_reg_ver_win_rate  customer_idx  \
0  19844.0         0.00       0.233859              0.073248      0.059190   
1   9738.0         0.25       0.486022              0.436869      0.000000   
2   8491.0         1.00       0.239336              0.233180      1.000000   
3  19895.0         0.50       0.236767              0.118644      0.221969   
4  10465.0         1.00       0.233902              0.074949      0.000000   

   customer_type  historical_existing_cnt  idit_strategic_ver  customer_job  \
0       0.256537                     53.0                 0.0      0.255034   
1       0.256329                      0.0                 0.0      0.221969   
2       0.290960                      0.0                 0.0      0.271429   
3       0.053476                      0.0                 0.0      0.297297   
4       0.255088                      2.0                 1.0      0.183486   

   lead_desc_length  ...  ver_cus  ver_pro  ver_win_rate

## checking

In [384]:
# Select columns that are not of numeric types (int or float)
non_numeric_columns = df.select_dtypes(exclude=['int64', 'int32', 'float64']).columns

# Check if there are any non-numeric columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns found: {list(non_numeric_columns)}")
    
else:
    print("All columns are of int or float type.")

Non-numeric columns found: ['enterprise_Enterprise', 'enterprise_SMB', 'product_subcategory_Filled', 'product_modelname_Filled']


In [385]:
df.enterprise_Enterprise.head()

#convert df.enterprise_Enterprise to int
df['enterprise_Enterprise'] = df['enterprise_Enterprise'].astype('int64')
df['enterprise_SMB'] = df['enterprise_SMB'].astype('int64')
df['product_subcategory_Filled'] = df['product_subcategory_Filled'].astype('int64')
df['product_modelname_Filled'] = df['product_modelname_Filled'].astype('int64')

df['enterprise_Enterprise'].head()
df['enterprise_SMB'].head()

0    0
1    1
2    1
3    0
4    0
Name: enterprise_SMB, dtype: int64

In [386]:
non_numeric_columns = df.select_dtypes(exclude=['int64', 'int32', 'float64']).columns

# Check if there are any non-numeric columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns found: {list(non_numeric_columns)}")
else:
    print("All columns are of int or float type.")

All columns are of int or float type.


In [387]:
df.to_csv('encoded_train_final_target.csv', index=False)