# M-Estimate Encoding of Categorical Features

In [1]:
import numpy as np
import pandas as pd
import string, copy, time

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the competition train and test CSVs (paths are relative to this notebook)
train_path, test_path = "../../input/train.csv", "../../input/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# (rows, columns) of the train and test frames
train_df.shape, test_df.shape

((900000, 33), (700000, 32))

In [4]:
# Peek at the first two training rows
train_df.head(2)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,...,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,...,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1


# Splitting feature 27 intelligently

On its own, feature `f_27` has close to 741k unique values across the 900k training rows, so it cannot be used directly as a categorical feature. However, each value is a string of letters, so we can decompose it into a set of columns that hold the count of each letter and use those counts as features.

In [5]:
# Number of distinct f_27 strings in the training data
train_df.f_27.nunique()

741354

In [6]:
# Zero-initialised tally with one entry per uppercase ASCII letter, in A..Z order
default_counter = dict.fromkeys(string.ascii_uppercase, 0)

In [7]:
# Define a function to get the alphabet counts in each record of the train/test df
def get_counts(x):
    """Count how often each uppercase ASCII letter occurs in ``x``.

    Parameters
    ----------
    x : iterable of str
        The string whose characters are tallied (one ``f_27`` value).

    Returns
    -------
    list of int
        26 counts ordered A..Z; letters that do not occur are 0.

    Raises
    ------
    KeyError
        If ``x`` contains a character that is not an uppercase ASCII letter.
    """
    # Build a fresh zeroed tally per call instead of deep-copying a shared
    # module-level dict: no hidden dependency on global state and far cheaper
    # when this runs once per row.
    counts = dict.fromkeys(string.ascii_uppercase, 0)
    for letter in x:
        counts[letter] += 1
    return list(counts.values())

In [8]:
# One 26-element list of letter counts per f_27 value, for train and test
train_alphabet_counts = train_df["f_27"].apply(get_counts)
test_alphabet_counts = test_df["f_27"].apply(get_counts)

In [9]:
# Expand the per-row count lists into frames with one column per letter A..Z
alphabet_columns = list(string.ascii_uppercase)
train_alphabet_df = pd.DataFrame(
    np.array(train_alphabet_counts.values.tolist()),
    columns=alphabet_columns,
)
test_alphabet_df = pd.DataFrame(
    np.array(test_alphabet_counts.values.tolist()),
    columns=alphabet_columns,
)

In [10]:
# `test_alphabet_df` is already built in the cell above; rebuilding it here was a
# redundant second pass over the test rows. Check instead that both count frames
# have one row per source row and one column per uppercase letter.
assert train_alphabet_df.shape == (len(train_df), len(string.ascii_uppercase))
assert test_alphabet_df.shape == (len(test_df), len(string.ascii_uppercase))

In [11]:
# Summary statistics per letter column (transposed so each letter is a row)
train_alphabet_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A,900000.0,2.460909,1.348131,0.0,2.0,2.0,3.0,9.0
B,900000.0,3.244169,1.422739,0.0,2.0,3.0,4.0,10.0
C,900000.0,1.478987,1.063639,0.0,1.0,1.0,2.0,7.0
D,900000.0,1.028972,0.929393,0.0,0.0,1.0,2.0,7.0
E,900000.0,0.597529,0.734487,0.0,0.0,0.0,1.0,5.0
F,900000.0,0.315301,0.547126,0.0,0.0,0.0,1.0,5.0
G,900000.0,0.163871,0.398628,0.0,0.0,0.0,0.0,4.0
H,900000.0,0.094059,0.301564,0.0,0.0,0.0,0.0,3.0
I,900000.0,0.066087,0.251902,0.0,0.0,0.0,0.0,3.0
J,900000.0,0.055603,0.230243,0.0,0.0,0.0,0.0,3.0


We can see that `U, V, W, X, Y, Z` never appear in the corpus. Only the letters up to `T` occur in this feature, so we keep the counts for `A` through `T` and add them to the original dataframe.

In [12]:
# Append the count columns for the first 20 letters (A through T) to each frame
n_letters_kept = 20
train_letter_counts = train_alphabet_df.iloc[:, :n_letters_kept]
test_letter_counts = test_alphabet_df.iloc[:, :n_letters_kept]
train_df = pd.concat([train_df, train_letter_counts], axis="columns")
test_df = pd.concat([test_df, test_letter_counts], axis="columns")

In [13]:
# Confirm the letter-count columns were appended to the training frame
train_df.head(3)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,K,L,M,N,O,P,Q,R,S,T
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,...,0,0,0,0,0,0,0,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,...,0,0,0,0,0,0,0,0,0,0
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,...,1,0,0,0,0,0,0,0,0,0


In [14]:
# Confirm the letter-count columns were appended to the test frame
test_df.head(3)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,K,L,M,N,O,P,Q,R,S,T
0,900000,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6,6,...,0,1,0,0,0,0,0,0,0,0
1,900001,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1,3,...,0,0,0,0,0,0,0,0,0,0
2,900002,0.30399,2.44511,0.246515,0.818248,0.359731,-1.331845,1.358622,3,3,...,1,0,0,0,0,0,0,0,0,0


In [15]:
# Keep the test ids for the submission file before the column is removed
test_ids = test_df["id"]

# The raw f_27 string and the row id are not used as model features
dropped_columns = ["f_27", "id"]
train_df = train_df.drop(columns=dropped_columns)
test_df = test_df.drop(columns=dropped_columns)

## Fit the encoder

In [16]:
# `pop` removes "target" from train_df in place, leaving only feature columns
y = train_df.pop("target")
X = train_df

# Reserve 20% of the rows (X_encode / y_encode) for fitting the target encoder;
# the remaining 80% (X_train_enc / y_train_enc) is used for model training.
X_train_enc, X_encode, y_train_enc, y_encode = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=257,
)

In [17]:
# Columns to target-encode: f_07 through f_18, plus f_29 and f_30
cat_cols = [f"f_{fnum:0>2d}" for fnum in range(7, 19)]
cat_cols += ["f_29", "f_30"]

# Every remaining column is treated as numeric
non_numeric = set(cat_cols) | {"id", "f_27"}
num_cols = [col for col in X_train_enc.columns if col not in non_numeric]

In [18]:
# Fit the M-estimate target encoder (m = 1.5) on the held-out encoding split only
encoder = MEstimateEncoder(cols=cat_cols, m=1.5)
encoder = encoder.fit(X_encode, y_encode)

In [19]:
# Apply the fitted encoder to the training split, the encoding split and the test set
X_train_enc, X_encode, X_test = (
    encoder.transform(frame) for frame in (X_train_enc, X_encode, test_df)
)

In [20]:
# Carve a stratified 100,000-row validation set out of the encoded training split
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_enc,
    y_train_enc,
    test_size=100000,
    stratify=y_train_enc,
    random_state=121,
)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((620000, 50), (620000,), (100000, 50), (100000,))

In [21]:
# Class balance of the training labels
y_train.value_counts()

0    318308
1    301692
Name: target, dtype: int64

In [22]:
# Inspect the encoded training features
X_train_enc.head(2)

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,K,L,M,N,O,P,Q,R,S,T
229487,0.70304,1.246406,0.16075,-1.129079,0.415793,0.437626,-0.520668,0.48097,0.48235,0.500603,...,0,0,0,0,0,0,1,0,0,0
614596,0.478763,-0.316379,1.470314,0.401741,-1.752594,-0.294004,-0.927445,0.491573,0.506162,0.518572,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# A single preprocessing pipeline: mean-impute missing values
imputation_steps = [('imputer', SimpleImputer(strategy = 'mean'))]
numeric_transformer = Pipeline(steps=imputation_steps)

# Apply that pipeline to every model column (the numeric columns followed by the
# target-encoded categorical columns)
model_columns = num_cols + cat_cols
preprocessor = ColumnTransformer(
    transformers=[('numeric', numeric_transformer, model_columns)]
)

In [24]:
# Define the grid search object to perform fitting of our model.
# The L2 penalty is tuned through `reg_lambda`, the parameter name that
# XGBClassifier's scikit-learn interface declares (instead of the native alias "lambda").
param_grid = {"max_depth": [3, 6, 9], "reg_lambda": [1, 1.5, 2], "n_estimators": [100, 200, 300, 400, 500]}

# 45 parameter combinations x 4 folds = 180 cross-validation fits. GridSearchCV already
# spreads those fits over every core (n_jobs = -1), so each XGBoost model is limited to a
# single thread; letting both layers parallelise makes them contend for the same cores.
gscv = GridSearchCV(estimator = XGBClassifier(use_label_encoder=False, eval_metric = "logloss", n_jobs = 1),
                    param_grid=param_grid,
                    scoring = "roc_auc", cv = 4, n_jobs = -1)

# Imputation followed by the grid-searched classifier; the search object is the final
# step, so its results are read back through the step named "model".
model_pipeline = Pipeline(steps = [("preprocessor", preprocessor),
                                   ("model", gscv)])

In [25]:
# Shapes of the features and labels that go into the grid search
X_train.shape, y_train.shape

((620000, 50), (620000,))

In [26]:
# Run the full grid search and report the wall-clock time it took
start = time.time()
model = model_pipeline.fit(X_train, y_train)
elapsed_minutes = (time.time() - start) / 60
print(f"Model fitting took {elapsed_minutes:.2f} minutes")
print(model)


KeyboardInterrupt



In [None]:
# Best mean cross-validated score found by the grid search (the pipeline's "model" step)
model.named_steps["model"].best_score_

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score
model.named_steps["model"].best_params_

In [None]:
# Score the held-out validation rows: take the predicted probability of class 1
valid_proba = model.predict_proba(X_valid)
y_pred = valid_proba[:, 1]

# Validation ROC AUC (a score, so higher is better)
roc_auc_score(y_valid, y_pred)

In [None]:
# Predict class-1 probabilities for the test rows and write the submission file
test_proba = model.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({"id": test_ids, "target": test_proba})
submission.to_csv("../../submissions/May_2022/submission_XGB_MEstimator_encoding.csv", index = None)

Process LokyProcess-3:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 464, in _process_worker
    mem_usage = _get_memory_usage(pid, force_gc=True)
  File "/opt/conda/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 117, in _get_memory_usage
    gc.collect()
KeyboardInterrupt


In [None]:
# !kaggle competitions submit -c tabular-playground-series-may-2022 -f ../../submissions/May_2022/submission_XGB_MEstimator_encoding.csv -m "XGBoost with Target Encoding Submission"

# Use the entire training set

In [None]:
# Stack the training, validation and encoder-fitting rows back into one training set
feature_parts = [X_train, X_valid, X_encode]
label_parts = [y_train, y_valid, y_encode]
final_X_train = pd.concat(feature_parts)
final_y_train = pd.concat(label_parts)

# Re-run the same pipeline (including the grid search) on all available rows
final_model = model_pipeline.fit(final_X_train, final_y_train)
final_model

In [None]:
# Best mean cross-validated score of the grid search refitted on all rows
final_model.named_steps["model"].best_score_

In [None]:
# Hyper-parameter combination selected when training on all rows
final_model.named_steps["model"].best_params_

In [None]:
# Predict on test data
# X_test no longer has an id column (it was dropped from test_df before encoding), so the
# ids come from the saved `test_ids`. As in the earlier submission, the file carries
# "id" and "target" columns with the predicted probability of class 1, not hard labels.
pd.DataFrame({"id":test_ids,
              "target":final_model.predict_proba(X_test)[:, 1]}).to_csv("../../submissions/May_2022/submission_XGB_target_encoding_full.csv", index = None)