In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [2]:
# Read the train and test tables (train first, then test) from the shared
# input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../../input/train.csv", "../../input/test.csv")
)

In [3]:
# (rows, columns) of the train and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((900000, 33), (700000, 32))

In [4]:
# Peek at the first two training rows.
train_df.iloc[:2]

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,...,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,...,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1


In [5]:
# Separate the label from the features WITHOUT mutating train_df.
# (`train_df.pop("target")` removed the column in place, so re-running this
# cell raised KeyError and earlier displays of train_df went stale.)
y = train_df["target"]
X = train_df.drop(columns="target")

# Hold out 200,000 rows for validation, stratified on the label so both splits
# keep the same class balance. A fixed random_state makes the split (and every
# number derived from it) reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, test_size=200000, random_state=42
)

X_test = test_df

In [6]:
# Label counts in the training split, most frequent class first.
y_train.value_counts(sort=True, ascending=False)

0    359459
1    340541
Name: target, dtype: int64

In [7]:
# Show every column of the first training row to eyeball the column types.
X_train.iloc[0, :]

id          484569
f_00      0.679338
f_01      0.318183
f_02     -1.671308
f_03     -1.751046
f_04      -0.07602
f_05     -0.483074
f_06     -0.809365
f_07             1
f_08             4
f_09             2
f_10             2
f_11             2
f_12             2
f_13             2
f_14             1
f_15             2
f_16             1
f_17             2
f_18             1
f_19      2.770492
f_20      0.272204
f_21      0.713919
f_22       0.15752
f_23     -0.048831
f_24      -1.95694
f_25     -0.267224
f_26     -2.077421
f_27    BAAGBBCKCB
f_28   -191.148968
f_29             1
f_30             1
Name: 484569, dtype: object

In [8]:
# Number of distinct values in each of the columns f_07 .. f_18.
for colname in (f"f_{fnum:0>2d}" for fnum in range(7, 19)):
    print(colname, X_train[colname].nunique())

f_07 16
f_08 16
f_09 14
f_10 15
f_11 14
f_12 16
f_13 13
f_14 14
f_15 15
f_16 15
f_17 14
f_18 14


In [9]:
# Number of distinct values in the string column and the two flag-like columns.
cols = ["f_27", "f_29", "f_30"]
print("\n".join(f"{colname} {X_train[colname].nunique()}" for colname in cols))

f_27 597997
f_29 2
f_30 3


For a baseline, let us drop column f_27, since it has very high cardinality (most of its values are unique, so it carries little signal as a plain categorical feature). Let us also encode the other categorical columns using one-hot encoding, then build a simple gradient-boosted tree (XGBoost) regressor model and mark our entry into the competition.

In [10]:
# Categorical columns: f_07 .. f_18 plus f_29 and f_30.
cat_cols = [f"f_{fnum:0>2d}" for fnum in range(7, 19)] + ["f_29", "f_30"]

# Numeric columns: everything else, minus the row identifier and the
# high-cardinality string column f_27 (original column order is kept).
num_cols = [
    col
    for col in X_train.columns
    if col not in (*cat_cols, "id", "f_27")
]
len(num_cols), len(cat_cols)

(16, 14)

In [11]:
# Preprocessing for the two column groups:
#   * numeric     -> fill missing values with the column mean, then standardise
#   * categorical -> fill missing values with the mode, then one-hot encode;
#                    categories not seen during fit are ignored at transform
#                    time instead of raising
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its transformer. Columns named in neither list
# (id and f_27) are not passed through (ColumnTransformer's default
# remainder is 'drop').
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, num_cols),
        ("categorical", categorical_transformer, cat_cols),
    ]
)

In [12]:
# Define the grid search object that tunes and fits the model.
#
# The original grid used `np.arange(0.75, 1, 3, 5)`, which passes 5 as the
# `dtype` argument and raises TypeError; the four numbers are taken to be the
# intended candidate values for the L2 regularisation strength.
# `reg_lambda` is the scikit-learn-API name of XGBoost's `lambda` parameter;
# the raw name only works through XGBoost's **kwargs pass-through, which
# XGBoost documents as unsupported with scikit-learn.
param_grid = {
    "max_depth": [3, 6, 9],
    "reg_lambda": [0.75, 1.0, 3.0, 5.0],
}

# NOTE(review): the target is binary and the metric is ROC AUC, yet the
# estimator is a regressor. Its continuous predictions can be ranked for AUC,
# but a classifier with predict_proba is the conventional choice -- confirm
# that the "roc_auc" scorer accepts a regressor in the installed scikit-learn.
gscv = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=param_grid,
    scoring="roc_auc",  # best_score_ is therefore a mean cross-validated AUC
    cv=4,
    n_jobs=-1,
)

# Preprocessing runs once on the full training data; the grid search then
# cross-validates on the already transformed matrix.
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                 ("model", gscv)])

In [13]:
# Sanity check: features and labels of the training split have matching row counts.
tuple(part.shape for part in (X_train, y_train))

((700000, 32), (700000,))

In [None]:
# Fit preprocessing + grid search on the training split. Pipeline.fit returns
# the pipeline itself, so `model` is the same object as `model_pipeline`.
model_pipeline.fit(X_train, y_train)
model = model_pipeline
print(model)

In [None]:
# Best mean cross-validated score found by the grid search
# (the pipeline step named "model" is the GridSearchCV object).
model.named_steps["model"].best_score_

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score.
model.named_steps["model"].best_params_

In [None]:
# Validate the model on the held-out validation data
y_pred = model.predict(X_valid)

# Print the validation metric.
# roc_auc_score's signature is (y_true, y_score): the ground-truth labels come
# first and the model's continuous scores second. The original call had them
# swapped, which passes continuous values as labels.
roc_auc_score(y_valid, y_pred)

In [None]:
# Look at the submission format
! head -n 10 ./input/sample_submission.csv

In [None]:
# Predict on test data and write the submission file.
# The identifier column in this dataset is lowercase `id` (there is no `Id`
# attribute on the frame) and the predicted quantity is `target`; the
# `Id` / `SalePrice` names were left over from a different competition.
# NOTE(review): confirm the header against sample_submission.csv.
pd.DataFrame(
    {"id": X_test["id"], "target": model.predict(X_test)}
).to_csv("../../submissions/May_2022/submission_xgboost.csv", index=False)

# Use the entire training set

In [None]:
# Recombine the training and validation splits into the full labelled set.
final_X_train = pd.concat([X_train, X_valid])
final_y_train = pd.concat([y_train, y_valid])

# Fit an unfitted copy of the pipeline. Calling model_pipeline.fit(...) again
# would refit the very object `model` points to, so `model` and `final_model`
# would silently become the same estimator and the earlier validation score
# could no longer be reproduced from `model`.
final_model = clone(model_pipeline).fit(final_X_train, final_y_train)
final_model

In [None]:
# Look at the score of the best model.
# The grid search is scored with "roc_auc", so best_score_ is already the mean
# cross-validated AUC. The previous `(-1 * score) ** 0.5` transform only makes
# sense for a negated-MSE scorer and here takes the root of a negative number.
final_model.steps[1][1].best_score_

In [None]:
# Hyper-parameter combination selected when tuning on the full labelled set
# (the pipeline step named "model" is the GridSearchCV object).
final_model.named_steps["model"].best_params_

In [None]:
# Predict on test data with the model trained on the full labelled set and
# write the submission file.
# The identifier column in this dataset is lowercase `id` (there is no `Id`
# attribute on the frame) and the predicted quantity is `target`; the
# `Id` / `SalePrice` names were left over from a different competition.
# NOTE(review): confirm the header against sample_submission.csv.
pd.DataFrame(
    {"id": X_test["id"], "target": final_model.predict(X_test)}
).to_csv("../../submissions/May_2022/submission_xgboost_full.csv", index=False)