pip install xgboost

In [86]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Input files: the training rows, and the rows to be scored (predictions are
# written back into the same submission file at the end of the notebook).
TRAIN_CSV = "train.csv"
TEST_CSV = "submission.csv"

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

def label_encoding(series: pd.Series) -> pd.Series:
    """Replace each distinct value of ``series`` with an integer code.

    Values are first cast to ``str``; the distinct strings are then sorted and
    numbered 0, 1, 2, ... in that (lexicographic) order.

    Args:
        series: Column to encode. It is not modified.

    Returns:
        A new Series with the same index holding the integer codes.
    """
    as_text = series.astype(str)
    # Sorting makes the code assignment independent of row order.
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

# Columns that are converted to integer codes with label_encoding().
# NOTE(review): "customer_country.1" is presumably a duplicated
# "customer_country" header renamed by pandas on read — TODO confirm. It is
# encoded here but dropped again before the train/validation split.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Encode train and test together so that a given category string receives the
# same integer code in both frames.
n_train_rows = len(df_train)
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

# The frames were stacked train-first, so a positional split recovers each
# frame's share. concat kept the original row labels, which is what the
# column assignments below align on.
train_codes = df_all.iloc[:n_train_rows]
test_codes = df_all.iloc[n_train_rows:]

for col in label_columns:
    df_train[col] = train_codes[col]
    df_test[col] = test_codes[col]

In [87]:
# One seed shared by both class samples and the train/validation split.
RANDOM_STATE = 400

# Undersample the larger class so both classes contribute the same number of rows.
converted_flag = df_train["is_converted"]
true_count = converted_flag.sum()
false_count = len(df_train) - true_count
min_count = min(true_count, false_count)

true_data = df_train[converted_flag == True].sample(n=min_count, random_state=RANDOM_STATE)
false_data = df_train[converted_flag == False].sample(n=min_count, random_state=RANDOM_STATE)
df_balanced = pd.concat([true_data, false_data])

# The target plus the columns that are excluded from the feature matrix.
non_feature_columns = [
    "is_converted",
    "id_strategic_ver",
    "it_strategic_ver",
    "idit_strategic_ver",
    "customer_country.1",
]

# Hold out 20% of the balanced rows for validation.
x_train, x_val, y_train, y_val = train_test_split(
    df_balanced.drop(columns=non_feature_columns),
    df_balanced["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [88]:
# Exhaustive grid search over 3 x 4 x 4 = 48 hyper-parameter combinations,
# scored by cross-validated accuracy.
model = XGBClassifier()
model_param_grid = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.1, 0.15, 0.2, 0.3],
    'max_depth': [6, 8, 10, 12]
}

model_grid = GridSearchCV(model, param_grid=model_param_grid, scoring="accuracy", n_jobs=-1, verbose=1)
model_grid.fit(x_train, y_train)
# With the default refit=True, best_estimator_ is already fitted on x_train,
# and it is re-fitted from scratch on the reduced feature set below, so no
# additional fit on x_train is performed here.
model = model_grid.best_estimator_

# Use features with non-null values for training and prediction:
# dropna(axis=1) removes every column that has a missing value in x_train,
# and the same column subset is then taken from the validation and test frames.
x_train_non_null = x_train.dropna(axis=1)
x_val_non_null = x_val[x_train_non_null.columns]
x_test_non_null = df_test[x_train_non_null.columns]

# NOTE(review): the hyper-parameters were selected on the full feature set,
# while the final model is trained on the non-null subset — consider running
# the grid search on x_train_non_null so tuning and training agree.
model.fit(x_train_non_null, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [89]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels. The default of None is only a placeholder;
            the value is passed straight to the metric functions, so callers
            are expected to supply it.

    Returns:
        None. All results are written to standard output.
    """
    # Order the confusion matrix as True first, then False.
    class_order = [True, False]

    confusion = confusion_matrix(y_test, y_pred, labels=class_order)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=class_order)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, labels=class_order)

    print("Confusion Matrix:\n", confusion)
    report_lines = (
        ("\nAccuracy: {:.4f}", accuracy),
        ("Precision: {:.4f}", precision),
        ("Recall: {:.4f}", recall),
        ("F1: {:.4f}", f1),
    )
    for template, score in report_lines:
        print(template.format(score))

# Score the held-out validation rows with the fitted model and report the metrics.
pred = model.predict(x_val_non_null)

get_clf_eval(y_test=y_val, y_pred=pred)

Confusion Matrix:
 [[925  49]
 [ 74 892]]

Accuracy: 0.9366
Precision: 0.9259
Recall: 0.9497
F1: 0.9377


In [90]:
# Predict on the test rows using the same feature subset the model was fitted on.
test_pred = model.predict(x_test_non_null)
# Report the number of positive predictions. A bare expression in the middle of
# a cell is evaluated but never displayed, so the count is printed explicitly.
print("Predicted positives:", sum(test_pred))

# The submission file is read again instead of reusing df_test, presumably
# because the categorical columns of df_test were overwritten with integer
# codes earlier in the notebook; only the prediction column is replaced.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub.to_csv("submission.csv", index=False)
