pip install xgboost

In [330]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("submission.csv")

def label_encoding(series: pd.Series) -> pd.Series:
    my_dict = {}
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

for col in label_columns:
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train) :][col]

In [331]:
true_count = df_train['is_converted'].sum()
false_count = len(df_train) - true_count
min_count = min(true_count, false_count)
true_data = df_train[df_train['is_converted'] == True].sample(n=min_count, random_state=400)
false_data = df_train[df_train['is_converted'] == False].sample(n=min_count, random_state=400)
df_balanced = pd.concat([true_data, false_data])

x_train, x_val, y_train, y_val = train_test_split(
    df_balanced.drop(["id_strategic_ver", "it_strategic_ver", "idit_strategic_ver", "customer_country.1"], axis=1),
    df_balanced["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

In [332]:
'''model = XGBClassifier()
model_param_grid = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.1, 0.15, 0.2, 0.3],
    'max_depth': [6, 8, 10, 12]
}'''
model = RandomForestClassifier()

model_param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

model_grid = GridSearchCV(model, param_grid=model_param_grid, scoring="accuracy", n_jobs=-1, verbose=1)
model_grid.fit(x_train.fillna(0), y_train)
model = model_grid.best_estimator_
model.fit(x_train.fillna(0), y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [333]:
feature_importances = model.feature_importances_

feature_importance_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': feature_importances})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

correlation_matrix = x_train.corr()

correlation_with_conversion = correlation_matrix['is_converted'].abs().sort_values(ascending=False)

top_correlation_features = correlation_with_conversion.index[1:11]

top_importance_features = feature_importance_df['Feature'].iloc[1:11].tolist()

selected_features = list(set(top_importance_features) | set(top_correlation_features))

x_train_selected = x_train[selected_features]
x_val_selected = x_val[selected_features]
x_test_selected = df_test[selected_features]

model.fit(x_train_selected.fillna(0), y_train)

In [334]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("Confusion Matrix:\n", confusion)
    print("\nAccuracy: {:.4f}".format(accuracy))
    print("Precision: {:.4f}".format(precision))
    print("Recall: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))


pred = model.predict(x_val_selected.fillna(0))
print(selected_features)
get_clf_eval(y_val, pred)

['response_corporate', 'lead_desc_length', 'business_subarea', 'customer_type', 'historical_existing_cnt', 'ver_win_rate_x', 'com_reg_ver_win_rate', 'product_subcategory', 'customer_country', 'ver_win_ratio_per_bu', 'enterprise', 'customer_idx', 'lead_owner', 'business_unit']
Confusion Matrix:
 [[914  60]
 [ 78 888]]

Accuracy: 0.9289
Precision: 0.9214
Recall: 0.9384
F1: 0.9298


In [335]:
x_test = x_test_selected.drop(["is_converted", "id"], axis=1)

test_pred = model.predict(x_test_selected.fillna(0))
sum(test_pred)

df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub.to_csv("submission.csv", index=False)


KeyError: "['is_converted', 'id'] not found in axis"