In [15]:
!pip install xgboost
!pip install imbalanced-learn
!pip install --upgrade xgboost
!pip install catboost
!pip install category_encoders




In [16]:
!pip install --upgrade scikit-build setuptools

!pip install pycaret #pycaret 설치




In [17]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import catboost
from catboost import CatBoostClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from imblearn.combine import SMOTEENN

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Load the training table and the test table; the test rows are read from
# "submission.csv".
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "submission.csv")
)

def label_encoding(series: pd.Series) -> pd.Series:
    """Replace the values of ``series`` with integer codes.

    Every value is first converted to ``str`` (missing values are stringified
    as well, so they receive a code of their own). The distinct strings are
    sorted and numbered 0, 1, 2, ... in that order.

    Args:
        series: Column to encode; any dtype is accepted.

    Returns:
        A Series with the same index as ``series`` that holds the integer
        code of each value. The input Series is not modified.
    """
    as_text = series.astype(str)
    # Sorted order of the distinct strings defines the code of each value.
    codes = {text: code for code, text in enumerate(sorted(as_text.unique()))}
    return as_text.map(codes)

# Columns that are passed through label_encoding() below, i.e. whose values
# are replaced by integer codes in both df_train and df_test.
# NOTE(review): "customer_country.1" looks like the name pandas assigns to a
# second "customer_country" column in the CSV — confirm against train.csv.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Encode train and test together so that a given value receives the same
# integer code in both frames.
n_train_rows = len(df_train)
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

# Split the encoded table back by position: the train rows come first,
# the test rows follow.
encoded_train = df_all.iloc[:n_train_rows]
encoded_test = df_all.iloc[n_train_rows:]

for col in label_columns:
    df_train[col] = encoded_train[col]
    df_test[col] = encoded_test[col]

# Assuming 'it' and 'id' are the columns you want to fill with 0
#columns_to_fill = ['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']

# Fill NaN values with 0 in the specified columns for both df_train and df_test
#df_train.loc[:, columns_to_fill] = df_train.loc[:, columns_to_fill].fillna(0)
#df_test.loc[:, columns_to_fill] = df_test.loc[:, columns_to_fill].fillna(0)


# Sum of "id_strategic_ver", "it_strategic_ver", "idit_strategic_ver"
#df_train["sum_strategic_ver"] = 3 * df_train["id_strategic_ver"] + 2 * df_train["it_strategic_ver"] + df_train["idit_strategic_ver"]
#df_test["sum_strategic_ver"] = 3 * df_test["id_strategic_ver"] + 2 * df_test["it_strategic_ver"] + df_test["idit_strategic_ver"]

# For each frame, in place: remove every column that contains a missing
# value, then remove the two country columns.
# NOTE(review): dropna runs on each frame separately, so train and test can
# end up with different column sets if their missing values differ — verify
# that every remaining train feature still exists in df_test.
for frame in (df_train, df_test):
    frame.dropna(axis=1, inplace=True)
    frame.drop(columns=['customer_country.1', 'customer_country'], inplace=True)


In [18]:

# Count the True and False rows of the target.
is_converted_col = df_train['is_converted']
true_count = is_converted_col.sum()
false_count = len(df_train) - true_count

# The size of the smaller class is the number of rows drawn from each class.
min_count = min(true_count, false_count)

# Draw the same number of True rows and False rows.
true_data = df_train[is_converted_col == True].sample(n=min_count, random_state=400)
false_data = df_train[is_converted_col == False].sample(n=min_count, random_state=400)

# Stack both samples into one class-balanced frame.
df_balanced = pd.concat([true_data, false_data])

# Build the validation set first: 20% of the balanced rows.
val_size = int(len(df_balanced) * 0.2)
val_set = df_balanced.sample(n=val_size, random_state=400)

# Separate the validation features from the validation target.
y_val = val_set["is_converted"]
x_val = val_set.drop(columns="is_converted")

# The training set is every row of the original (not re-balanced) data
# that was not selected for validation.
train_set = df_train.drop(index=val_set.index)

# Show the class counts of both splits.
for heading, split in (("Train set:", train_set), ("Validation set:", val_set)):
    print(heading)
    print(split['is_converted'].value_counts())


Train set:
False    53483
True      3876
Name: is_converted, dtype: int64
Validation set:
True     974
False    966
Name: is_converted, dtype: int64


In [19]:

from imblearn.under_sampling import TomekLinks, RandomUnderSampler
import pandas as pd


# Clean the training data by applying Tomek links.
tl = TomekLinks()
train_features = train_set.drop(columns="is_converted")
train_target = train_set["is_converted"]
x_train_resampled, y_train_resampled = tl.fit_resample(train_features, train_target)

# Equalise the classes: request as many rows of class 0 as there are rows of
# class 1. The keys 0 and 1 address the False and True labels, because Python
# treats False == 0 and True == 1 as equal dictionary keys.
n_converted = y_train_resampled.sum()
rus = RandomUnderSampler(
    sampling_strategy={0: int(n_converted), 1: n_converted},
    random_state=42,
)
x_train, y_train = rus.fit_resample(x_train_resampled, y_train_resampled)

# Check the result.
print("Train set after Tomek Links and balancing classes:")
print(pd.Series(y_train).value_counts())


Train set after Tomek Links and balancing classes:
False    3876
True     3876
Name: is_converted, dtype: int64


In [20]:
# Feature matrices given to the model: the NaN-free columns of the training
# set, and the same columns of the validation and test sets.
# They are built BEFORE fitting so that training, early stopping and the later
# predictions all use one identical column set.
x_train_non_null = x_train.dropna(axis=1)
x_val_non_null = x_val[x_train_non_null.columns]
x_test_non_null = df_test[x_train_non_null.columns]

# Disabled experiment (kept for reference):
#columns_to_drop = ["it_strategic_ver"]
#df_train.drop(columns=columns_to_drop, inplace=True)

# NOTE(review): cat_features is given as column POSITIONS (1, 2, 3) of the
# training frame, so it silently points at other columns if the column order
# changes — confirm which columns are meant.
model = CatBoostClassifier(
    iterations=500,
    depth=10,
    learning_rate=0.05,
    loss_function='Logloss',
    cat_features=[1, 2, 3],
)

# Training stops early when the validation loss has not improved for 50
# rounds; progress is logged every 100 iterations.
# The trailing semicolon keeps the notebook from echoing the fitted model.
model.fit(
    x_train_non_null,
    y_train,
    eval_set=(x_val_non_null, y_val),
    early_stopping_rounds=50,
    verbose=100,
);

0:	learn: 0.6243931	test: 0.6220990	best: 0.6220990 (0)	total: 22.9ms	remaining: 11.4s
100:	learn: 0.1366518	test: 0.1834555	best: 0.1834555 (100)	total: 2.09s	remaining: 8.26s
200:	learn: 0.0955829	test: 0.1686498	best: 0.1686498 (200)	total: 4.13s	remaining: 6.14s
300:	learn: 0.0725573	test: 0.1607623	best: 0.1607623 (300)	total: 6.05s	remaining: 4s
400:	learn: 0.0562815	test: 0.1577633	best: 0.1575123 (366)	total: 7.93s	remaining: 1.96s
499:	learn: 0.0448987	test: 0.1570372	best: 0.1566673 (481)	total: 9.81s	remaining: 0us

bestTest = 0.1566672665
bestIteration = 481

Shrink model to first 482 iterations.


In [21]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: True labels (boolean).
        y_pred: Predicted labels, in the same order as ``y_test``. The
            ``None`` default only exists for signature compatibility; a
            prediction must always be supplied.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        raise ValueError("get_clf_eval() requires y_pred")

    # Rows and columns are ordered True first, then False.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # Precision, recall and F1 use scikit-learn's default average="binary",
    # which scores the positive class only (pos_label=1, i.e. True for boolean
    # labels). The `labels` argument is not used in that mode, so it is left
    # out and all three metrics are called the same way.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)

    print("Confusion Matrix:\n", confusion)
    print("\nAccuracy: {:.4f}".format(accuracy))
    print("Precision: {:.4f}".format(precision))
    print("Recall: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

# Predict the validation labels.
pred = model.predict(x_val_non_null)
# Turn the predictions into a flat boolean array. Comparing the string form
# works whether predict() returns the labels as 'True'/'False' strings or as
# real booleans; a direct `label == 'True'` test would silently yield
# all-False for boolean output.
pred = np.asarray(pred).astype(str).ravel() == 'True'

get_clf_eval(y_val, pred)

Confusion Matrix:
 [[918  56]
 [ 72 894]]

Accuracy: 0.9340
Precision: 0.9273
Recall: 0.9425
F1: 0.9348


In [22]:
# Predict the test labels and turn them into a flat boolean array. Comparing
# the string form works whether predict() returns 'True'/'False' strings or
# real booleans.
test_pred = model.predict(x_test_non_null)
test_pred = np.asarray(test_pred).astype(str).ravel() == 'True'

# "submission.csv" serves as both template and output: it is read, its
# is_converted column is replaced by the predictions, and it is written back.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub.to_csv("submission.csv", index=False)

# Number of test rows predicted as converted, shown as the cell's result.
int(test_pred.sum())
