In [1]:
!pip install xgboost
!pip install imbalanced-learn
!pip install --upgrade xgboost
!pip install catboost
!pip install category_encoders




In [2]:
!pip install --upgrade scikit-build setuptools

!pip install pycaret #pycaret 설치




In [3]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import catboost
from catboost import CatBoostClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from imblearn.combine import SMOTEENN

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Load the two input tables from the working directory.
# df_train carries the `is_converted` target used for fitting; df_test holds the
# rows that are scored at the end of the notebook and written to submission.csv.
df_train = pd.read_csv("lowercase_train.csv")
df_test = pd.read_csv("submission_lowercase.csv")

def label_encoding(series: pd.Series) -> pd.Series:
    """Map every distinct value of ``series`` to an integer code.

    Values are first converted to strings; the distinct strings are then sorted
    lexicographically and numbered 0, 1, 2, ... in that order.

    Args:
        series: Column to encode. Any dtype is accepted because values are
            stringified before encoding.

    Returns:
        A Series with the same index as ``series`` holding the integer codes.
    """
    as_text = series.astype(str)
    # Sorting the string forms means numbers order as text, e.g. "10" < "2".
    code_of = {category: code for code, category in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

# Columns that are label-encoded (jointly over train and test) later in the notebook
# and offered to CatBoost as categorical features.
# Note: "customer_country.1" is encoded like the others but is dropped from both
# frames right after encoding, so it never reaches the model.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]



In [4]:
# Clean customer_country in both frames:
#   1. keep only the text after the last '/', trimmed of surrounding whitespace
#   2. blank out values that contain a digit or one of the listed punctuation characters
for frame in (df_train, df_test):
    last_segment = frame["customer_country"].str.split('/').str[-1]
    frame["customer_country"] = last_segment.str.strip()
    rows_to_remove = frame["customer_country"].str.contains('[0-9!@#$%^&*(),.?":{}|<>]', na=False, regex=True)
    frame.loc[rows_to_remove, "customer_country"] = ''

In [5]:
# Clean customer_job in both frames: keep only the text before the first '/', trimmed.
# (Author's note: not sure yet whether this step should be used.)
for frame in (df_train, df_test):
    first_segment = frame["customer_job"].str.split('/').str[0]
    frame["customer_job"] = first_segment.str.strip()

In [6]:
# inquiry_type has many values that occur only once
# and there are three near-identical labels ("other", "others", "other_") <- do we need all three?

In [7]:
# Clean product_category in both frames: delete every character that is not an
# ASCII letter, a digit or whitespace.
# The pattern is a raw string on purpose: in a plain string literal "\s" is an
# invalid escape sequence, which recent Python versions warn about. The regex
# that pandas receives is unchanged.
for frame in (df_train, df_test):
    frame["product_category"] = frame["product_category"].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [8]:
# Clean product_modelname in both frames:
#   1. remove parenthesised text "( ... )" and trim
#   2. remove every '-' together with the non-space characters that follow it, and trim
for frame in (df_train, df_test):
    without_parens = frame["product_modelname"].str.replace(r'\([^)]*\)', '', regex=True).str.strip()
    frame["product_modelname"] = without_parens.str.replace(r'-[^ ]*', '', regex=True).str.strip()


In [9]:
# Clean expected_timeline in both frames:
#   1. underscores become spaces and trailing '.' characters are removed
#   2. only the first three whitespace-separated words are kept
#   3. everything from the first '.', ',' or '/' onwards is cut off
for frame in (df_train, df_test):
    timeline = frame["expected_timeline"].str.replace('_', ' ').str.rstrip('.')
    timeline = timeline.str.split().str[:3].str.join(' ')
    frame["expected_timeline"] = timeline.str.replace(r'[.,/].*', '', regex=True)


In [10]:
# Label-encode the categorical columns over train and test together, so a given
# category receives the same integer code in both frames.
n_train = len(df_train)
df_all = pd.concat([df_train[label_columns], df_test[label_columns]], ignore_index=True)

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])
    # Write the codes back by position rather than by index label. The concat
    # stacks all train rows first and all test rows second, so slicing the raw
    # array cannot misalign, even if either frame carries a non-default index.
    encoded = df_all[col].to_numpy()
    df_train[col] = encoded[:n_train]
    df_test[col] = encoded[n_train:]

# Disabled experiment, kept for reference: fill the strategic-version flags with 0
# and combine them into one weighted "sum_strategic_ver" feature.
#columns_to_fill = ['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']

# Fill NaN values with 0 in the specified columns for both df_train and df_test
#df_train.loc[:, columns_to_fill] = df_train.loc[:, columns_to_fill].fillna(0)
#df_test.loc[:, columns_to_fill] = df_test.loc[:, columns_to_fill].fillna(0)


# Sum of "id_strategic_ver", "it_strategic_ver", "idit_strategic_ver"
#df_train["sum_strategic_ver"] = 3 * df_train["id_strategic_ver"] + 2 * df_train["it_strategic_ver"] + df_train["idit_strategic_ver"]
#df_test["sum_strategic_ver"] = 3 * df_test["id_strategic_ver"] + 2 * df_test["it_strategic_ver"] + df_test["idit_strategic_ver"]

# Replace every remaining missing value with 0 and drop 'customer_country.1'
# (the '.1' suffix suggests a duplicated CSV header -- TODO confirm).
# Rebinding instead of inplace=True keeps each step a plain expression.
#df_train.dropna(axis=1, inplace=True)
df_train = df_train.fillna(0).drop(columns=['customer_country.1'])

#df_test.dropna(axis=1, inplace=True)
df_test = df_test.fillna(0).drop(columns=['customer_country.1'])

In [11]:

# Count the converted (True) and non-converted (False) rows.
true_count = df_train['is_converted'].sum()
false_count = len(df_train) - true_count

# The smaller of the two counts is the number of rows drawn from each class.
min_count = min(true_count, false_count)

# Draw the same number of rows from each class ...
converted_mask = df_train['is_converted'].eq(True)
not_converted_mask = df_train['is_converted'].eq(False)
true_data = df_train[converted_mask].sample(n=min_count, random_state=400)
false_data = df_train[not_converted_mask].sample(n=min_count, random_state=400)

# ... and stack them into one class-balanced frame.
df_balanced = pd.concat([true_data, false_data])

# Build the validation set first: 20% of the balanced frame.
val_size = int(len(df_balanced) * 0.2)
val_set = df_balanced.sample(n=val_size, random_state=400)

# Split the validation set into features and target.
y_val = val_set["is_converted"]
x_val = val_set.drop(columns="is_converted")

# The training set is every row of the original (unbalanced) frame that was
# not selected for validation.
train_set = df_train.drop(val_set.index)

# Report the class distribution of both sets.
for title, subset in (("Train set:", train_set), ("Validation set:", val_set)):
    print(title)
    print(subset['is_converted'].value_counts())


Train set:
False    53483
True      3876
Name: is_converted, dtype: int64
Validation set:
True     974
False    966
Name: is_converted, dtype: int64


In [12]:

from imblearn.under_sampling import TomekLinks, RandomUnderSampler
import pandas as pd


# Clean the training set with Tomek links.
tl = TomekLinks()
train_features = train_set.drop(columns="is_converted")
train_target = train_set["is_converted"]
x_train_resampled, y_train_resampled = tl.fit_resample(train_features, train_target)

# Balance the classes: under-sample class 0 down to the number of class-1 rows.
n_converted = y_train_resampled.sum()
rus = RandomUnderSampler(
    sampling_strategy={0: int(n_converted), 1: n_converted},
    random_state=42,
)
x_train, y_train = rus.fit_resample(x_train_resampled, y_train_resampled)

# Check the result.
print("Train set after Tomek Links and balancing classes:")
print(pd.Series(y_train).value_counts())


Train set after Tomek Links and balancing classes:
False    3876
True     3876
Name: is_converted, dtype: int64


In [13]:
# CatBoost is fitted on string class labels ("True" / "False"); predictions are
# converted back to booleans downstream.
y_train_str = y_train.astype(str)
y_val_str = y_val.astype(str)

# Categorical features = label-encoded columns still present in x_train, listed
# in DataFrame column order. Building this list from a set intersection made
# its order depend on string hashing, so it could differ between sessions.
cat_feature_names = [col for col in x_train.columns if col in label_columns]

param_grid = {
    'iterations': [800],
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'loss_function': ['Logloss'],
    'cat_features': [cat_feature_names]
}

# Create the CatBoostClassifier
base_model = CatBoostClassifier()

# Use GridSearchCV for hyperparameter tuning.
# NOTE(review): the same validation set drives early stopping here and is used
# for the reported metrics afterwards, so those metrics are likely optimistic.
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train_str, eval_set=(x_val, y_val_str), early_stopping_rounds=50, verbose=100)

# Get the best parameters and best model
best_params = grid_search.best_params_
model = grid_search.best_estimator_

#columns_to_drop = ["it_strategic_ver"]
#df_train.drop(columns=columns_to_drop, inplace=True)



# Restrict the validation and test features to the training columns that contain
# no NaN. Missing values were filled with 0 earlier, so this is expected to keep
# every column; it mainly guarantees the same columns in the same order.
x_train_non_null = x_train.dropna(axis=1)
x_val_non_null = x_val[x_train_non_null.columns]
x_test_non_null = df_test[x_train_non_null.columns]

# Continue with the rest of your code

0:	learn: 0.6840213	test: 0.6843217	best: 0.6843217 (0)	total: 70.6ms	remaining: 56.4s
100:	learn: 0.3366914	test: 0.3461772	best: 0.3461772 (100)	total: 2.26s	remaining: 15.7s
200:	learn: 0.2693628	test: 0.2824037	best: 0.2824037 (200)	total: 3.6s	remaining: 10.7s
300:	learn: 0.2391926	test: 0.2551235	best: 0.2551235 (300)	total: 4.35s	remaining: 7.21s
400:	learn: 0.2215867	test: 0.2405939	best: 0.2405939 (400)	total: 5.05s	remaining: 5.03s
500:	learn: 0.2098375	test: 0.2312234	best: 0.2312234 (500)	total: 5.84s	remaining: 3.48s
600:	learn: 0.2006069	test: 0.2249723	best: 0.2249723 (600)	total: 6.64s	remaining: 2.2s
700:	learn: 0.1925184	test: 0.2197813	best: 0.2197813 (700)	total: 7.4s	remaining: 1.04s
799:	learn: 0.1861668	test: 0.2160630	best: 0.2160630 (799)	total: 8.06s	remaining: 0us

bestTest = 0.2160630166
bestIteration = 799

0:	learn: 0.6803662	test: 0.6806665	best: 0.6806665 (0)	total: 6.96ms	remaining: 5.56s
100:	learn: 0.3454724	test: 0.3514217	best: 0.3514217 (100)	total

In [14]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and headline classification metrics.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, compared element-wise against ``y_test``.

    Prints the confusion matrix (rows/columns ordered True, False), followed by
    accuracy, precision, recall and F1, each formatted to four decimals.
    Returns nothing.
    """
    class_order = [True, False]

    # Compute everything first so that nothing is printed if a metric fails.
    matrix = confusion_matrix(y_test, y_pred, labels=class_order)
    report_lines = (
        ("\nAccuracy: {:.4f}", accuracy_score(y_test, y_pred)),
        ("Precision: {:.4f}", precision_score(y_test, y_pred, labels=class_order)),
        ("Recall: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=class_order)),
    )

    print("Confusion Matrix:\n", matrix)
    for template, value in report_lines:
        print(template.format(value))

# Predict on the validation features and turn the model's string labels back
# into booleans so they can be compared with y_val.
pred = [label == 'True' for label in model.predict(x_val_non_null)]

get_clf_eval(y_val, pred)

Confusion Matrix:
 [[918  56]
 [ 66 900]]

Accuracy: 0.9371
Precision: 0.9329
Recall: 0.9425
F1: 0.9377


In [15]:
# Predict on the submission rows and turn the model's string labels back into booleans.
test_pred = [label == 'True' for label in model.predict(x_test_non_null)]

# Report how many rows are predicted as converted. A bare `sum(test_pred)` in the
# middle of a cell is evaluated and discarded, so it is printed explicitly.
print(sum(test_pred))

# Fill in the predictions and write the file back. Note that submission.csv is
# overwritten in place.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub.to_csv("submission.csv", index=False)
