In [1]:
!pip install lightgbm==4.3.0
!pip install catboost==1.2.2

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import random
import os
import torch

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [3]:
CFG = {
    'SEED':1813
}

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## 데이터 불러오기

In [5]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

## 데이터 전처리

In [6]:
def inquery_types(data: pd.DataFrame):
    # NaN to Other
    data['inquiry_type'].fillna('other', inplace=True)

    # 전부 소문자로 치환
    data['inquiry_type'] = data['inquiry_type'].str.lower()

    # _ 문자를 공백으로 치환
    data['inquiry_type'] = data['inquiry_type'].str.replace('_', ' ')

    # ~~ 상담 제거
    data['inquiry_type'] = data['inquiry_type'].str.replace(' consultation', '')
    data['inquiry_type'] = data['inquiry_type'].str.replace(' consulting', '')

    # '용법' 이란 단어와 '기술/기능'이라는 같은 의미를 가진다고 해석
    data['inquiry_type'] = data['inquiry_type'].str.replace('usage or technical', 'technical')

    # '기술 지원' 과 '기술'은 동일하다고 해석
    data['inquiry_type'] = data['inquiry_type'].str.replace('technical support', 'technical')

    # '견적' 과 '결제'는 동일하다고 해석
    data['inquiry_type'] = data['inquiry_type'].str.replace('quotation or purchase', 'purchase')
    data['inquiry_type'] = data['inquiry_type'].str.replace('purchase or quotation', 'purchase')
    data['inquiry_type'] = data['inquiry_type'].str.replace('quotation', 'purchase')
    data['inquiry_type'] = data['inquiry_type'].str.replace('purchase ', 'purchase')

    # '판매 문의'와 '판매'는 동일하다 해석
    data['inquiry_type'] = data['inquiry_type'].str.replace('sales inquiry', 'sales')

    # 기타 (Others, Other, etc.) 묶기
    data['inquiry_type'] = data['inquiry_type'].str.replace('others', 'other')
    data['inquiry_type'] = data['inquiry_type'].str.replace('etc.', 'other')
    data['inquiry_type'] = data['inquiry_type'].str.replace('other ', 'other')

    # '~~에 대한 문의'라는 문장 제거
    data['inquiry_type'] = data['inquiry_type'].str.replace('request for ', '')

    value_counts_ = data['inquiry_type'].value_counts()
    to_replace = value_counts_[value_counts_ <= 2].index
    data['inquiry_type'] = data['inquiry_type'].apply(lambda x: 'other' if x in to_replace else x)
    return data


def switch_country(df):
    world_list = ['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Central African Rep', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'East Timor', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Hong Kong', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Ivory Coast', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'North Korea', 'South Korea', 'Kosovo', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius', 'Mexico', 'Micronesia', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Romania', 'Russian Federation', 'Rwanda', 'Saint Kitts and Nevis', 'St Lucia', 'Saint Vincent & the Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Swaziland', 'Sweden', 'Switzerland', 'Syria', 'Taiwan', 'Tajikistan', 'Tanzania', 'Thailand', 'Togo', 'Tonga', 'Trinidad & Tobago', 'Tunisia', 'Turkey', 'Türkiye', 'Turkmenistan', 'Tuvalu', 'Uganda', 'Ukraine', 'U.A.E', 'UAE', 'United Kingdom', 'United States', 'US', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Vatican City', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe']

    countries = []

    for country in list(df['customer_country']):
        try:
            split_ = country.split('/')
        except:
            if type(country) == float:
                countries.append('other')
            continue

        found = False

        for c in world_list:
            if c.upper() in country.upper():
                found = True

                if c.upper() == 'UAE':
                    countries.append('U.A.E')
                elif c.upper() == 'US':
                    countries.append('United States')
                elif c == 'Türkiye':
                    countries.append('Turkey')
                else:
                    countries.append(c)
                break

        if not found:
            countries.append('other')

    df['customer_country'] = countries
    df.drop(['customer_country.1'], inplace=True, axis=1)
    return df


def customer_type(data: pd.DataFrame) -> pd.DataFrame:

    # 소문자 치환
    data['customer_type'] = data['customer_type'].str.lower()

    # - 문자 제거
    data['customer_type'] = data['customer_type'].str.replace('-', ' ')

    # '기타' 통합
    data['customer_type'] = data['customer_type'].str.replace('others', 'other')
    data['customer_type'] = data['customer_type'].str.replace('etc.', 'other')

    # '/' 문자 삭제
    data['customer_type'] = data['customer_type'].str.replace('/', ' ')
    data['customer_type'] = data['customer_type'].str.replace('  ', ' ')
    data['customer_type'] = data['customer_type'].str.replace('  ', ' ')

    # 'Home owner'와 'Homeowner' 통합
    data['customer_type'] = data['customer_type'].str.replace('homeowner', 'home owner')
    return data


def product_category(data: pd.DataFrame):
    # 소문자로 치환
    data['product_category'] = data['product_category'].str.lower()

    # Other 처리 (다국어로 되어 있는 경우가 있음)
    data['product_category'] = data['product_category'].replace(
        ['outros', 'lainnya', 'otros', 'others', 'etc.', 'autre', 'khác', 'ฯลฯ'], 'other')
    data['product_category'] = data['product_category'].replace(['אחר'], 'other')

    # lg 제거 ('lg one'과 'one'의 결합)
    data['product_category'] = data['product_category'].str.replace('lg ', '')

    # 복수형 삭제
    data['product_category'] = data['product_category'].replace(
        ['medical displays', 'medical display', 'medical monitors', 'medical  surgical'], 'medical monitor')
    data['product_category'] = data['product_category'].str.replace('medical  surgical', 'medical monitor')

    # : 문자 제거
    data['product_category'] = data['product_category'].str.replace(':', '')
    data['product_category'] = data['product_category'].str.replace('-', ' ')

    # onequick 합치기
    data['product_category'] = data['product_category'].str.replace('onequick series', 'onequick')
    data['product_category'] = data['product_category'].str.replace('one quickflex', 'onequick')
    data['product_category'] = data['product_category'].str.replace('onequick flex', 'onequick')
    data['product_category'] = data['product_category'].str.replace('one quick works', 'onequick')

    # VRF 합치기
    data['product_category'] = data['product_category'].str.replace('نظام التدفق المتغيرvrf', 'vrf')
    data['product_category'] = data['product_category'].replace(
        ['multi v5 vrf', 'multi v 5 air', 'vrf   multi v s', 'all vrf systems', 'kimatyzacja vrf', 'multi v',
         'multi v water 5', 'ahu', 'điều hòa trung tâm vrf'], 'vrf')

    # commercial displays -> commercial display
    data['product_category'] = data['product_category'].str.replace('commercial displays', 'commercial display')

    # signage care solutions => signage care solution
    data['product_category'] = data['product_category'].replace(['signage care solutions'], 'signage care solution')

    # TV 품명으로 되어 있는 데이터 합치기
    # 1. Commercial Displays
    data['product_category'] = data['product_category'].replace(['49vl5f', '49vl5g m', 'ur640s', 'ur640'],
                                                                'commercial display')

    # 2. Commercial TV
    data['product_category'] = data['product_category'].str.replace('ur640s', 'commercial tv')
    data['product_category'] = data['product_category'].str.replace('comercial tv', 'commercial tv')
    data['product_category'] = data['product_category'].replace(
        ['50uq801c0sb.bwz', '50us660h0sd.bwz', '55uq801c0sb.bwz', '55us660h0sd.bwz', '43uq751c0sb.bwz',
         '43uq751c0sf.bwz'], 'commercial tv')

    # 3. Monitor
    data['product_category'] = data['product_category'].replace(['28mq780'], 'monitor')

    # 4. Projecter
    data['product_category'] = data['product_category'].replace(['bu50nst'], 'projector')

    # 5. Led signage
    data['product_category'] = data['product_category'].replace(['gsca046', 'gscd046', 'gscd100'], 'led signage')

    # UHD signage
    data['product_category'] = data['product_category'].replace(['86uh5f', '98uh5e'], 'uhd signage')

    # video wall
    data['product_category'] = data['product_category'].replace(['55vm5e a', '55vm5j h', '55svh7f a', '49vl5g m.awzm'],
                                                                'video wall')

    # 중국어 -> 영어
    data['product_category'] = data['product_category'].replace(['oled 顯示屏'], 'oled signage')
    data['product_category'] = data['product_category'].replace(['led 顯示屏'], 'led signage')
    data['product_category'] = data['product_category'].replace(['互動式顯示屏'], 'interactive signage')

    # 주택용 에어컨, 주거용 에어컨
    data['product_category'] = data['product_category'].replace(
        ['ar condicionado residencial', 'ac rumah', 'climatiseur résidentiel', 'เครื่องปรับอากาศเผื่อที่อยู่อาศัย',
         'aire acondicionado residencial', 'điều hòa gia dụng'], 'residential air conditioner')

    data['product_category'] = data['product_category'].replace(['מזגנים למקום מגורים'], 'residential air conditioner')

    # 호텔용 티비 (酒店電視 - 중국어)
    data['product_category'] = data['product_category'].replace(
        ['hoteleria_us670h', '酒店電視', '43us660h0sd.awz', 'hospitality', 'htv', '43us660h (na)'], 'hotel tv')

    # high brightness signage (high brightness signage 부류 중 제일 많음)
    data['product_category'] = data['product_category'].replace(['high brightness', '高亮度顯示屏'],
                                                                'high brightness signage')

    # Multi-split
    data['product_category'] = data['product_category'].replace(
        ['multi split (plusieurs pièces)', 'klimatyzacja multi split', 'multi inverter'], 'multi split')
    data['product_category'] = data['product_category'].replace(['פיצול מרובה'], 'multi split')

    # heating (isıtma - 터키어 난방) (ogrzewanie (pompy ciepła) - 폴란드어), (calefacción  - 스페인어) (aquecimento = 포르투칼어)
    data['product_category'] = data['product_category'].replace(
        ['isıtma', 'ogrzewanie (pompy ciepła)', 'calefacción', 'aquecimento'], 'heating')
    data['product_category'] = data['product_category'].replace(['حلول التدفئة'], 'heating')
    data['product_category'] = data['product_category'].replace(['חימום'], 'heating')

    # chiller (soğutucu - 터키어 냉각기)
    data['product_category'] = data['product_category'].replace(['soğutucu'], 'chiller')
    data['product_category'] = data['product_category'].replace(['مبرد (تشيلر)'], 'chiller')

    # hospital tv
    data['product_category'] = data['product_category'].replace(['醫院電視'], 'hospital tv')

    # standard display (standard display 부류 중 제일 많음)
    data['product_category'] = data['product_category'].replace(['標準顯示屏', 'standard', 'standard display'],
                                                                'standard signage')

    # software
    data['product_category'] = data['product_category'].replace(['軟體'], 'software solution')

    # special signage
    data['product_category'] = data['product_category'].replace(['特別顯示屏'], 'special signage')

    ## ===== 동일 모델별 분류 =====
    # interactive signage 를 interactive digital board 로 귀속 (동일한 모델명 존재) (standard display 부류 중 제일 많음)
    data['product_category'] = data['product_category'].replace(['interactive signage', '55tc3d'],
                                                                'interactive digital board')
    # idb 를 interactive digital board 로 귀속 (동일한 모델명 존재 + IDB는 interactive digital board의 약어일 가능성)
    data['product_category'] = data['product_category'].replace(['idb'], 'interactive digital board')

    # commercial tv 를 commercial display 로 귀속 (동일한 모델명 존재) (commercial display 부류 중 제일 많음)
    data['product_category'] = data['product_category'].replace(['commercial tv'], 'commercial display')

    # video wall signage 귀속 (video wall signage 부류 중 제일 많음)
    data['product_category'] = data['product_category'].replace(
        ['videwall', 'videowall', 'videowall_rmk', 'videowall signage', 'video wall', '110 + video wall'],
        'video wall signage')

    # tv 귀속 (tv 부류 중 제일 많음)
    data['product_category'] = data['product_category'].replace(['tv signage', 'tv 60"', 'tv 55"', 'tv 43 pol'], 'tv')

    # smart TV
    data['product_category'] = data['product_category'].replace(['smart tv'], 'smart tv signage')

    return data


def customer_job(data: pd.DataFrame):
    data['customer_job'] = data['customer_job'].fillna('unknown')
    data['customer_job'] = data['customer_job'].str.lower()
    data['customer_job'] = data['customer_job'].str.replace('-', '')
    data['customer_job'] = data['customer_job'].str.replace('_', '')
    data['customer_job'] = data['customer_job'].str.replace(' ', '')

    # 'information'가 포함된 카테고리를 'information technology'로 귀속
    data['customer_job'] = data['customer_job'].apply(lambda x: 'informationtechnology' if 'information' in x else x)

    # 'design'이나 'art' 포함된 카테고리를 'arts and design'로 귀속
    data['customer_job'] = data['customer_job'].apply(lambda x: 'artsanddesign' if 'design' in x or 'art' in x else x)

    # 'consult'가 포함된 카테고리를 'consulting'로 귀속
    data['customer_job'] = data['customer_job'].apply(lambda x: 'consulting' if 'consult' in x else x)

    # 'architect'가 포함된 카테고리를 'architect'로 귀속
    data['customer_job'] = data['customer_job'].apply(lambda x: 'architect' if 'architect' in x else x)

    # 'technical'가 포함된 카테고리를 'technical'로 귀속
    data['customer_job'] = data['customer_job'].apply(lambda x: 'technical' if 'technical' in x else x)

    # 'engineer'나 'engineering' 포함된 카테고리를 'engineering'로 귀속
    data['customer_job'] = data['customer_job'].apply(
        lambda x: 'engineering' if 'engineer' in x or 'engineering' in x else x)

    # 'project'나 'projekt'가 포함된 카테고리를 'programandprojectmanagement'로 귀속
    data['customer_job'] = data['customer_job'].apply(
        lambda x: 'programandprojectmanagement' if 'project' in x or 'projekt' in x else x)

    # 'marketing'가 포함된 카테고리를 'marketing'로 귀속
    data['customer_job'] = data['customer_job'].apply(lambda x: 'marketing' if 'marketing' in x else x)

    # 'purchase'가 포함된 카테고리를 'purchasing'로 귀속
    data['customer_job'] = data['customer_job'].apply(lambda x: 'purchasing' if 'purchase' in x else x)

    # 'research'가 포함된 카테고리를 'research'로 귀속
    data['customer_job'] = data['customer_job'].apply(lambda x: 'research' if 'research' in x else x)

    # 'sales'가 포함된 카테고리를 'sales'로 귀속
    data['customer_job'] = data['customer_job'].apply(lambda x: 'sales' if 'sales' in x
                                                                           or 'vendite' in x
                                                                           or 'vertrieb' in x
    else x)

    # entrepreneurship 카테고리에 owner, ceo/founder, ceo 귀속
    data['customer_job'] = data['customer_job'].str.replace('owner', 'entrepreneurship')
    data['customer_job'] = data['customer_job'].str.replace('ceo/founder', 'entrepreneurship')
    data['customer_job'] = data['customer_job'].str.replace('ceo', 'entrepreneurship')

    # education 카테고리에 귀속
    data['customer_job'] = data['customer_job'].str.replace('highereducation(college&university)', 'education')
    data['customer_job'] = data['customer_job'].str.replace('institute&academy', 'education')
    data['customer_job'] = data['customer_job'].str.replace('educator', 'education')

    # purchasing 카테고리에 buyer, purchaser 귀속
    data['customer_job'] = data['customer_job'].str.replace('buyer', 'purchasing')
    data['customer_job'] = data['customer_job'].str.replace('purchaser', 'purchasing')

    # other 카테고리에 sonstiges, otro, others 귀속
    data['customer_job'] = data['customer_job'].str.replace('sonstiges', 'other')
    data['customer_job'] = data['customer_job'].str.replace('otro', 'other')
    data['customer_job'] = data['customer_job'].str.replace('others', 'other')
    data['customer_job'] = data['customer_job'].str.replace('egyéb', 'other')
    data['customer_job'] = data['customer_job'].str.replace('othertores', 'other')

    # generalmanager 카테고리에 gm, general management 귀속
    data['customer_job'] = data['customer_job'].str.replace('gm', 'generalmanager')
    data['customer_job'] = data['customer_job'].str.replace('generalmanagement', 'generalmanager')

    # mediaandcommunication 카테고리에 medien und kommunikation, médiaéskommunikáció 귀속
    data['customer_job'] = data['customer_job'].str.replace('medienundkommunikation', 'mediaandcommunication')
    data['customer_job'] = data['customer_job'].str.replace('médiaéskommunikáció', 'mediaandcommunication')
    data['customer_job'] = data['customer_job'].str.replace('mediaecomunicazione', 'mediaandcommunication')

    # finance 카테고리에 pénzügy, finanzen 귀속
    data['customer_job'] = data['customer_job'].str.replace('pénzügy', 'finance')
    data['customer_job'] = data['customer_job'].str.replace('finanzen', 'finance')

    # reseller 카테고리에 revendedor 귀속
    data['customer_job'] = data['customer_job'].str.replace('revendedor', 'reseller')

    value_counts_ = data['customer_job'].value_counts()
    to_replace = value_counts_[value_counts_ <= 2].index
    data['customer_job'] = data['customer_job'].apply(lambda x: 'other' if x in to_replace else x)

    return data


def customer_position(data: pd.DataFrame):
    data['customer_position'] = data['customer_position'].fillna('unknown')
    data['customer_position'] = data['customer_position'].str.lower()
    data['customer_position'] = data['customer_position'].str.replace('-', '')
    data['customer_position'] = data['customer_position'].str.replace('_', '')
    data['customer_position'] = data['customer_position'].str.replace(' ', '')

    # entry level 카테고리에 intern 귀속
    data['customer_position'] = data['customer_position'].str.replace('intern', 'entrylevel')

    # other 카테고리에 others 귀속
    data['customer_position'] = data['customer_position'].str.replace('others', 'other')

    # trainee 카테고리에 unpaid 귀속
    data['customer_position'] = data['customer_position'].str.replace('unpaid', 'trainee')

    # Commercial end-user를 end user 카테고리로 묶기
    data['customer_position'] = data['customer_position'].str.replace('commercial', '')

    # ceo 카테고리에 founder와 ceo 모두 귀속
    data['customer_position'] = data['customer_position'].apply(lambda x: 'ceo' if 'founder' in x else x)

    # no influence = not applicable, none, other 하나로 묶기
    data['customer_position'] = data['customer_position'].str.replace('notapplicable', 'noinfluence')
    data['customer_position'] = data['customer_position'].str.replace('none', 'noinfluence')
    data['customer_position'] = data['customer_position'].str.replace('other', 'noinfluence')

    value_counts_ = data['customer_position'].value_counts()
    to_replace = value_counts_[value_counts_ <= 1].index
    data['customer_position'] = data['customer_position'].apply(lambda x: 'other' if x in to_replace else x)

    return data

In [7]:
def data_processing(df: pd.DataFrame) -> pd.DataFrame:
    df['historical_existing_cnt'].fillna(0, inplace=True)

    df['ver_win_ratio_per_bu'].fillna(0, inplace=True)
    df['com_reg_ver_win_rate'].fillna(0, inplace=True)
    
    df = switch_country(df)
    df = inquery_types(df)
    df = customer_type(df)
    df = product_category(df)
    df = customer_job(df)
    df = customer_position(df)
    
    df.drop(['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver', 'ver_cus', 'ver_pro'],axis=1, inplace=True)
    
    return df

In [8]:
df_train = data_processing(df_train)
df_test = data_processing(df_test)

In [9]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [10]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    "product_category",
    "product_subcategory",
    "product_modelname",
]

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [11]:
for col in label_columns:  
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train) :][col]

In [12]:
x_train = df_train.drop('is_converted', axis=1)
y_train = df_train['is_converted']

In [13]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

## Data Split

In [14]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.1,
    shuffle=True,
    random_state=100,
    stratify=df_train["is_converted"]
)

# LightGBM

In [15]:
lgb_param = {
    "objective": "binary",
    "metric": "average_precision",
    "random_state": CFG['SEED'],
    "boost_from_average":False,
    "scale_pos_weight" : 11,
    'n_estimators': 150,
    'num_leaves': 20
}

lgb_model = lgb.sklearn.LGBMClassifier(**lgb_param)

In [16]:
lgb_model.fit(x_train.fillna(0), y_train)

[LightGBM] [Info] Number of positive: 4365, number of negative: 49004
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2138
[LightGBM] [Info] Number of data points in the train set: 53369, number of used features: 22


In [17]:
lgb_pred = lgb_model.predict(x_val.fillna(0))
get_clf_eval(y_val, lgb_pred)

오차행렬:
 [[ 465   20]
 [ 303 5142]]

정확도: 0.9455
정밀도: 0.6055
재현율: 0.9588
F1: 0.7422


In [18]:
lgb_pred_proba = lgb_model.predict_proba(x_val.fillna(0))
lgb_pred_proba

array([[0.96346448, 0.03653552],
       [0.50756058, 0.49243942],
       [0.90867965, 0.09132035],
       ...,
       [0.99574171, 0.00425829],
       [0.76691877, 0.23308123],
       [0.98232405, 0.01767595]])

## CatBoost

In [19]:
catboost_param = {
    'learning_rate': 0.045, 
    'depth': 6,
    'loss_function': 'Logloss', 
    'eval_metric': 'F1',
    'random_seed': CFG['SEED'], 
    'verbose': 200, 
    'scale_pos_weight': 30,
    'iterations' : 4000,
    'early_stopping_rounds': 100,
}

catboost_model = CatBoostClassifier(**catboost_param)

In [20]:
cat_features = [i for i, col in enumerate(x_train.columns) if col in label_columns]

catboost_model.fit(x_train, 
          y_train,
          cat_features=cat_features, 
          eval_set=(x_val, y_val), 
          use_best_model=True)

0:	learn: 0.9121671	test: 0.9158466	best: 0.9158466 (0)	total: 168ms	remaining: 11m 10s
200:	learn: 0.9647323	test: 0.9645323	best: 0.9664348 (181)	total: 11.1s	remaining: 3m 30s
400:	learn: 0.9740179	test: 0.9721467	best: 0.9721467 (400)	total: 22.5s	remaining: 3m 21s
600:	learn: 0.9776924	test: 0.9738337	best: 0.9738337 (597)	total: 33.7s	remaining: 3m 10s
800:	learn: 0.9804387	test: 0.9753936	best: 0.9754269 (799)	total: 44.6s	remaining: 2m 58s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9761255116
bestIteration = 847

Shrink model to first 848 iterations.


<catboost.core.CatBoostClassifier at 0x7fa97130add0>

In [21]:
cat_pred_ = catboost_model.predict(x_val.fillna(0))
y_pred_bool = [eval(pred) for pred in cat_pred_]
get_clf_eval(y_val, y_pred_bool)

오차행렬:
 [[ 476    9]
 [ 456 4989]]

정확도: 0.9216
정밀도: 0.5107
재현율: 0.9814
F1: 0.6718


In [22]:
cat_pred_proba = catboost_model.predict(x_val.fillna(0), prediction_type='Probability')
cat_pred_proba

array([[0.84027756, 0.15972244],
       [0.25383329, 0.74616671],
       [0.92552316, 0.07447684],
       ...,
       [0.99667355, 0.00332645],
       [0.31870917, 0.68129083],
       [0.96150662, 0.03849338]])

# AdaBoost

In [23]:
ada_param = {
    'estimator': RandomForestClassifier(max_depth=6,
                                        n_estimators=200,
                                        random_state=CFG['SEED'],
                                        class_weight= {0:1, 1:17},
                                        criterion = 'entropy',
                                        n_jobs = -1),
    'n_estimators': 150,  
    'random_state': CFG['SEED'],
    'learning_rate': 0.8, 
}                                 

ada_model = AdaBoostClassifier(**ada_param)

In [24]:
ada_model.fit(x_train.fillna(0), y_train)

In [25]:
ada_pred = ada_model.predict(x_val.fillna(0))
get_clf_eval(y_val, ada_pred)

오차행렬:
 [[ 424   61]
 [  87 5358]]

정확도: 0.9750
정밀도: 0.8297
재현율: 0.8742
F1: 0.8514


In [26]:
ada_pred_proba = ada_model.predict_proba(x_val.fillna(0))
ada_pred_proba

array([[0.51828322, 0.48171678],
       [0.51300014, 0.48699986],
       [0.51925225, 0.48074775],
       ...,
       [0.53466456, 0.46533544],
       [0.49861486, 0.50138514],
       [0.52266327, 0.47733673]])

# AdaBoost, LightGBM, CatBoost를 통한 앙상블 모델

In [27]:
proba = lgb_pred_proba + cat_pred_proba + ada_pred_proba 
proba

array([[2.32202526, 0.67797474],
       [1.274394  , 1.725606  ],
       [2.35345506, 0.64654494],
       ...,
       [2.52707982, 0.47292018],
       [1.58424281, 1.41575719],
       [2.46649395, 0.53350605]])

In [28]:
final_result = proba[:, 0] <= proba[:, 1]

In [29]:
get_clf_eval(y_val, final_result)

오차행렬:
 [[ 472   13]
 [ 370 5075]]

정확도: 0.9354
정밀도: 0.5606
재현율: 0.9732
F1: 0.7114


# Submission

In [30]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
x_test.head()

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,customer_job,lead_desc_length,...,product_subcategory,product_modelname,customer_position,response_corporate,expected_timeline,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner
0,0.0,21,2,0.073248,47466,9,0,53.0,17,3,...,349,699,37,43,271,0.001183,0.04984,10,25,278
1,0.25,149,3,0.0,5405,9,1,0.0,72,23,...,349,699,37,50,271,1.3e-05,0.0,12,62,437
2,1.0,145,2,0.0,13597,24,1,0.0,36,138,...,287,232,32,18,246,6e-05,0.131148,4,37,874
3,0.5,149,2,0.118644,17204,18,0,0.0,67,3,...,58,699,37,50,267,0.001183,0.04984,10,86,194
4,1.0,21,2,0.074949,2329,9,0,2.0,26,107,...,113,627,37,43,246,0.003079,0.064566,0,28,167


In [31]:
lgb_test_pred_proba = lgb_model.predict_proba(x_test.fillna(0))
cb_test_pred_proba = catboost_model.predict(x_test.fillna(0), prediction_type='Probability')
ada_test_pred_proba = ada_model.predict_proba(x_test.fillna(0))

In [32]:
final_res = lgb_test_pred_proba + cb_test_pred_proba + ada_test_pred_proba 

In [33]:
final = final_res[:, 0] <= final_res[:, 1]
final

array([ True,  True, False, ..., False, False,  True])

In [34]:
sum(final)

1935

In [35]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = final

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)