# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier

### 데이터 셋 읽어오기

In [None]:
# Load the labelled training data and the submission file, whose rows are the
# test data to be predicted.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "submission.csv")
)

In [None]:
df_train.head() # Preview the first rows of the training data

## 2. 데이터 전처리

In [None]:
# Experimental feature drops, currently disabled. To re-enable one, drop the
# column from BOTH frames so train and test keep the same schema:
#   df_train = df_train.drop(col, axis=1)
#   df_test = df_test.drop(col, axis=1)
# Candidate columns: 'bant_submit', 'inquiry_type', 'ver_win_ratio_per_bu',
# 'business_area', 'ver_cus', 'idit_strategic_ver', 'it_strategic_ver',
# 'id_strategic_ver'.

# NOTE: this cell used to evaluate
#   df_train[~df_train['customer_country'].str.contains('@', na=False)]
# (and the same for df_test) without assigning the result, so it never removed
# any row. Those no-op expressions are dropped. The e-mail filter that actually
# takes effect is the one in the country-normalisation cell, which filters the
# training frame only.

# Merge the 'LGEBT' corporate code into 'LGEPT'. Applied to both frames so the
# test set cannot carry a category the training set no longer contains.
for frame in (df_train, df_test):
    frame['response_corporate'] = frame['response_corporate'].replace('LGEBT', 'LGEPT')

# Drop the 'CM' business unit. .copy() makes the filtered frames independent so
# the in-place writes in the following cells do not act on a slice of the
# original frame.
# NOTE(review): this also removes rows from the submission frame - confirm the
# submission format tolerates missing rows.
df_train = df_train[df_train['business_unit'] != 'CM'].copy()
df_test = df_test[df_test['business_unit'] != 'CM'].copy()

In [None]:
# customer_type clean-up, applied identically to the train and test frames.
# Maps each variant spelling to its canonical label. No canonical label is
# itself a key, so a single replace() pass is enough.
customer_type_aliases = {
    'End-Customer': 'End-user',
    'End Customer': 'End-user',
    'Commercial end-user': 'End-user',
    'Specifier/ Influencer': 'Influencer',
    'Specifier / Influencer': 'Influencer',
    'Others': 'Other',
    'Home Owner': 'Homeowner',
    'Installer/Contractor': 'Installer',
    'Distributor': 'Dealer/Distributor',
    # Add further duplicate spellings here as they are discovered.
}

for frame in (df_train, df_test):
    # Rows flagged ver_cus == 1 are labelled 'End-user' regardless of the
    # recorded customer_type.
    frame.loc[frame['ver_cus'] == 1, 'customer_type'] = 'End-user'
    # Assign the result back instead of calling replace(..., inplace=True) on
    # frame['customer_type']: the chained in-place form is deprecated and does
    # not modify the frame when pandas Copy-on-Write is active.
    frame['customer_type'] = frame['customer_type'].replace(customer_type_aliases)

In [None]:
import pycountry

# Country names known to pycountry; used to validate parsed country strings.
countries = [country.name for country in pycountry.countries]

# Drop training rows whose customer_country contains an e-mail address.
# Only the training frame is filtered; test rows are all kept.
# .copy() keeps the column writes below off a slice of the previous frame.
df_train = df_train[~df_train['customer_country'].str.contains('@', na=False)].copy()

# Mapping from response_corporate code to a fallback country name.
country_dict = {
    "LGERA": "Russia",
    "LGEUR": "Ukraine",
    "LGEAP": "Australia",
    "LGECH": "China",
    "LGEHK": "China",
    "LGEIL": "India",
    "LGEIN": "Indonesia",
    "LGEJP": "Japan",
    "LGEKR": "Korea",
    "LGEML": "Malaysia",
    "LGEPH": "Philippines",
    "LGESL": "Singapore",
    "LGETT": "Taiwan, Province of China",
    "LGETH": "Thailand",
    "LGEAR": "Argentina",
    "LGECZ": "Czechia",
    "LGEFS": "France",
    "LGEDG": "Germany",
    "LGEAG": "Austria",
    "LGEHS": "Greece",
    "LGEMK": "Hungary",
    "LGEIS": "Italy",
    "LGEBN": "Netherlands",
    "LGEPL": "Poland",
    "LGEPT": "Portugal",
    "LGERO": "Romania",
    "LGEES": "Spain",
    "LGESW": "Sweden",
    "LGEUK": "United Kingdom",
    "LGEAS": "Algeria",
    "LGEEG": "Egypt",
    "LGELF": "Jordan",
    "LGEMC": "Morocco",
    "LGESA": "The Republic of South Africa",
    "LGEGF": "United Arab Emirates",
    "LGEAF": "United Arab Emirates",
    "LGETK": "Turkiye",
    "LGECI": "Canada",
    "LGEMX": "Mexico",
    "LGEMS": "Mexico",
    "LGEUS": "United States",
    "LGECL": "Chile",
    "LGECB": "Colombia",
    "LGEPS": "Panama",
    "LGEPR": "Peru",
    "LGESJ": "Saudi Arabia",
    "LGESP": "Brazil",
    "LGEEF": "Kenya",
    "LGEYK": "Palestine, State of",
    "LGEEB": "en_EU",
    "LGEVH": "Viet Nam",
    "LGELA": "Latvia",
    "LGEIR": "Iran, Islamic Republic of",
    "LGEBT": "Portugal",
}

for frame in (df_train, df_test):
    # Missing customer_country becomes '//' so the split below yields an empty
    # last segment. fillna + assignment replaces the chained
    # frame[col][mask] = ... form, which pandas cannot guarantee writes back
    # to the frame (and never does under Copy-on-Write).
    frame['customer_country'] = frame['customer_country'].fillna('//')

    # Treat ',' like '/', then keep the last segment, stripped of whitespace.
    frame['country'] = (
        frame['customer_country']
        .str.replace(',', '/')
        .str.split('/')
        .str[-1]
        .str.strip()
    )

    # Keep the parsed value only when it is a known country name, else ''.
    frame['customer_country2'] = np.where(
        frame['country'].isin(countries), frame['country'], ''
    )

    # Fall back to the country of the responding subsidiary when no valid
    # country was parsed. The lookup is vectorised with map(); an unknown code
    # only raises when a row actually needs the fallback, instead of failing
    # on every row as country_dict[x] inside apply() did.
    needs_fallback = frame['customer_country2'] == ''
    fallback = frame['response_corporate'].map(country_dict)
    unknown_codes = frame.loc[needs_fallback & fallback.isna(), 'response_corporate'].unique()
    if len(unknown_codes) > 0:
        raise KeyError(
            f"response_corporate codes missing from country_dict: {list(unknown_codes)}"
        )
    frame['customer_country3'] = np.where(
        needs_fallback, fallback, frame['customer_country2']
    )

In [None]:
# expected_timeline: collapse variant spellings and free-text notes into the
# canonical buckets. Each entry is (canonical value, raw values to rewrite);
# groups are applied in order to both the train and test frames.
timeline_groups = [
    ('less than 3 months', [
        'less than 3 months',
        '3 months',
        'less_than_3_months',
        'less than 3 months ,meeting with the customer for the more details and tentative boq will ne 32 and 43',
        'less than 3 months. customer not answered . to call back',
        'less than 3 months- outdoor led requiment',
        'duplicate lead - il220100042906. discussed with client. their vc hall is under development. onequick details mailed to client. they will call us for demo and purchase finalization once their vc place is ready.',
        'duplicate lead',
        'same as lead no  il220300046498 hence dropping. duplicate lead.',
        'duplicate lead - il220100042906. less than 3 months',
    ]),
    ('3 months ~ 6 months', [
        'less than 6 months',
        'less then 6 months',
        '3 months ~ 6 months',
        '3_months_~*6_months',
        # NOTE(review): the '*' above looks like a markdown-mangled '_'; the
        # plain underscore spelling is added as well - confirm against the data.
        '3_months_~_6_months',
    ]),
    ('6 months ~ 9 months', [
        '6 months ~ 9 months',
        '6_months*~_9_months',
        '6_months_~_9_months',  # NOTE(review): presumed real spelling - confirm
    ]),
    ('9 months ~ 1 year', [
        '9 months ~ 1 year',
        '9 months - 1 year',
        '9_months*~*1_year',
        '9_months_~_1_year',  # NOTE(review): presumed real spelling - confirm
    ]),
    ('more than a year', [
        'more than a year',
        'more_than_a_year',
    ]),
    # Values that carry no usable timeline are treated as missing.
    (np.nan, [
        'not require',
        'not responding',
        'more then 3 months',
        'not responding to calls',
        # These two were fused into one string by a missing comma, so neither
        # value was ever matched.
        'being followed up',
        'being followed up.',
        'repeated inquiry from client. he was just inquiring but have not confimred on purchase. we have tried mulitple times but client is not interested in buying.',
        'client not interested in product.. receing call and not answering properly',
        'drop, not intrested',
        'drop, now not intrested',
    ]),
]

for canonical, variants in timeline_groups:
    for frame in (df_train, df_test):
        frame.loc[frame['expected_timeline'].isin(variants), 'expected_timeline'] = canonical

In [None]:
# customer_position: map raw position strings to canonical categories.
# Each entry is (canonical label, raw values to rewrite). Raw values are
# lower-case and canonical labels are capitalised, so no group can re-match
# the output of an earlier one. Every group is applied to both frames.
position_groups = [
    ('CEO / Founder', ['ceo/founder', 'ceo/fundador', 'founder', 'entrepreneurship']),
    ('C-Level Executive', [
        'president', 'the big boss', 'chairman', 'chief executive officer',
        'c-level executive', 'co-founder', 'c-levelexecutive',
        'leadership/executive office/owner', 'administrative',
    ]),
    ('Vice President', ['vp', 'vice president', 'vicepresident']),
    # This group was previously written to 'Vice President', an apparent
    # copy-paste from the group above; it now gets its own label.
    ('Partner', ['partner', 'business partner']),
    ('Director', [
        'principal & director',
        'director cum faculty at gaining apex coaching centre',
        'business unit director', 'director',
    ]),
    ('Manager', ['av management', 'manager', 'gerente']),
    ('Entry Level', ['entry level', 'entrylevel']),
    ('Others', ['other - please specify - cedia association', 'others', 'other']),
    ('Educator', [
        'chemistry teacher', 'education professional', 'education', 'teacher',
        'senior lecturer', 'science teacher', 'educator',
        'physics and mathematics teacher', 'physics teacher', 'maths lecturer',
        'professor of mathematics', 'guest faculty', 'physics faculty',
        'math and physics teacher',
        # A missing comma used to fuse this entry with the one above, so it
        # was never matched.
        'english trainer for ielts,toefl,pte,gre,sat exams.',
        'academic specialist', 'neet/ olympiad expert faculty',
        'quantitative aptitude faculty', 'professional trainer',
        'associate professor in electronics engg', 'prof.', 'professor',
        'asst prof.', 'associate professor', 'assistant professor of enlish',
        'assistant professor', 'career coach',
        'teacher/middle school coordinator',
        'academic coordinator/ post graduate teacher (accountancy, business studies)/ tgt (ict)',
        'hon dean', 'principal at oxford integrated pu science college',
        'pgt chemistry',
    ]),
    ('Consultant', ['consultant', 'consulting', 'architecture/consult', 'architect/consultant']),
    ('Sales', ['business development/sales', 'sales', 'subsidiary sales (ise)']),
    ('Hospital', [
        'tierarzt', 'surgery professional', 'radiology professional', 'főorvos',
        'hospital', 'medical imaging specialist',
    ]),
    ('Technical', ['técnico', 'technical', 'engineering', 'installer']),
    ('Research', ['pathologist', 'research', 'market intelligence/research']),
    ('Decision-Maker', [
        'decision maker', 'decision-maker', 'commercial consultant',
        'decision influencer', 'decision-influencer',
    ]),
    ('Software / Solution Provider', ['lider de desarrollo', 'software /solution provider']),
    ('Manufacture', ['manufacturer', 'medical device manufacturer']),
    ('End-user', ['end-user', 'customer']),
    # Values that are not positions at all. Previously rewritten in the
    # training frame only; now applied to the test frame too so both frames
    # share the same category set.
    ('Error', [
        'none', 'bulgaria', 'genel müdür', 'proprietário(a)', 'mindenes',
        'pgt physics', 'exhibitiontv', 'no influence', 'not applicable',
        'unpaid', 'this is a consume display requirement for home purpose.',
    ]),
]

for canonical, variants in position_groups:
    for frame in (df_train, df_test):
        frame.loc[frame['customer_position'].isin(variants), 'customer_position'] = canonical

In [None]:
# inquiry_type: map raw inquiry strings to canonical categories.
# Each entry is (canonical label, raw values to rewrite).
# Previously only the 'Others' and 'Usage or Technical Consultation' groups
# were applied to df_test; the rest touched df_train alone, leaving the test
# frame with raw spellings that no longer occur in training. Every group is
# now applied to both frames.
inquiry_groups = [
    ('Quotation or Purchase Consultation', [
        'Quotation or purchase consultation', 'quotation_or_purchase_consultation',
        'Quotation or Purchase Consultation', 'Quotation or Purchase consultation',
        'Purchase or Quotation', 'quotation_', 'Request for quotation or purchase',
        'Purchase',
    ]),
    ('Others', ['others', 'Others', 'other_', 'Etc.', 'Other', 'ETC.', 'other']),
    ('Usage or Technical Consultation', [
        'Usage or technical consultation', 'Technical Support',
        'usage or technical consultation', 'usage_or_technical_consultation',
        'Usage or Technical Consultation', 'technical_consultation',
        'Request for technical consulting', 'Technical Consultation', 'technical',
    ]),
    ('Price and Solution', [
        'tôi cần tham khảo giá và giải pháp từ LG',
        'Solicito apoyo para realizar cotizacion de los dispositivos que ofrecen en la solución\xa0One Quick:\xa0',
        'Vui lòng báo giá giúp mình sản phẩm đo thân nhiệt Xin cảm ơn',
    ]),
    ('Sales Inquiry', ['Sales Inquiry', 'sales', 'Sales inquiry']),
    ('Event Inquiry', ['Evento_SdelEstero', 'Event Inquiry']),
    ('Education', ['teach', 'for school', 'EDUCATIONAL EQUIPMENTS']),
    ('Specific Product Information', [
        'estoy buscando para Ecuador este producto LG MAGNIT micro LED, para un cliente de 138 pulgadas, con envió marítimo.',
        'Hola me pueden cotizar 19 pantallas interactivas de 100 pulgadas entregadas en Guayaquil -Ecuador.',
        'display product', 'TV interactive', 'Display Textbook and photos', 'Hotel TV products',
        'LED Signage', 'Video Wall', 'IDB', 'High inch 86 / 98 or 110', 'AIO', 'VRF',
        'Standalone', 'window facing product', 'Hospital TV',
        'Pantallas Interactivas para Clinicas(**진료소용 대화형 화면**)',
        'Preciso de um monitor médico para radiografia convencional e tomogrtafia.',
        'Probeam precio', 'One Quick:Flex', 'Pantallas Interactivas para Clinicas',
    ]),
    ('Product Information', [
        'Product Information', 'i want to know the details about it',
        'Toi muon tim hieu thong tin ky thuat, gia ca cua sp de su dung',
        'first Info and pricing',
    ]),
    ('Not Specified', [
        'Intégrateur historique du George V', '(Select ID_Needs)', 'Needs',
        'Digital platform', 'Not specified',
    ]),
]

for canonical, variants in inquiry_groups:
    for frame in (df_train, df_test):
        frame.loc[frame['inquiry_type'].isin(variants), 'inquiry_type'] = canonical

In [None]:
# customer_job: accounting and administrative groups.
# Each entry is (canonical label, raw values to rewrite); applied to both frames.
job_groups = [
    ('accounting', [
        'accounting', 'account exec/manager', 'account management', 'accounts payable',
    ]),
    ('administrative', [
        'adminisztráció', 'administración', 'amministrativo', 'it administrator',
        'project administrator', 'administration', 'admin assistant', 'admin',
        'administrative assistant', 'office manager', 'administrative',
        'facility administrator',
        # A missing comma used to fuse 'project administrator' with this
        # entry, so 'imaging administrator' was never matched.
        'imaging administrator',
        'it admin', 'systems administrator', 'network administrator',
        'platform administrator', 'pacs administrator',
    ]),
]

for canonical, variants in job_groups:
    for frame in (df_train, df_test):
        frame.loc[frame['customer_job'].isin(variants), 'customer_job'] = canonical
    
# customer_job: arts and design group, applied to both frames.
# NOTE(review): several entries ('project sales/manage', 'genera manager',
# 'developer', 'replacement tv', 'guestroom tv', 'signage for an attraction')
# are also listed under later customer_job groups. This group runs first, so
# it claims them - confirm that is the intended category.
arts_and_design_values = [
    'művészet_és_design',
    'arte y diseño',
    'arte_e_design',
    'exhibition / convention center',
    'graphic design',
    'museum / gallery',
    'kreation und design', 'kreation_und_design',
    'designere / budget',
    'sign company',
    'interior stylist', 'project sales/manage',
    'digital display vs signage need', 'master mind', 'genera manager',
    'var', 'sho lyrics', 'hardware selection',
    'replacement tv', 'guestroom tv', 'photos', 'developer',
    'signage for an attraction',
    'colorist',
    'architect ass interiores',
    'art and design',
    'designers',
    'inquiry-to-buy/contact-us test',
    # A missing comma used to fuse the next two entries into
    # 'photographerdesign', so neither value was matched.
    'photographer',
    'design',
    'design/decision maker',
    'signage manager',
    'arts_and_design',
    'interior designer',
    'designer, creative technologist',
    'design and install',
    'designer/installer',
    'creative director',
    'lead designer',
    'designer, producer',
    'sliding pictures of beauty salon',
    'producer',
    'fashion',
    'design and installation company',
    'art installation',
    'design and provide equipment',
    'arts and design',
    'graphic/color art',
    'designer',
    'artist, lead on equipment selection',
]

for frame in (df_train, df_test):
    frame.loc[frame['customer_job'].isin(arts_and_design_values), 'customer_job'] = 'arts and design'
# customer_job: business development through radiology professional.
# Each entry is (canonical label, raw values to rewrite). Groups are applied
# in the listed order to the train frame and then the test frame.
job_groups = [
    ('business development', [
        'business_development',
        'business owner',
        'business development',
    ]),
    ('community and social services', [
        'community_and_social_services',
        'community and social services',
    ]),
    ('consulting', [
        'content creation, eq consultant',
        'arquitecto/consultor',
        'technology consultant',
        'design consultant',
        'consultent',
        'consultant,cabinet fabricator',
        'consultant',
        'consulting',
        'consultant / purchaser',
    ]),
    ('curation', ['quotation curator', 'curation']),
    ('education', [
        'higher education (college & university)',
        'institute & academy',
        'teaching',
        'educator',
        'instructor',
        'teacher',
        'education', 'k12 school',
    ]),
    ('engineering', [
        'system engineer',
        'systems engineer',
        'engineering, design, and install',
        'director of engineering',
        'chief of engineering',
        'principal engineer',
        'hardware design engineer',
        'senior design engineer',
        'designer/ engineer',
        'engineering',
        'solution engineer',
        'engineering & technical',
        'sales engineering',
        'project engineer',
        'lead engineer',
        'engineering & technical executive',
        'design engineer',
        'chief engineer',
        'engineer',
        'engineering director',
    ]),
    ('entrepreneurship', ['entrepreneurship']),
    ('finance', [
        'finanzas',
        'finanzen',
        'finance executive',
        'pénzügy',
        'director of finance',
        'finance',
    ]),
    ('medical imaging specialist', [
        'spécialiste_en_imagerie_médicale',
        'medical imaging  specialist',
        'medical imaging specialist',
    ]),
    ('clinical specialist', ['clinical specialist', 'clinic']),
    ('radiology professional', [
        'profesional de radiología', 'radiology  professional',
        'radiology_professional', 'radiology professional',
    ]),
]

for canonical, variants in job_groups:
    for frame in (df_train, df_test):
        frame.loc[frame['customer_job'].isin(variants), 'customer_job'] = canonical

    
# customer_job: healthcare services, human resources and information
# technology. Each entry is (canonical label, raw values to rewrite); groups
# are applied in the listed order to the train frame and then the test frame.
# NOTE(review): 'developer' also appears in the earlier arts-and-design list,
# which runs first, so the entry here has nothing left to match.
job_groups = [
    ('healthcare services', [
        'cirugano',
        'tierarzt',
        'profesional de cirugía', 'chirurgien', 'surgery professional\u200b',
        'healthcare professionals',
        'doctor',
        'healthcare services',
        'medical solution provider',
        'főorvos',
        'surgery professional',
        'mental health',
        'healthcare_services', 'healthcare',
    ]),
    ('human resources', [
        'resource manager',
        'hr',
        'human_resources',
        'hr posting',
        'human resources',
    ]),
    ('information technology', [
        'information technology\u200b',
        'si',
        'help desk / desktop services',
        'application development',
        'software developer',
        'cloud / mobility',
        'developer',
        'software solution',
        'collaboration & web apps',
        'it',
        'it tech.',
        'it support',
        'it integrator',
        'director it',
        'site manager',
        'it specialist',
        'it director',
        'it manager',
        'director,it',
        'director of it',
        "i'm directing it",
        'it dairector',
        'it department',
        'it project lead',
        'it hardware technician',
        'it - information technology',
        'it/software',
        'office it',
        'computing & it',
        'system installer',
        'systems designer',
        'systems design',
        'information_technology',
        'deputy cio',
        'technical director',
        'informatics, touch capability',
        'helpdesk specialist',
        'head of technology',
        'av technician',
        'technology designer',
        'av tech',
        'tech service',
        'information technology',
        'developer/property',
        'system designer, integrator',
    ]),
]

for canonical, variants in job_groups:
    for frame in (df_train, df_test):
        frame.loc[frame['customer_job'].isin(variants), 'customer_job'] = canonical
    
# customer_job: legal through program and project management.
# Each entry is (canonical label, raw values to rewrite); groups are applied
# in the listed order to the train frame and then the test frame.
# NOTE(review): 'signage for an attraction', 'project sales/manage' and
# 'genera manager' also appear in the earlier arts-and-design list, which runs
# first, so those entries have nothing left to match here.
job_groups = [
    ('legal', ['legal']),
    ('marketing', [
        'ownner-marketing director',
        'product marketing',
        'technical marketing',
        'marketing operations', 'marketing executive',
        'store promotions',
        'signage for an attraction',
        'event marketing',
        'field marketing',
        'advertising',
        'advertising and promotions team',
        'marketing coordinator',
        'marketing',
    ]),
    ('media and communication', [
        'media and communication',
        'strategic communications',
        'media_e_comunicazione',
        'média_és_kommunikáció',
        'medien_und_kommunikation',
        'medios_de_comunicación',
        'media_and_communication',
        'media and communications',
        'broadcasting & media',
        'film production',
        'tv studio manager',
    ]),
    ('military and protective services', [
        'military_and_protective_services',
        'commander',
        'military and protective services',
    ]),
    ('operations', [
        'regional director of operations',
        'director of operations',
        'strategy & operations specialist',
        'operaciones',
        'üzemeltetés',
        'ops mgr',
        'equipment planner',
        'operations executive',
        'sales operations',
        'facilities and operations',
        'parts coordinator',
        'projection manager',
        'maintenance',
        'hotel tv',
        'operations manager',
        'equipment custodian',
        'operations',
    ]),
    ('product management', [
        'main end user of the product',
        'recommend (you recommend specific products or technologies for the solution)',
        'global lead of production',
        'product_management',
        'designer/pm/gc',
        'product owner',
        'product management',
    ]),
    ('program and project management', [
        'digital project manager',
        'gestión_de_proyectos',
        'program-_és_projektmenedzsment',
        'programm- und projektmanagement',
        'projektmenedzsment\tprogram and project management',
        'project manage',
        'project designer',
        'project sales/manage',
        'genera manager',
        'projectr mgmt',
        'design/install/training/support',
        'programm-_und_projektmanagement',
        'program_and_project_management',
        'planner',
        'av project manager',
        'program_and_project_manager',
        'designer/ project manager',
        'a/v project manager',
        'project manager/designer',
        'r&d project manager',
        'general manager - project manager',
        'program directors',
        'project manager / principal',
        'display our products',
        'project manager / estimator',
        'pm',
        'project coordinator',
        'project lead',
        'project facilitator',
        'program and project management',
        'project manager',
    ]),
]

for canonical, variants in job_groups:
    for frame in (df_train, df_test):
        frame.loc[frame['customer_job'].isin(variants), 'customer_job'] = canonical
    
# Purchasing: collapse every purchasing/procurement-related job title into
# 'purchasing'.
purchasing_aliases = [
    'drop, purchase maxhub',
    'general manager- purchaser',
    'purchasing',
    'buyer, coordinating',
    'replacement tv',
    'guestroom tv',
    'purchase dept',
    'designer purchaser',
    'purchsing',
    'design/purchaser',
    'purchasing supervisor',
    'procurement specialist',
    'procurment',
    'installation and purchaser',
    'purchasers',
    'sourcing/procurement',
    # The spaced spelling of the same title was previously left to the later
    # Sales cell, so one job ended up in two different categories depending on
    # whitespace. Mapping it here (this cell runs first) keeps both spellings
    # in 'purchasing'.
    'sourcing / procurement',
    'purchasing director',
    'sourcing',
    'purchase and install',
    'purchasing authority',
    'purchasing coordinator',
    'obtain quotes, process purchase',
    'planner/purchaser',
    'procurement',
    'director purchaser',
    'purchasing manager',
    'purchaser',
    'purchase',
    'purchaser, it and installer',
    'purchasing agent',
]
# Apply the same exact-match replacement to the train and test frames.
for frame in (df_train, df_test):
    is_alias = frame['customer_job'].isin(purchasing_aliases)
    frame.loc[is_alias, 'customer_job'] = 'purchasing'
    
# Quality Assurance: collapse the titles below into 'quality assurance'.
quality_assurance_aliases = [
    'quality_assurance',
    'testing and troubleshooting',
    'quality assurance',
]
# Apply the same exact-match replacement to the train and test frames.
for frame in (df_train, df_test):
    is_alias = frame['customer_job'].isin(quality_assurance_aliases)
    frame.loc[is_alias, 'customer_job'] = 'quality assurance'
    
# Real Estate: collapse the titles below into 'real estate'.
real_estate_aliases = [
    'property owner',
    'building owner',
    'architect/owner',
    'real estate',
]
# Apply the same exact-match replacement to the train and test frames.
for frame in (df_train, df_test):
    is_alias = frame['customer_job'].isin(real_estate_aliases)
    frame.loc[is_alias, 'customer_job'] = 'real estate'
    
# Research: collapse the titles below (including misspelled variants) into
# 'research'.
research_aliases = [
    'product researcher',
    'project researcher',
    'research products and prices',
    'research & development',
    'research and developement',
    'product research',
    'research and instalaltion',
    'research',
    'research/install',
]
# Apply the same exact-match replacement to the train and test frames.
for frame in (df_train, df_test):
    is_alias = frame['customer_job'].isin(research_aliases)
    frame.loc[is_alias, 'customer_job'] = 'research'
    
# Sales: collapse the titles below into 'sales'.
sales_aliases = [
    'car dealership',
    'értékesítés',
    'vendite',
    'recommender',
    'vertrieb',
    'field / outside sales',
    # NOTE(review): the unspaced spelling 'sourcing/procurement' is mapped to
    # 'purchasing' by the Purchasing cell - confirm 'sales' is intended here.
    'sourcing / procurement',
    'distributor quotation',
    'distribuidor',
    'sales',
    'sale',
    'installer/sales rep',
    'sales rep',
    'sales executive',
    'salesman',
    'retailer/installer',
    'sales manager',
    'bidder',
]
# Apply the same exact-match replacement to the train and test frames.
for frame in (df_train, df_test):
    is_alias = frame['customer_job'].isin(sales_aliases)
    frame.loc[is_alias, 'customer_job'] = 'sales'
    
# Support: collapse the titles below into 'support'.
support_aliases = [
    'support/facilitator, designer',
    'assist in serving food',
    'department secretary',
    'service coordinator',
    'post install support and service',
    'support',
]
# Apply the same exact-match replacement to the train and test frames.
for frame in (df_train, df_test):
    is_alias = frame['customer_job'].isin(support_aliases)
    frame.loc[is_alias, 'customer_job'] = 'support'
    
# Others: collapse the catch-all titles below into 'other'.
other_aliases = [
    'other',
    'egyéb',
    'autres',
    'n.a',
    'sonstiges',
    'altro',
    'var',
    'contributor',
    'otros',
    'no requirment',
    'others',
    'otro',
]
# Apply the same exact-match replacement to the train and test frames.
for frame in (df_train, df_test):
    is_alias = frame['customer_job'].isin(other_aliases)
    frame.loc[is_alias, 'customer_job'] = 'other'

In [None]:
# Drop the country columns listed below from both frames
# ('customer_country3' is kept; it is label-encoded later).
country_columns_to_drop = [
    'customer_country2',
    'country',
    'customer_country.1',
    'customer_country',
]
df_train = df_train.drop(columns=country_columns_to_drop)
df_test = df_test.drop(columns=country_columns_to_drop)

In [None]:
# Drop the ver_* / strategic-vertical / win-rate columns and the product
# detail columns from both frames so they are not used as model features.
feature_columns_to_drop = [
    'ver_cus',
    'ver_pro',
    'ver_win_rate_x',
    'ver_win_ratio_per_bu',
    'id_strategic_ver',
    'it_strategic_ver',
    'idit_strategic_ver',
    'com_reg_ver_win_rate',
    'product_subcategory',
    'product_category',
    'product_modelname',
]
df_train = df_train.drop(columns=feature_columns_to_drop)
df_test = df_test.drop(columns=feature_columns_to_drop)

### 레이블 인코딩

In [None]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Encode a categorical series as integer codes.

    Every element is first cast to ``str``. The distinct strings are sorted
    and numbered 0, 1, 2, ... in that order, and each element is replaced by
    the number assigned to its string form.

    Args:
        series: Values to encode; any dtype that can be cast to ``str``.

    Returns:
        A series with the same index holding the integer code of each element.
    """
    as_text = series.astype(str)
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

In [None]:
# Columns to label-encode. The product_* and customer_country* columns that
# were dropped in the cells above are intentionally not listed.
label_columns = [
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    "business_area",
    "business_subarea",
    "customer_country3",
]

# Stack train on top of test so both share one code table per column.
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

# Replace every listed column with its integer-encoded version.
df_all = df_all.assign(
    **{col: label_encoding(df_all[col]) for col in label_columns}
)

다시 학습 데이터와 제출 데이터를 분리합니다.

In [None]:
# df_all holds the train rows first, then the test rows; slice it once.
n_train_rows = len(df_train)
encoded_train = df_all.iloc[:n_train_rows]
encoded_test = df_all.iloc[n_train_rows:]

# Write the encoded columns back (the assignment aligns on the row index).
for col in label_columns:
    df_train[col] = encoded_train[col]
    df_test[col] = encoded_test[col]

In [None]:
# df_train = df_train.drop(['ver_cus'], axis=1)
# df_test = df_test.drop(['ver_cus'], axis=1)

### Imputer

In [None]:
# imp_test = df_test.drop(["id"], axis=1)

In [None]:
# from sklearn.impute import KNNImputer

# #임퓨터 선언(5개의 평균으로 계산하겠다)
# imputer=KNNImputer(n_neighbors=5)

# #임퓨터를 사용하여 filled_train으로 저장 이후 같은 임퓨터를 사용할때는 imputer.transform()으로 사용하면됨
# filled_train=imputer.fit_transform(df_train)
# filled_test=imputer.transform(imp_test)

# #사용하면 array값으로 나오기때문에 dataframe으로 바꿔주고 컬럼을가져옴
# df_train=pd.DataFrame(filled_train, columns=df_train.columns)
# imp_test=pd.DataFrame(filled_test, columns=imp_test.columns)
# imp_test['id'] = df_test['id']

In [None]:
# df_test = imp_test.copy()

## 2. 학습 데이터 분리

In [None]:
# # NaN 값을 새로운 범주로 추가할 열의 이름 목록
# columns_to_set_categories = ['business_unit', 'customer_type',
#                              'enterprise', 'customer_job', 'product_category',
#                              'product_subcategory', 'product_modelname',
#                              'customer_country.1', 'customer_position',
#                              'response_corporate', 'expected_timeline',
#                              'business_area', 'business_subarea',
#                              'customer_country2']

# # 각 열에 대해 NaN을 새로운 범주로 추가
# for col in columns_to_set_categories:
#     df_train[col] = df_train[col].astype('category').cat.add_categories(['NaN'])
#     df_test[col] = df_test[col].astype('category').cat.add_categories(['NaN'])

# # NaN 값을 새로운 범주로 설정
# df_train[columns_to_set_categories] = df_train[columns_to_set_categories].fillna('NaN')
# df_test[columns_to_set_categories] = df_test[columns_to_set_categories].fillna('NaN')

In [None]:
# Target column, and the feature matrix made of every other column.
y = df_train["is_converted"]
X = df_train.drop(columns=["is_converted"])

In [None]:
# Hold out 20% of the rows for validation (shuffled, fixed seed).
# NOTE(review): the split is not stratified on y, unlike the StratifiedKFold
# used inside `objective` - consider stratify=y if the target is imbalanced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=1996
)

## 3. 모델 학습

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, plot_metric
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import optuna
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer, f1_score
import seaborn as sns
import warnings;warnings.filterwarnings('ignore')
import eli5
from eli5.sklearn import PermutationImportance

In [None]:
# Wrap f1_score (positive label 1) as a scorer object; it is passed as
# `scoring` to cross_val_score and PermutationImportance below.
f1_scorer = make_scorer(f1_score, pos_label=1)

### 3-1. 모델 정의 

### Baseline

In [None]:
# Baseline model: a decision tree with default hyperparameters and a fixed seed.
model = DecisionTreeClassifier(
    random_state=1996)

### Optuna

In [None]:
import optuna


def objective(trial):
    """Optuna objective: cross-validated F1 of an LGBMClassifier.

    Samples one LightGBM hyperparameter set from ``trial`` and scores it with
    5-fold stratified cross-validation on ``X_train`` / ``y_train`` using
    ``f1_scorer``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The mean F1 score over the five folds.
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 1, 1000),
        max_depth=trial.suggest_int('max_depth', 3, 50),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 0.25, log=True),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        min_child_weight=trial.suggest_float('min_child_weight', 0.5, 4),
        min_child_samples=trial.suggest_int('min_child_samples', 1, 100),
        subsample=trial.suggest_float('subsample', 0.4, 1),
        subsample_freq=trial.suggest_int('subsample_freq', 0, 5),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.2, 1),
        num_leaves=trial.suggest_int('num_leaves', 2, 64),
    )

    classifier = LGBMClassifier(
        **search_space, device='cpu', random_state=1996, verbose=-1
    )

    # KFold suits regression; StratifiedKFold is used here for classification.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1996)
    fold_scores = cross_val_score(
        classifier, X_train, y_train, scoring=f1_scorer, cv=folds
    )

    return fold_scores.mean()

In [None]:
%%time
# Search for the hyperparameters that maximise the value returned by
# `objective`, running 100 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

In [None]:
# Table with the data of every trial Optuna has run in this study.
study.trials_dataframe()

In [None]:
# Report the score and the hyperparameters of the best trial.
best_trial = study.best_trial
print(f'Best trial: score {best_trial.value}, \nparams {best_trial.params}')

In [None]:
# Use the hyperparameter importances to decide which parameters to fix,
# then keep tuning the remaining ones.
optuna.visualization.plot_param_importances(study)

### 모델 학습

In [None]:
# Fit the baseline decision tree; missing feature values are replaced with 0.
model.fit(X_train.fillna(0), y_train)

In [None]:
# best_params = {'n_estimators': 877, 'max_depth': 36, 'learning_rate': 0.22046021777746597, 'reg_alpha': 0.1984838782873106, 'reg_lambda': 0.4240818195673152, 'min_child_weight': 1.2928732579114495, 'min_child_samples': 12, 'subsample': 0.9918561669133269, 'subsample_freq': 4, 'colsample_bytree': 0.20661434788181832, 'num_leaves': 51}

In [None]:
# Train a LightGBM classifier on the training split with the best
# hyperparameters found by the Optuna study.
best_params = study.best_params
best_model = LGBMClassifier(random_state=1996, **best_params).fit(X_train, y_train)

In [None]:
# Plot the fitted model's feature importances, largest first.
importances = pd.Series(
    best_model.feature_importances_,
    index=list(X_train.columns),
).sort_values(ascending=False)

plt.figure(figsize=(10, 8))
plt.title("Feature Importances")
sns.barplot(x=importances, y=importances.index)
plt.show()

In [None]:
# Permutation importance as an aid for feature selection.
# The tuned model is scored with F1 (f1_scorer) on the validation split.
perm = PermutationImportance(
    best_model,
    scoring=f1_scorer,
    random_state=1996,
    n_iter=3,
)
perm.fit(X_val, y_val)
eli5.show_weights(perm, feature_names=X_val.columns.tolist())

### 모델 성능 보기

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same rows.
    """
    label_order = [True, False]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=label_order)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=label_order)
    # Confusion matrix with True listed before False on both axes.
    confusion = confusion_matrix(y_test, y_pred, labels=label_order)

    print("오차행렬:\n", confusion)
    print(f"\n정확도: {accuracy:.4f}")
    print(f"정밀도: {precision:.4f}")
    print(f"재현율: {recall:.4f}")
    print(f"F1: {F1:.4f}")

In [None]:
# Evaluate the baseline decision tree on the validation split
# (missing values replaced with 0, as when it was fitted).
pred = model.predict(X_val.fillna(0))
get_clf_eval(y_val, pred)

In [None]:
# Evaluate the Optuna-tuned LightGBM model on the validation split.
pred = best_model.predict(X_val)
get_clf_eval(y_val, pred)

## 4. 제출하기

### 테스트 데이터 예측

In [None]:
# Features for prediction: the test frame without the target and id columns.
x_test = df_test.drop(columns=["is_converted", "id"])

In [None]:
# Predict the test rows with the baseline decision tree (NaN replaced with 0).
test_pred = model.predict(x_test.fillna(0))
sum(test_pred) # number of rows predicted True

### Optuna

In [None]:
# best_params = study.best_params
# Refit a fresh LightGBM model with the tuned hyperparameters on all labelled
# rows (X, y), then predict the test rows.
best_model = LGBMClassifier(random_state=1996, **best_params)
best_model.fit(X, y)

test_pred = best_model.predict(x_test)
sum(test_pred)  # number of rows predicted True

### 제출 파일 작성

In [None]:
# Re-read the raw submission file (df_test holds the preprocessed copy).
df_sub = pd.read_csv("submission.csv")

if len(test_pred) == len(df_sub):
    # No test rows were removed during preprocessing: assign positionally.
    df_sub["is_converted"] = test_pred
else:
    # Preprocessing filters rows out of df_test (business_unit != 'CM'), so
    # test_pred can be shorter than the raw file and a positional assignment
    # would raise a length-mismatch error. Align the predictions by id.
    # NOTE(review): assumes 'id' is unique in submission.csv - confirm.
    pred_by_id = pd.Series(test_pred, index=df_test["id"].to_numpy())
    # NOTE(review): rows that were filtered out get False (not converted) as a
    # fallback, because the model never scored them - confirm this default.
    df_sub["is_converted"] = pred_by_id.reindex(
        df_sub["id"], fill_value=False
    ).to_numpy()

# Save the submission file (overwrites submission.csv in place).
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**