# 1. 라이브러리 설치 및 호출

In [12]:
!pip install vaderSentiment
!pip install catboost
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import imblearn
from imblearn.under_sampling import RandomUnderSampler
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from difflib import SequenceMatcher
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# 2. 데이터 호출 및 전처리

## [데이터 호출]


In [7]:
data = pd.read_csv("train.csv") # 학습용 데이터
submission_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터

## [customer_country.1 수정]

In [8]:
""" customer_country.1 => country_level1 / country_level2 / country_level3로 변환 후 customer_country.1 drop"""
split_columns = data['customer_country.1'].str.replace(' ', '').str.split('/', expand=True)
split_columns2 = submission_test['customer_country.1'].str.replace(' ', '').str.split('/', expand=True)
split_columns = split_columns.iloc[:, :3]
split_columns2 = split_columns2.iloc[:, :3]
split_columns.columns = ['country_level1', 'country_level2', 'country_level3', ]
split_columns2.columns = ['country_level1', 'country_level2', 'country_level3' ]

data = pd.concat([data, split_columns], axis=1)
submission_test = pd.concat([submission_test, split_columns2], axis=1)

data = data.drop('customer_country.1', axis=1)
submission_test = submission_test.drop('customer_country.1', axis=1)

## [customer_country 수정]

In [9]:
""" 공백 전부 제거 """
data['customer_country'] = data['customer_country'].str.replace(' ', '')
submission_test['customer_country'] = submission_test['customer_country'].str.replace(' ', '')


## [product modelname 수정]

In [10]:
def preprocess_product_modelname(df):
    # 'product_modelname' 열에서 괄호와 공백 제거
    df['product_modelname'] = df['product_modelname'].str.replace(r'\([^)]*\)', '')
    df['product_modelname'] = df['product_modelname'].str.replace(' ', '')

    # 대체할 데이터 목록
    data_to_replace = [
        'TotalCareThruOne-stopService',
        'Duetobudgettheyhaveholdtherequiement',
        'RequiredAfter3Months',
        'WantSplitAC',
        'OnlyInstallationNeed',
        'Architect,WeareMeetingforEnqiryGeneration',
        'PassedontoFixxydistribution',
        'all',
        'UltraFineErgo',
        'ThisisbeingdealtwithbyLGGermany',
        'SuperSignMediaEditor',
        'SuperSignWB',
        'ALLSurgical',
        'Surgical',
        'SuperSignCMS',
        'diagnostic',
        'LGESLExportteamisfollowupthelead',
        'Video',
        'AllMedicalDisplays',
        'DigitalX-rayDetectors',
        'SurgicalMonitors',
        'DiagnosticMonitors',
        'ClinicalReviewMonitors',
        'DiagnosticMonitor',
        'InquiryforwardedtoShaker',
        'LGUltraWide™',
        'LGMediaStudio',
        'AI/MachineLearning|Antennas,TransmittersandTowers|AudienceMeasurement|CamerasandLenses',
        'VerticalBrochure',
        'WhitePaper'
    ]
    # 'product_modelname' 열에서 해당 값들을 NaN으로 대체
    df['product_modelname'].replace(data_to_replace, np.nan, inplace=True)


preprocess_product_modelname(data)
preprocess_product_modelname(submission_test)

  df['product_modelname'] = df['product_modelname'].str.replace(r'\([^)]*\)', '')


## [expected timeline 수정]

In [13]:
""" expected timeline에서 긍/부정 요청사항 파악 후 sentimental 열 추가"""

nlp = spacy.load("en_core_web_lg")

analyzer = SentimentIntensityAnalyzer()

def sentiment_analysis(text):
    doc = nlp(text)

    spacy_score = sum([token.sentiment for token in doc]) / len(doc)

    vader_score = analyzer.polarity_scores(text)['compound']

    final_score = (spacy_score + vader_score) / 2

    if final_score >= 0.05: 
        return "Positive"
    elif final_score <= -0.05: 
        return "Negative"
    else:
        return "Neutral"


tmp = data['expected_timeline'].dropna().unique()
data['sentimental']=None
for text in tmp:
    indices = data[data['expected_timeline'] == text].index
    result = sentiment_analysis(text)
    data.loc[indices, 'sentimental'] = result

tmp = submission_test['expected_timeline'].dropna().unique()
submission_test['sentimental']=None
for text in tmp:
    indices = submission_test[submission_test['expected_timeline'] == text].index
    result = sentiment_analysis(text)
    submission_test.loc[indices, 'sentimental'] = result


In [14]:
""" expected timeline에서 동일한 뜻을 가진 문장 합치기 """
def preprocess_expected_timeline(df):
    replacements = {
        'less than 3 months': 'less than 3 months',
        'less_than_3_months': 'less than 3 months',
        'less than 3 months ,meeting with the customer for the more details and tentative boq will ne 32 and 43': 'less than 3 months',
        'less than 3 months- outdoor led requiment': 'less than 3 months',
        'less than 3 months. customer not answered . to call back': 'less than 3 months',
        'duplicate lead - il220100042906. less than 3 months': 'less than 3 months',
        '3 months ~ 6 months': '3 months ~ 6 months',
        '3_months_~_6_months': '3 months ~ 6 months',
        '9 months ~ 1 year': '9 months ~ 1 year',
        '9_months_~_1_year': '9 months ~ 1 year',
        '9 months - 1 year': '9 months ~ 1 year',
        'more than a year': 'more than a year',
        'more_than_a_year': 'more than a year',
        '6 months ~ 9 months': '6 months ~ 9 months',
        '6_months_~_9_months': '6 months ~ 9 months',
        'less than 6 months': 'less than 6 months',
        'less then 6 months': 'less than 6 months'
    }

    df['expected_timeline'].replace(replacements, inplace=True)

preprocess_expected_timeline(data)
preprocess_expected_timeline(submission_test)

## [customer_job, inquiry_type, customer_type, product_category, customer_position 수정]

In [15]:
""" 의미가 비슷한 문장 합치기 """
def similar(a, b):
    a_str = str(a) if isinstance(a, (str, float)) else ""
    b_str = str(b) if isinstance(b, (str, float)) else ""

    return SequenceMatcher(None, a_str, b_str).ratio()

def find_and_replace_similar(df, threshold=0.8):
    tmp =['customer_job','inquiry_type','customer_type','product_category','customer_position']
    for column in tmp:
        unique_values = df[column].unique()
        for i, value1 in enumerate(unique_values):
            for value2 in unique_values[i+1:]:
                similarity_score = similar(value1, value2)
                if similarity_score >= threshold:
                    df[column].replace(value2, value1, inplace=True)

find_and_replace_similar(data, threshold=0.8)
find_and_replace_similar(submission_test, threshold=0.8)

## [범주형 변수 => 수치형 변수]

In [16]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series


# 레이블 인코딩할 칼럼들
label_columns = [
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    'country_level1', 'country_level2', 'country_level3',
    "customer_position",
    "response_corporate",
    "expected_timeline",
    "customer_country",
    "sentimental"
]

df_all = pd.concat([data[label_columns], submission_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

for col in label_columns:
    data[col] = df_all.iloc[: len(data)][col]
    submission_test[col] = df_all.iloc[len(submission_test) :][col]


## [IterativeImputation 진행]

In [17]:
data_x = data.drop("is_converted",axis=1)
submission_x = submission_test.drop(["is_converted","id"],axis=1)
labels_array = np.array(data_x.columns)
imputer_mice = IterativeImputer(random_state=83)
data_tmp = imputer_mice.fit_transform(data_x)
submission_test_tmp = imputer_mice.fit_transform(submission_x)

In [18]:
data_2 = pd.DataFrame(data_tmp, columns=labels_array)
submission_test_2 = pd.DataFrame(submission_test_tmp, columns=labels_array)

data_2['is_converted']=data['is_converted']
submission_test_2['is_converted']=submission_test['is_converted']

## [RandomUnderSampling 진행]

In [19]:
X =data_2.drop(columns=['is_converted'])
y = data_2['is_converted']
X, y = RandomUnderSampler(random_state=2021).fit_resample(X, y)

## [MinMax Scaling 진행]

In [20]:
scale = MinMaxScaler()
X = scale.fit_transform(X)
test = submission_test_2.drop(["is_converted"],axis=1)
test = scale.transform(test)


# 3. 모델 훈련

## [모델 선언]

In [21]:
rf = RandomForestClassifier(random_state = 42
                         , n_estimators = 305
                         , criterion = 'gini'
                         , max_depth = 62
                         , min_samples_split = 7
                         , min_samples_leaf = 1)
dt = DecisionTreeClassifier(random_state = 42
                         , criterion = 'entropy'
                         , max_depth = 25
                         , min_samples_split = 2
                         , min_samples_leaf = 1)
et = ExtraTreesClassifier(random_state = 42
                         , n_estimators = 930
                         , criterion = 'entropy'
                         , max_depth = 65
                         , min_samples_split = 6
                         , min_samples_leaf = 1
                         )
xgb = XGBClassifier(random_state = 42
                   , n_estimators = 665
                   , reg_lambda = 0.04614513317156364
                   , reg_alpha = 0.8831857977740336
                   , tree_method = "exact"
                   , colsample_bytree = 0.7664006730032823
                   , subsample = 0.6579847353498132
                   , learning_rate = 0.4046062291148477
                   , max_depth = 64
                   , min_child_weight = 2
                   )
lgbm = LGBMClassifier(random_state=42)
params = {
          'learning_rate': 0.4,
          'eval_metric':'F1',
          'early_stopping_rounds':50,
          'random_seed': 42,
          'verbose':200}
cat = CatBoostClassifier(**params)

## [모델 Stacking]

In [22]:
estimators = [('et',et), ('xgb',xgb), ('dt',dt), ('rf',rf),('cat',cat),('lgbm',lgbm)]
stack = StackingClassifier(estimators, final_estimator=LogisticRegression(), verbose=1)
stack.fit(X,y)


0:	learn: 0.8467989	total: 51ms	remaining: 51s
200:	learn: 0.9978366	total: 813ms	remaining: 3.23s
400:	learn: 0.9993817	total: 1.55s	remaining: 2.32s
600:	learn: 0.9993818	total: 2.23s	remaining: 1.48s
800:	learn: 0.9993818	total: 2.88s	remaining: 715ms
999:	learn: 0.9993818	total: 3.53s	remaining: 0us
[LightGBM] [Info] Number of positive: 4850, number of negative: 4850
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002192 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3483
[LightGBM] [Info] Number of data points in the train set: 9700, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
0:	learn: 0.8421602	total: 4.65ms	remaining: 4.64s
200:	learn: 0.9989691	total: 692ms	remaining: 2.75s
400:	learn: 0.9996136	total: 1.35s	remaining: 2.02s
600:	learn: 0.9996136	total: 1.95s	remai

# 4. 모델 예측

In [23]:
pred = stack.predict(test)
my_series = pd.Series(pred)
value_counts_result = my_series.value_counts()
value_counts_result

False    2822
True     2449
dtype: int64

# 5. 결과 저장

In [24]:
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = pred
# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)