In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

### 데이터 불러오기

In [2]:
# All three CSV files are read with the EUC-KR encoding, so the option is declared once.
csv_options = {'encoding': 'euc-kr'}

# Training features, training labels, and the frame that is scored for the submission file.
x = pd.read_csv('./data/X_train.csv', **csv_options)
y = pd.read_csv('./data/y_train.csv', **csv_options)
validation = pd.read_csv('./data/X_test.csv', **csv_options)

In [3]:
# x.info()

### 결측치 처리

In [4]:
# Per-column count of missing values in the training features.
x.isna().sum()

cust_id       0
총구매액          0
최대구매액         0
환불금액       2295
주구매상품         0
주구매지점         0
내점일수          0
내점당구매건수       0
주말방문비율        0
구매주기          0
dtype: int64

In [6]:
# Fill missing refund amounts (환불금액) with 0 in both frames.
# NOTE(review): this presumably means "no refund" — confirm with the data owner.
x['환불금액'] = x['환불금액'].fillna(0)
validation['환불금액'] = validation['환불금액'].fillna(0)

# cust_id is removed so it is not used as a model feature.
# errors='ignore' makes the cell safe to re-run: on a second run the column is
# already gone and a plain drop would raise KeyError.
x = x.drop(columns='cust_id', errors='ignore')
validation = validation.drop(columns='cust_id', errors='ignore')

# Keep only the target column. After the first run `y` is already a Series, so
# the guard turns a re-run into a no-op instead of a KeyError.
if isinstance(y, pd.DataFrame):
    y = y['gender']

### 명목형 데이터 수정

In [7]:
# For each nominal column, collect the sorted set of labels seen in either frame,
# so that both frames can later be encoded against the same vocabulary.
category, store = (
    np.union1d(x[column].unique(), validation[column].unique())
    for column in ('주구매상품', '주구매지점')
)

In [8]:
def encode_labels(values, vocabulary):
    """Replace each label in ``values`` with its position in ``vocabulary``.

    Parameters
    ----------
    values : pandas.Series
        Column of nominal labels to encode.
    vocabulary : sequence
        Unique labels; a label's index in this sequence becomes its integer code.

    Returns
    -------
    pandas.Series
        int64 codes with the same index as ``values``.

    Raises
    ------
    ValueError
        If ``values`` contains a label that is not in ``vocabulary`` (for
        example when this cell is re-run on columns that are already encoded).
    """
    # One dict lookup per row instead of scanning the whole vocabulary per row.
    lookup = {label: code for code, label in enumerate(vocabulary)}
    codes = values.map(lookup)
    # Series.map yields NaN for keys missing from the dict; fail loudly rather
    # than letting NaN flow into the scaler.
    if codes.isna().any():
        unknown = values[codes.isna()].unique().tolist()
        raise ValueError(f'labels missing from vocabulary: {unknown}')
    return codes.astype('int64')


# The same vocabulary is applied to both frames so a label maps to the same
# code in the training and submission data.
x['주구매상품'] = encode_labels(x['주구매상품'], category)
validation['주구매상품'] = encode_labels(validation['주구매상품'], category)

x['주구매지점'] = encode_labels(x['주구매지점'], store)
validation['주구매지점'] = encode_labels(validation['주구매지점'], store)

### 정규화

In [9]:
# Fit the scaler on the training features, then apply the same fitted scaler
# to both the training features and the submission frame.
# NOTE(review): the scaler is fit on all of `x` before the train/test split
# further down, so the held-out rows contribute to the scaling statistics.
sc = StandardScaler().fit(x)
x_sc, val_sc = (sc.transform(frame) for frame in (x, validation))

### 데이터 분할

In [10]:
# Hold out 30% of the scaled rows for evaluation; the labels are passed as the
# stratification target and the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_sc,
    y,
    test_size=0.3,
    random_state=60,
    stratify=y,
)

### 모델학습

In [15]:
# Fit a default-parameter logistic regression on the training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Score the held-out split with the second predict_proba column and report ROC AUC.
probs_lr = lr.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, probs_lr)

0.6311759590298579

### 검증

In [20]:
# `cust_id` was dropped from `validation` before scaling, so indexing
# validation['cust_id'] raises KeyError on a fresh run. The IDs are read back
# from the source file instead; `validation` was never filtered or reordered,
# so file order still matches the rows of `val_sc`.
submission_ids = pd.read_csv(
    './data/X_test.csv', encoding='euc-kr', usecols=['cust_id']
)['cust_id']

# Column 1 of predict_proba is the probability of lr.classes_[1].
# NOTE(review): presumably that is gender == 1 — confirm against the label encoding.
# The score column is named 'gender' to match the label column (was misspelled 'gener').
result = pd.DataFrame({
    'cust_id': submission_ids.to_numpy(),
    'gender': lr.predict_proba(val_sc)[:, 1],
})

In [21]:
# Write the submission table to disk without the DataFrame index column.
result.to_csv(path_or_buf='20211026.csv', index=False)