In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier

### 데이터 불러오기

In [2]:
x = pd.read_csv('./data/X_train.csv', encoding='euc-kr')
y = pd.read_csv('./data/y_train.csv', encoding='euc-kr')
validation = pd.read_csv('./data/X_test.csv', encoding='euc-kr')

In [3]:
#X_train.info()

### 결측치 처리

In [4]:
# x.isnull().sum()

In [5]:
x['환불금액'] = x['환불금액'].fillna(0)
validation['환불금액'] = validation['환불금액'].fillna(0)
x = x.drop('cust_id', axis=1)
validation_id = validation['cust_id']
validation = validation.drop('cust_id', axis=1)
y = y['gender']

### 명목형 데이터 수정

In [6]:
x[['주구매상품','주구매지점']] = x[['주구매상품','주구매지점']].apply(LabelEncoder().fit_transform)
validation[['주구매상품','주구매지점']] = validation[['주구매상품','주구매지점']].apply(LabelEncoder().fit_transform)

### 정규화

In [7]:
sc = StandardScaler()
x_sc = sc.fit_transform(x)
val_sc = sc.fit_transform(validation)

### 데이터 분할

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x_sc, y, test_size=0.3, random_state=60, stratify=y)

### 모델학습

In [9]:
lr = LogisticRegression().fit(X_train, y_train)

### 검증

In [10]:
probs_lr = lr.predict_proba(X_test)[:,1]
roc_auc_score(y_test, probs_lr)

0.6311759590298579

### 결과 저장

In [11]:
result = pd.concat([validation_id, pd.DataFrame(lr.predict_proba(val_sc)[:, 1])], axis=1)
result.columns = ['cust_id','gener']

In [12]:
result.head(5)

Unnamed: 0,cust_id,gener
0,3500,0.499603
1,3501,0.164047
2,3502,0.118909
3,3503,0.351902
4,3504,0.405498


In [13]:
result.to_csv('20211026.csv', index = False)