In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,f1_score
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [3]:
train.columns

Index(['ID', '가입일', '음성사서함이용', '주간통화시간', '주간통화횟수', '주간통화요금', '저녁통화시간',
       '저녁통화횟수', '저녁통화요금', '밤통화시간', '밤통화횟수', '밤통화요금', '상담전화건수', '전화해지여부'],
      dtype='object')

In [4]:
train.set_index('ID', inplace=True)
test.set_index('ID', inplace=True)

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12943 entries, TEST_00000 to TEST_12942
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   가입일      12943 non-null  int64  
 1   음성사서함이용  12943 non-null  int64  
 2   주간통화시간   12943 non-null  float64
 3   주간통화횟수   12943 non-null  int64  
 4   주간통화요금   12943 non-null  float64
 5   저녁통화시간   12943 non-null  float64
 6   저녁통화횟수   12943 non-null  int64  
 7   저녁통화요금   12943 non-null  float64
 8   밤통화시간    12943 non-null  float64
 9   밤통화횟수    12943 non-null  int64  
 10  밤통화요금    12943 non-null  float64
 11  상담전화건수   12943 non-null  int64  
dtypes: float64(6), int64(6)
memory usage: 1.3+ MB


In [6]:
feat = [
    '가입일', '음성사서함이용', '주간통화시간', '주간통화횟수', 
    '주간통화요금', '저녁통화시간', '저녁통화횟수', '저녁통화요금', 
    '밤통화시간', '밤통화횟수', '밤통화요금', '상담전화건수'
]
label = '전화해지여부'
X_train, X_test, y_train, y_test = train_test_split(train[feat], 
                                                    train[label], 
                                                    test_size=0.3, 
                                                    random_state=42)

## 훈련/검증 데이터 분할

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((21140, 12), (9060, 12), (21140,), (9060,))

## 데이터 표준화

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## 모델구성

In [9]:
model = Sequential()
model.add(Dense(64, input_dim=12, activation='relu'))
# model.add(Dense(256, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

## 모델 컴파일

In [10]:
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1_score])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## 조기 종료 콜백

In [11]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10, mode='min')

## 클래스 가중치 설정

In [12]:
class_weight = {0: 1, 1: 1}

## 모델 학습

In [32]:
model.fit(X_train, y_train, 
          epochs=100, batch_size=16, 
          validation_data=(X_test, y_test),
          callbacks=[early_stopping],
#           class_weight=class_weight
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100


<keras.callbacks.History at 0x29039330788>

In [33]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Loss:', loss)
print('Accuracy:', accuracy)

Loss: 0.3000923693180084
Accuracy: 0.8900662064552307


In [34]:
y_pred = model.predict(X_test)



In [35]:
print(classification_report(y_test, (y_pred > 0.6)))

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      8117
           1       0.50      0.22      0.31       943

    accuracy                           0.90      9060
   macro avg       0.71      0.60      0.63      9060
weighted avg       0.87      0.90      0.88      9060



In [165]:
pred.sum()

1910.1208

## 예측

In [151]:
pred = model.predict(test)



In [156]:
submission[label] = np.int8(pred > 0.6)

In [93]:
submission

Unnamed: 0,ID,전화해지여부
0,TEST_00000,0
1,TEST_00001,0
2,TEST_00002,0
3,TEST_00003,0
4,TEST_00004,0
...,...,...
12938,TEST_12938,0
12939,TEST_12939,0
12940,TEST_12940,0
12941,TEST_12941,0


In [158]:
submission.to_csv('data/submission_model01.csv', index=False)

In [157]:
submission[label].value_counts()

0    11042
1     1901
Name: 전화해지여부, dtype: int64