In [1]:
!pip install category-encoders



# 1. Library
패키지 import 및 warning 무시

In [2]:
import os
import random
import numpy as np
import pandas as pd
from category_encoders.target_encoder import TargetEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import warnings
warnings.filterwarnings('ignore')

# 2. Data
seed 설정 및 데이터 로드

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42)

# 데이터 불러오기
train_org = pd.read_csv('train.csv')
test_org = pd.read_csv('test.csv')
test_org['target'] = np.NaN

# 데이터 전처리
train_df = train_org.copy()
test_df = test_org.copy()

In [4]:
train_df

Unnamed: 0,user_id,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,subscription_type,customer_inquiry_history,payment_pattern,target
0,b919c29d,13,14,14.946163,8.427187,18,16,68.360455,3,4,Low,Basic,4,5,0
1,a0a60abb,16,18,18.453224,72.646087,16,13,97.567322,2,3,Medium,Basic,1,6,1
2,b9f171ae,22,1,16.195228,21.774492,13,14,94.358763,3,4,Medium,Premium,0,7,1
3,5dc0ba8b,1,19,17.628656,42.659066,19,18,70.153228,0,3,Low,Basic,1,0,1
4,65c83654,4,5,21.390656,30.744287,19,10,81.917908,2,4,Medium,Basic,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ae6b76bc,22,29,14.727623,84.053558,18,16,64.966803,2,5,Low,Premium,1,1,1
9996,24588752,10,11,19.374054,45.464833,9,8,82.750244,3,3,Medium,Basic,2,7,1
9997,e4622a54,7,27,18.240978,127.302411,24,14,81.567839,3,5,High,Basic,1,6,1
9998,e07fbad9,11,7,18.783800,5.297234,10,10,89.885656,4,5,Low,Basic,2,0,1


Index(['subscription_duration', 'recent_login_time', 'average_login_time',
       'average_time_per_learning_session', 'monthly_active_learning_days',
       'total_completed_courses', 'recent_learning_achievement',
       'abandoned_learning_sessions', 'community_engagement_level',
       'customer_inquiry_history', 'target'],
      dtype='object')

In [14]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows = 4, cols = 3,
                    subplot_titles = ("서비스 가입 기간","마지막으로 로그인한 시간","일반적인 로그인 시간",
                                      "학습 세션에 소요된 평균 시간","월간 학습 일수","완료한 총 코스 수",
                                      "최근 학습 성취도","중단된 학습 세션 수","커뮤니티 참여도",
                                      "고객 문의 이력"))
cols = train_df.drop(columns='payment_pattern').select_dtypes(exclude='object').columns
# Box Plot 추가
for i, col in enumerate(cols):
    row_num = i // 3 + 1
    col_num = i % 3 + 1

    box_plot = go.Box(y = train_df[col], name = f"{col} (target=1)")

    # 서브플롯에 Box Plot 추가
    fig.add_trace(box_plot, row = row_num, col = col_num)

# 레이아웃 업데이트
fig.update_layout(height = 800, showlegend=False)

# 그래프 표시
fig.show()


#### 파생변수를 생성해봤지만 성능이 떨어져서 주석 처리
시도한 파생변수 : "월간 로그인 횟수", "월간 학습 일수 * 평균 시간", "월당 학습시간"

In [6]:
############################################################################################################################################
################################################################파생변수 생성################################################################
# train_df['login_frequency'] = train_df['subscription_duration'] / train_df['monthly_active_learning_days']                      # 월간 로그인 횟수
# train_df['total_learning_time'] = train_df['average_time_per_learning_session'] * train_df['monthly_active_learning_days']      # 월간학습일수 * 평균 시간
# train_df['average_courses_completed_per_month'] = train_df['total_completed_courses'] / train_df['subscription_duration']       # 월당 학습시간
#
# test_df['login_frequency'] = test_df['subscription_duration'] / test_df['monthly_active_learning_days']                         # 월간 로그인 횟수
# test_df['total_learning_time'] = test_df['average_time_per_learning_session'] * test_df['monthly_active_learning_days']         # 월간학습일수 * 평균 시간
# test_df['average_courses_completed_per_month'] = test_df['total_completed_courses'] / test_df['subscription_duration']          # 월당 학습시간

# train_df.drop(columns=['subscription_duration', 'monthly_active_learning_days',
#                        'average_time_per_learning_session', 'total_completed_courses'], inplace=True)
# test_df.drop(columns=['subscription_duration', 'monthly_active_learning_days',
#                        'average_time_per_learning_session', 'total_completed_courses'], inplace=True)
############################################################################################################################################
############################################################################################################################################

# 3. Preprocess
Target Encoding 및 결측치 처리

In [7]:
categorical_features = list(train_df.dtypes[train_df.dtypes == "object"].index)
# Target Encoding
for i in categorical_features:
    le = TargetEncoder(cols=[i])
    train_df[i] = le.fit_transform(train_df[i], train_df['target'])
    test_df[i] = le.transform(test_df[i], test_df['target'])

# 결측치 처리
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

데이터 셋 및 데이터 로더 생성

In [8]:
# 데이터셋 및 데이터로더 생성
class CustomDataset(Dataset):
    def __init__(self, dataframe, target_column='target', transform=None):
        self.dataframe = dataframe
        self.target_column = target_column
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        features = torch.tensor(self.dataframe.iloc[idx].drop(columns=[self.target_column]).values, dtype=torch.float32)
        target = torch.tensor(self.dataframe.iloc[idx][self.target_column], dtype=torch.float32)

        if self.transform:
            features = self.transform(features)

        return {'features': features, 'target': target}

train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
train_loader = DataLoader(train_dataset, batch_size=10000, num_workers = 4, shuffle=True, pin_memory = True)
test_loader = DataLoader(test_dataset, batch_size=10000, num_workers = 4, shuffle=False, pin_memory = True)

# 4. Modeling
#### 모델 구성
선형 레이어 5개, 출력 레이어 2개로 구성, sigmoid 함수를 사용해서 결괏값은 0~1 사이의 값을 가짐.
손실 함수는 일반적으로 쓰이는 이진 분류 손실 함수인 BCEWithLoss() 써보았지만, 성능이 떨어짐
따라서 MSELoss()를 채택하게 되었는데, 선택해도 되는 이유는 출력 텐서의 형식이 proba 처럼 나오기
때문에 써도 될 거 같다고 생각함.


In [9]:
class CustomModel(nn.Module):
    def __init__(self, input_size):
        super(CustomModel, self).__init__()

        self.fc1 = nn.Linear(input_size, 16)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(16, 32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32, 64)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(64, 128)
        self.relu4 = nn.ReLU()
        self.fc5 = nn.Linear(128, 256)
        self.relu5 = nn.ReLU()
        self.output_layer1 = nn.Linear(256, 64)
        self.output_layer2 = nn.Linear(64, 1)

    def forward(self, x):
       # x = self.batch_norm(x)
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        x = self.relu4(self.fc4(x))
        x = self.relu5(self.fc5(x))
        x = self.output_layer1(x)
        x = self.output_layer2(x)
        x = torch.sigmoid(x)  # Sigmoid 활성화 함수 추가
        return x

input_size = len(train_df.columns)  # 입력 피처의 개수
model = CustomModel(input_size)
# criterion = nn.BCEWithLogitsLoss() # 일반적으로 쓰이는 이진 분류 손실 함수 : BCEWithLoss()
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0005)

#### 모델 생성
epoch은 가장 잘 나온 100으로 설정.
본 컴퓨터에 GPU가 없어서 CPU 사용, 한 번 반복할 때 마다 반복 횟수와 Loss 출력


In [10]:
if __name__ == '__main__':
    # 학습
    num_epochs = 100
    print_interval = 10
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            features, target = batch['features'], batch['target']

            optimizer.zero_grad()
            output = model(features)
            loss = criterion(output, target.view(-1, 1).float())  # MSELoss에 대한 타입 호환성 수정
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}')

    # 예측
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for batch in test_loader:
            features, _ = batch['features'], batch['target']  # test 데이터셋에는 target이 필요하지 않음
            output = model(features)
            binary_output = torch.round(output)
            all_predictions.append(binary_output.cpu().numpy())

Epoch 1/100, Train Loss: 0.2595924437046051
Epoch 2/100, Train Loss: 0.24651943147182465
Epoch 3/100, Train Loss: 0.23791223764419556
Epoch 4/100, Train Loss: 0.23323020339012146
Epoch 5/100, Train Loss: 0.23186364769935608
Epoch 6/100, Train Loss: 0.23281729221343994
Epoch 7/100, Train Loss: 0.23444369435310364
Epoch 8/100, Train Loss: 0.2354014664888382
Epoch 9/100, Train Loss: 0.23540471494197845
Epoch 10/100, Train Loss: 0.23472966253757477
Epoch 11/100, Train Loss: 0.23376765847206116
Epoch 12/100, Train Loss: 0.23283787071704865
Epoch 13/100, Train Loss: 0.23213432729244232
Epoch 14/100, Train Loss: 0.23172666132450104
Epoch 15/100, Train Loss: 0.2315901815891266
Epoch 16/100, Train Loss: 0.2316453605890274
Epoch 17/100, Train Loss: 0.23179389536380768
Epoch 18/100, Train Loss: 0.23195186257362366
Epoch 19/100, Train Loss: 0.2320619374513626
Epoch 20/100, Train Loss: 0.23209311068058014
Epoch 21/100, Train Loss: 0.2320394068956375
Epoch 22/100, Train Loss: 0.23191103339195251
Epo

# 5. Result

In [11]:
    # test 데이터셋에 대한 예측 결과를 하나의 배열로 통합
    all_predictions = np.concatenate(all_predictions)

    # 결과를 DataFrame으로 만들어 CSV 파일로 저장
    submission = pd.read_csv('sample_submission.csv')
    submission['target'] = all_predictions.astype(int)  # 정수형으로 변환하여 저장
    # submission.to_csv(f'submission_torchMLP_1204.csv', index=False)
    print(submission)

       user_id  target
0     0001d6e9       1
1     0002c77d       1
2     0002df5b       0
3     000b6068       1
4     00184a0c       0
...        ...     ...
9995  ffe2eba5       1
9996  ffe710f1       1
9997  ffeccdef       1
9998  fff3fcea       1
9999  fff4b04b       0

[10000 rows x 2 columns]
