# Admission 예제

In [13]:
# Logistic Regression에 대해서도 같은 과정으로
# Admission 예제를 이용해서 구현해 보세요!

# 1. Seed 처리(42)
# 2. 데이터 처리
#    기본적인 데이터 처리
#    - 결측치, 이상치, 데이터분리, 정규화, 범주처리
#      추가적으로 데이터 불균형처리(방법을 찾아서 해결)
# 3. Model 구현(sklearn, TensorFlow, PyTorch)
# 4. 성능평가 => accuracy, F1 score
#    => accuracy => 0.75 , F1 score => 0.44 정도가 나와요!
#
# ===> moon9342@gmail.com 로 제출해주세요!
# 메일로 구현 내용을 제출 => ipynb을 제출!
# 메일안에 간단하게 accuracy가 얼마, F1이 얼마인지 포함해서 제출

In [14]:
# 필요한 module import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# sklearn
from sklearn import linear_model  # LinearRegression 모델이 이 안에 들어 있어요!
from sklearn.preprocessing import StandardScaler # 정규화(Normalization) - 표준화
from sklearn.model_selection import train_test_split # 데이터 분리

# scipy
from scipy import stats  # zscore()가 제공되요! -> 이상치 처리할때 사용할꺼예요!

# TensorFlow
import tensorflow as tf   # random seed 때문에 필요!
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim

# Random Seed를 위해서 포함
import os
import random

In [15]:
def seed_everything(seed=42):

    # 1. 환경변수
    os.environ['PYTHONHASHSEED'] = str(seed)

    # 2. Python, Numpy
    random.seed(seed)
    np.random.seed(seed)

    # 3. TensorFlow
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    tf.random.set_seed(seed)

    # 4. PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)  # PyTorch 1.8+ 권장

    print(f"Seeds set to {seed}")

seed_everything(42)

Seeds set to 42


## 데이터 전처리

In [28]:
# csv데이터 로딩
df = pd.read_csv('./data/admission.csv')

# data 결측치 확인
print(df.isnull().sum())

# 이상치 확인
zscore_threshold = 2.0
mask = np.abs(stats.zscore(df)) <= zscore_threshold
train_data_set = df.loc[mask]
train_data_set

# 범주처리
df['rank'] = 5 - df['rank']

# 데이터 분리
x_data = train_data_set.drop('admit', axis=1).values
y_data = train_data_set['admit'].values.reshape(-1,1)

x_train, x_val, y_train, y_val = train_test_split(x_data,y_data,test_size=0.3,random_state=42)

# 정규화
scaler_x = StandardScaler()

scaler_x.fit(x_train)

x_train_norm = scaler_x.transform(x_train)
x_val_norm = scaler_x.transform(x_val)

y_train_norm = y_train
y_val_norm = y_val

# 불균형 처리
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_train_res, y_train_res = smote.fit_resample(x_train_norm, y_train_norm)

admit    0
gre      0
gpa      0
rank     0
dtype: int64


## sklearn 구현

In [34]:
# 모델 생성
sklearn_model = linear_model.LogisticRegression(max_iter=3000)

# 모델 학습
sklearn_model.fit(x_train_res,
                  y_train_res)

# 예측
sklearn_y_pred = sklearn_model.predict(x_val_norm)
print(sklearn_y_pred)

[1 1 0 1 0 1 0 1 0 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 0 1 1 0 1 1
 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 1 0 1 0 1 0 0 0 0 1
 0 0 1 0 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0 0 0 1 1 0 1 1 1 1 0 0 0 0 0 1 0 1 1
 1 1 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1
 1 1 0 1 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 0 0 0 1 1 1 1 1
 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 1 1 1 1 0 0 1
 1 0 1 1 1 1 0 1 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 0 1 1 1 0
 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1
 0 0 0 0 1 0 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1
 1 1 0 1 1 1 1 0 1 0 1 1 1 0 0 0 0 0 1 0 1 1 1 1 0 1 0 1 0 0 1 1 1 1 1 0 0
 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 1 0 1 1
 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 0 0 0 1
 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 1 1 0 0 0 0 0]


In [36]:
# 성능평가
from sklearn.metrics import accuracy_score, f1_score

acc_sklearn = accuracy_score(y_val, sklearn_y_pred)
f1_sklearn = f1_score(y_val, sklearn_y_pred)

print(f"Accuracy: {acc_sklearn:.2f}")
print(f"F1-score: {f1_sklearn:.2f}")

'''
SMOTE 사용 X
Accuracy: 0.71
F1-score: 0.37
---------------
SMOTE 사용 O
Accuracy: 0.65
F1-score: 0.56
'''

Accuracy: 0.65
F1-score: 0.56


'\nSMOTE 사용 X\nAccuracy: 0.71\nF1-score: 0.37\n---------------\nSMOTE 사용 O\nAccuracy: 0.65\nF1-score: 0.56\n'

## Tensorflow

In [31]:
# 모델 생성
keras_model = Sequential()

# Layer 생성 모델 추가
keras_model.add(Input(shape=(3,)))
keras_model.add(Dense(units=1, activation='sigmoid'))

# 모델 compile & 설정
keras_model.compile(optimizer=Adam(learning_rate=1e-3),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

# 부가기능
log_dir = './logs/' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
tb_cb = TensorBoard(log_dir=log_dir,
                    histogram_freq=1)

# Model 학습
keras_model.fit(x_train_res,
                y_train_res,
                epochs=1000,
                callbacks=[tb_cb],
                verbose=0,
                validation_data=(x_val_norm, y_val))

# 모델 확인
keras_model.summary()

In [37]:
# 예측
keras_y_pred = keras_model.predict(x_val_norm)

keras_y_pred_label = (keras_y_pred > 0.5).astype(int)

print(keras_y_pred_label)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[[1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 

In [None]:
# 성능평가
from sklearn.metrics import accuracy_score, f1_score, classification_report

acc_keras = accuracy_score(y_val, keras_y_pred_label)
f1_keras = f1_score(y_val, keras_y_pred_label)

print(f"Accuracy: {acc_keras:.2f}")
print(f"F1-score: {f1_keras:.2f}")

'''
SMOTE 사용 X
Accuracy: 0.71
F1-score: 0.37
---------------
SMOTE 사용 O
Accuracy: 0.65
F1-score: 0.56
'''

Accuracy: 0.65
F1-score: 0.56


## PyTorch

In [40]:
# CPU 사용
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tensor 생성
x_tensor_train = torch.FloatTensor(x_train_res).to(device)
y_tensor_train = torch.FloatTensor(y_train_res).to(device)
y_tensor_train = y_tensor_train.view(-1, 1)

x_tensor_valid = torch.FloatTensor(x_val_norm).to(device)
y_tensor_valid = torch.FloatTensor(y_val_norm).to(device)

# class 정의
class LogisticRegressionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(3,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        return self.sigmoid(self.linear(x))
    
# 모델 생성
torch_model = LogisticRegressionModel().to(device)

# 손실함수
criterion = nn.BCELoss()
optimizer = optim.Adam(torch_model.parameters(), lr=1e-3)

# 학습진행
epochs=20000

for epoch in range(epochs):
    # 1. 예측값 도출
    y_pred = torch_model(x_tensor_train)
    # 2. Loss 계산
    loss = criterion(y_pred, y_tensor_train)
    # 3. 가중치 계산
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [41]:
# 예측
with torch.no_grad():
    torch_y_pred = torch_model(x_tensor_valid)
    torch_y_pred_label = (torch_y_pred > 0.5).float()


print(torch_y_pred_label)

tensor([[1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
      

In [None]:
# 성능평가
# tensor → numpy 변환
y_true = y_tensor_valid.cpu().numpy()
y_pred = torch_y_pred_label.cpu().numpy()

# 평가
acc_torch = accuracy_score(y_true, y_pred)
f1_torch = f1_score(y_true, y_pred)

print(f"Accuracy: {acc_torch:.2f}")
print(f"F1-score: {f1_torch:.2f}")

'''
SMOTE 사용 X
Accuracy: 0.71
F1-score: 0.37
---------------
SMOTE 사용 O
Accuracy: 0.65
F1-score: 0.56
'''

Accuracy: 0.65
F1-score: 0.56
