In [2]:
import pandas as pd
from jinja2 import optimizer

df = pd.read_csv("user_fake_authentic_2class.csv")

# 'class' 열의 문자열 값을 숫자로 변환 (예: 'f' -> 0, 'r' -> 1)
df['class'] = df['class'].apply(lambda x: 0 if x == 'f' else 1)

display(df.head())
#사이킬런 파이썬 라이브러리
#머신러닝 관련 lib. 여기에 kford: t, v , t 데이터 나눠주는게 있음
#판다스로 로드에서 받은거 알려줌.
#레이블 비율 똑같이

Unnamed: 0,pos,flw,flg,bl,pic,lin,cl,cz,ni,erl,erc,lt,hc,pr,fo,cs,pi,class
0,44,48,325,33,1,0,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.094985,0
1,10,66,321,150,1,0,213,0.0,1.0,14.39,1.97,0.0,1.5,0.0,0.0,0.206826,230.412857,0
2,33,970,308,101,1,1,436,0.0,1.0,10.1,0.3,0.0,2.5,0.0,0.056,0.572174,43.569939,0
3,70,86,360,14,1,0,0,1.0,0.0,0.78,0.06,0.0,0.0,0.0,0.0,1.0,5.859799,0
4,3,21,285,73,1,0,93,0.0,0.0,14.29,0.0,0.667,0.0,0.0,0.0,0.300494,0.126019,0


In [3]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

train_valid, test = train_test_split(df, test_size=0.2,stratify=df['class'] ,random_state=42)

train, valid = train_test_split(train_valid, test_size=0.25, stratify=train_valid['class'], random_state=42)

# 각 데이터 세트의 class 비율 확인
print("Train class distribution:")
print(train['class'].value_counts(normalize=True))

print("\nValidation class distribution:")
print(valid['class'].value_counts(normalize=True))

print("\nTest class distribution:")
print(test['class'].value_counts(normalize=True))

# Y is target variable.

# Train data
X_train, y_train = train.drop(columns=['class']).values, train['class'].values
# Valid data
X_valid, y_valid = valid.drop(columns=['class']).values, valid['class'].values
# Test data
X_test, y_test = test.drop(columns=['class']).values, test['class'].values

# Numpy 배열을 PyTorch 텐서로 변환 for DL input
X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long)
X_valid, y_valid = torch.tensor(X_valid, dtype=torch.float32), torch.tensor(y_valid, dtype=torch.long)
X_test, y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long)

# DataLoader 생성
train_dataset = TensorDataset(X_train, y_train)
valid_dataset = TensorDataset(X_valid, y_valid)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Train class distribution:
class
0    0.5031
1    0.4969
Name: proportion, dtype: float64

Validation class distribution:
class
0    0.5031
1    0.4969
Name: proportion, dtype: float64

Test class distribution:
class
0    0.503138
1    0.496862
Name: proportion, dtype: float64


In [4]:
import torch.nn as nn

class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleMLP, self).__init__()
        # 6개의 hidden layer
        self.fc1 = nn.Linear(17, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 32)
        self.fc6 = nn.Linear(32, 2)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out
    
# 파라미터 설정
input_size = X_train.shape[1] #17
hidden_size = input_size-1 #TODO : 적절한 히든레이어 사이즈 경험적으로 만들기
num_classes = len(set(y_train.numpy())) # 2

print(input_size, hidden_size, num_classes)
learning_rate = 0.001

# 모델 초기화
model = SimpleMLP(input_size, hidden_size,num_classes) #



17 16 2


In [5]:
# 손실 함수와 옵티마이저 정의
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# 학습 설정
num_epochs = 50

# 모델 학습
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        # 순전파
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # 역전파 및 옵티마이저 단계
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    
    # 검증 세트에서 모델 평가
    model.eval()
    valid_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            valid_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], '
          f'Train Loss: {running_loss/len(train_loader):.4f}, '
          f'Validation Loss: {valid_loss/len(valid_loader):.4f}, '
          f'Validation Accuracy: {correct/total:.4f}')
          


Epoch [1/50], Train Loss: 24.5750, Validation Loss: 17.0286, Validation Accuracy: 0.6909
Epoch [2/50], Train Loss: 8.0925, Validation Loss: 16.1712, Validation Accuracy: 0.7620
Epoch [3/50], Train Loss: 6.6311, Validation Loss: 9.7714, Validation Accuracy: 0.7634
Epoch [4/50], Train Loss: 4.9502, Validation Loss: 5.6504, Validation Accuracy: 0.7463
Epoch [5/50], Train Loss: 3.9413, Validation Loss: 4.2022, Validation Accuracy: 0.7815
Epoch [6/50], Train Loss: 2.6904, Validation Loss: 5.0573, Validation Accuracy: 0.7707
Epoch [7/50], Train Loss: 9.8639, Validation Loss: 49.2677, Validation Accuracy: 0.7592
Epoch [8/50], Train Loss: 4.2419, Validation Loss: 5.7025, Validation Accuracy: 0.7932
Epoch [9/50], Train Loss: 2.4583, Validation Loss: 7.8003, Validation Accuracy: 0.7276
Epoch [10/50], Train Loss: 2.5662, Validation Loss: 6.7104, Validation Accuracy: 0.6981
Epoch [11/50], Train Loss: 3.1271, Validation Loss: 6.9157, Validation Accuracy: 0.8036
Epoch [12/50], Train Loss: 11.0272, V

In [1]:
# 모델 평가
model.eval()
test_loss = 0.0
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Test Loss: {test_loss/len(test_loader):.4f}')
print(f'Test Accuracy: {correct/total:.4f}')

NameError: name 'model' is not defined