# All in one

### Import

In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import copy
import os
import random

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, confusion_matrix

### Train_Val_Test split

In [None]:
# Load the two preprocessed tables used by every later cell.
# `data` holds the per-row feature table; later cells use its '배정번호' column
# as the cluster id and treat columns[1:] as the feature columns.
# `quality` holds one record per cluster id; later cells read its
# '배정번호', '불량단계' and '위험군개수' columns.
data = pd.read_csv("./preprocess_data.csv") 
quality = pd.read_csv("./preprocess_quality.csv")

In [None]:
# 60 / 20 / 20 split of the cluster-level table, stratified on '불량단계'.
# train_test_split applies the same shuffled indices to every array it is
# given, so passing `quality` once yields the same partition as passing it
# twice and throwing away the second pair of outputs.
train_qual, val_test_qual = train_test_split(
    quality,
    test_size=0.4,
    stratify=quality['불량단계'],
    random_state=7,
)
val_qual, test_qual = train_test_split(
    val_test_qual,
    test_size=0.5,
    stratify=val_test_qual['불량단계'],
    random_state=19,
)

# Re-order `quality` as [train | val | test]; later cells recover the three
# subsets from this frame purely by position.
quality = pd.concat([train_qual, val_qual, test_qual]).reset_index(drop=True)


### PCA

In [None]:
total_len = len(data.columns[1:])
selected_len = 19
pca = PCA(n_components=selected_len)

feature_columns = data.columns[1:]

# Fit the PCA on rows belonging to training clusters only, so validation and
# test rows do not influence the projection.
# A single boolean selection replaces the previous per-cluster pd.concat loop,
# which re-copied the growing frame on every iteration and started from an
# empty object-dtype frame. Row order differs from the loop version, which does
# not matter for fitting a PCA.
# NOTE(review): assumes every id occurs once in train_qual['배정번호']; the old
# loop would have appended a cluster's rows once per occurrence.
train_data = data[data['배정번호'].isin(train_qual['배정번호'])].reset_index(drop=True)

pca.fit(train_data.loc[:, feature_columns])

# Overwrite the first `selected_len` feature columns with the principal
# components; the remaining original feature columns are left in place and are
# simply not read by the later cells.
data.loc[:, data.columns[1:1+selected_len]] = pca.transform(data.loc[:, feature_columns])
# Full slice kept from the original cell; it selects every row and column.
data = data.iloc[:, :]

In [None]:
def _register_cluster_tensors(split_quality):
    """Publish per-cluster inputs as the globals ``torch_data<id>`` / ``num<id>``.

    For every cluster id in ``split_quality['배정번호']`` this stores:

    * ``torch_data<id>`` - float32 tensor of that cluster's rows from ``data``,
      restricted to the first ``selected_len`` feature columns;
    * ``num<id>`` - the first '위험군개수' value recorded for that id in
      ``split_quality``.

    The names are written through ``globals()`` instead of ``exec`` on
    generated source; the resulting variables are the same, so cells that
    look them up by name keep working.
    """
    namespace = globals()
    feature_cols = data.columns[1:1+selected_len]
    for cluster_idx in tqdm(split_quality['배정번호']):
        rows = data.loc[data['배정번호'] == cluster_idx, feature_cols]
        namespace[f"torch_data{cluster_idx}"] = torch.tensor(np.array(rows)).type(torch.float32)
        matching = split_quality[split_quality['배정번호'] == cluster_idx]
        namespace[f"num{cluster_idx}"] = matching['위험군개수'].iloc[0]


# `quality` is ordered [train | val | test]; the three subsets are recovered by
# position. Floor division on the negated length rounds the tail sizes up.
batch_quality = quality.iloc[:-len(quality)*2//5]
_register_cluster_tensors(batch_quality)

batch_quality_val = quality.iloc[-len(quality)*2//5:-len(quality)//5]
_register_cluster_tensors(batch_quality_val)

batch_quality_test = quality.iloc[-len(quality)//5:]
_register_cluster_tensors(batch_quality_test)

### Loss Function

In [None]:
hidden_num = 64

def cohesion(group1_data):
    """Return the mean Euclidean distance of the rows to their centroid.

    Args:
        group1_data: 2-D tensor, one latent vector per row.

    Returns:
        0-dim tensor. For an empty group the result is a zero that is still
        attached to ``group1_data``'s autograd graph; previously the mean over
        zero rows produced NaN, which would turn the whole summed loss (and
        every gradient) into NaN.
    """
    if len(group1_data) == 0:
        return group1_data.sum() * 0.0
    # reshape(1, -1) keeps the centroid a single row whatever the feature
    # width is, instead of relying on the module-level hidden_num.
    centroid = group1_data.mean(0).reshape(1, -1)
    return torch.cdist(group1_data, centroid).mean()

def classify_loss(group1_logit_data, group2_logit_data, num):
    """Count-aware log-likelihood term for the two predicted groups.

    Args:
        group1_logit_data: sigmoid outputs of the rows assigned to group 1
            (callers pass the outputs above 0.5).
        group2_logit_data: sigmoid outputs of the rows assigned to group 2
            (callers pass the outputs at or below 0.5).
        num: reference count that the size of group 2 is compared against.

    Returns:
        0-dim tensor equal to ``sign * objective * count_gap * penalty`` where

        * ``objective`` = ``-mean(log(group1)) + mean(log(1 - group2))``, with
          the term of an empty group left out;
        * ``sign`` is +1 when group 2 holds at least ``num`` rows, else -1;
        * ``count_gap`` = ``|len(group2) - num| / total``;
        * ``penalty`` is 20 when ``num / total`` reaches the ratio threshold
          but ``len(group2) / total`` does not, 10 in the opposite case, and 1
          when both agree.

        When both groups are empty there is nothing to score and a constant
        zero is returned (the ratios would otherwise divide by zero).
    """
    n_group1 = len(group1_logit_data)
    n_group2 = len(group2_logit_data)
    total = n_group1 + n_group2
    if total == 0:
        return torch.tensor(0.0)

    # Same ratio threshold that val_isNG / true_isNG apply.
    ratio_threshold = 0.0004525*10
    reference_flag = num / total >= ratio_threshold
    predicted_flag = n_group2 / total >= ratio_threshold
    if reference_flag and not predicted_flag:
        accuracy_penalty = 20
    elif predicted_flag and not reference_flag:
        accuracy_penalty = 10
    else:
        accuracy_penalty = 1

    count_gap = torch.abs(torch.tensor((n_group2 - num) / total))

    # The means are order-independent, so the former torch.sort calls were
    # dropped; an empty group contributes no term at all.
    objective = 0
    if n_group1:
        objective = objective - torch.log(group1_logit_data).mean()
    if n_group2:
        objective = objective + torch.log(1 - group2_logit_data).mean()

    sign = 1 if n_group2 >= num else -1
    return sign * objective * count_gap * accuracy_penalty


def clustering_loss(group1_data, group2_data, group1_logit_data, group2_logit_data, num, alpha, gamma):
    """Weighted sum of the group-1 cohesion term and the classification term.

    ``alpha`` scales ``cohesion(group1_data)`` and ``gamma`` scales
    ``classify_loss(group1_logit_data, group2_logit_data, num)``.
    ``group2_data`` is accepted for signature symmetry but is not read.
    """
    compactness_term = alpha * cohesion(group1_data)
    classification_term = gamma * classify_loss(group1_logit_data, group2_logit_data, num)
    return compactness_term + classification_term

### Validation

In [None]:
def val_isNG(group1_data, group2_data):
    """Return 1 when group 2's share of all rows reaches 0.0004525*10, else 0.

    Only the lengths of the two arguments are used.
    """
    total_rows = len(group1_data) + len(group2_data)
    group2_share = len(group2_data) / total_rows
    return int(group2_share >= 0.0004525*10)

def true_isNG(group1_data, group2_data, num):
    """Return 1 when ``num`` divided by the total row count reaches 0.0004525*10, else 0.

    The two group arguments are only used for their combined length.
    """
    total_rows = len(group1_data) + len(group2_data)
    reference_share = num / total_rows
    return int(reference_share >= 0.0004525*10)

### Model Architecture

In [None]:
class Cluster_model(nn.Module):
    """MLP that maps a feature row to a latent vector and a sigmoid score.

    Four Linear+ReLU stages (``fe``, ``hidden1``..``hidden3``) feed a linear
    ``latent`` layer; ``last`` followed by a sigmoid turns the latent vector
    into a single score. Layer widths come from the module-level
    ``selected_len`` (input) and ``hidden_num`` (everything else).
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as-is so parameter
        # initialisation and saved checkpoints stay compatible.
        self.fe = nn.Linear(selected_len, hidden_num)
        self.hidden1 = nn.Linear(hidden_num, hidden_num)
        self.hidden2 = nn.Linear(hidden_num, hidden_num)
        self.hidden3 = nn.Linear(hidden_num, hidden_num)
        self.latent = nn.Linear(hidden_num, hidden_num)
        self.relu = nn.ReLU()
        self.last = nn.Linear(hidden_num, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Return ``(f, score)``: the latent vector and its sigmoid score."""
        for stage in (self.fe, self.hidden1, self.hidden2, self.hidden3):
            x = self.relu(stage(x))
        f = self.latent(x)
        score = self.sig(self.last(f))
        return f, score

    

### Train

In [None]:
epochs = 3000
best_loss = 1000000  # not read in this cell; kept in case another cell uses it
best_f1_score = 0
best_model = None

device = 'cpu'
model = Cluster_model().to(device)
lr = 1e-4
optimizer = optim.Adam(model.parameters(), lr=lr)

# Number of training clusters whose losses are averaged into one optimizer
# step. Capped by the number of available clusters so random.sample cannot
# raise when fewer than 10 exist.
train_cluster_ids = np.array(batch_quality['배정번호'])
clusters_per_step = min(10, len(train_cluster_ids))

for epoch in tqdm(range(epochs)):
    model.train()
    optimizer.zero_grad()

    loss = 0
    for rand_i in random.sample(range(len(train_cluster_ids)), k=clusters_per_step):
        cluster_idx = train_cluster_ids[rand_i]

        # Per-cluster inputs are looked up by their generated global names.
        f, y_hat = model(globals()[f"torch_data{cluster_idx}"])

        # Boolean masks are built once per cluster and used directly as torch
        # index masks (no numpy round-trip).
        above_half = (y_hat > 0.5).reshape(-1)
        at_most_half = (y_hat <= 0.5).reshape(-1)
        loss += clustering_loss(
            f[above_half],
            f[at_most_half],
            y_hat[above_half],
            y_hat[at_most_half],
            globals()[f"num{cluster_idx}"],
            1,
            10000,
        )
    loss /= clusters_per_step

    loss.backward()
    optimizer.step()

    # Validate on every 10th epoch.
    if epoch % 10 == 9:
        val_loss = 0  # not read in this cell; kept from the original
        model.eval()
        pred_isNG_list = []
        true_isNG_list = []

        # Gradients are not needed for validation.
        with torch.no_grad():
            for cluster_idx in batch_quality_val['배정번호']:
                f, y_hat = model(globals()[f"torch_data{cluster_idx}"])
                group1 = f[(y_hat > 0.5).reshape(-1)]
                group2 = f[(y_hat <= 0.5).reshape(-1)]
                pred_isNG_list.append(val_isNG(group1, group2))
                true_isNG_list.append(true_isNG(group1, group2, globals()[f"num{cluster_idx}"]))

        val_f1_score = f1_score(true_isNG_list, pred_isNG_list, average='macro')
        # Strict comparison against an initial 0: a checkpoint is only written
        # once the validation macro-F1 exceeds 0.
        if best_f1_score < val_f1_score:
            best_f1_score = val_f1_score
            best_model = copy.deepcopy(model)
            torch.save(best_model,"best_model.pt")

            print(f"[epoch {epoch}] best_f1 : {best_f1_score}")
        

### Inference

In [None]:
# The checkpoint is a fully pickled module (written with torch.save(model)),
# so it must be loaded with weights_only=False; recent PyTorch releases default
# to weights_only=True, which rejects such files.
# NOTE: unpickling executes code - only load checkpoints produced by this notebook.
model = torch.load('best_model.pt', weights_only=False)
model.eval()


def _predict_row_labels(split_quality):
    """Return one list of 0/1 row labels per cluster id in ``split_quality``.

    A row is labelled 1 when its sigmoid score is at or below 0.5, else 0.
    Cluster inputs are read from the ``torch_data<id>`` globals.
    """
    labels_per_cluster = []
    with torch.no_grad():
        for cluster_idx in split_quality['배정번호']:
            _, y_hat = model(globals()[f"torch_data{cluster_idx}"])
            at_most_half = (y_hat <= 0.5).reshape(-1)
            labels_per_cluster.append(at_most_half.to(torch.int64).tolist())
    return labels_per_cluster


total_val_label = _predict_row_labels(batch_quality_val)
total_test_label = _predict_row_labels(batch_quality_test)
total_train_label = _predict_row_labels(batch_quality)

# Same [train | val | test] order as the rows of `quality`.
total_label = total_train_label + total_val_label + total_test_label

In [None]:
# The checkpoint is a fully pickled module (written with torch.save(model)),
# so it must be loaded with weights_only=False; recent PyTorch releases default
# to weights_only=True, which rejects such files.
# NOTE: unpickling executes code - only load checkpoints produced by this notebook.
model = torch.load('best_model.pt', weights_only=False)
model.eval()


def _cluster_level_flags(split_quality):
    """Return ``(true_flags, pred_flags)`` with one 0/1 entry per cluster id.

    ``pred_flags`` comes from val_isNG on the model's two predicted groups and
    ``true_flags`` from true_isNG using the stored ``num<id>`` value.
    """
    true_flags, pred_flags = [], []
    with torch.no_grad():
        for cluster_idx in split_quality['배정번호']:
            f, y_hat = model(globals()[f"torch_data{cluster_idx}"])
            group1 = f[(y_hat > 0.5).reshape(-1)]
            group2 = f[(y_hat <= 0.5).reshape(-1)]
            pred_flags.append(val_isNG(group1, group2))
            true_flags.append(true_isNG(group1, group2, globals()[f"num{cluster_idx}"]))
    return true_flags, pred_flags


true_isNG_list, pred_isNG_list = _cluster_level_flags(batch_quality_val)
val_f1_score = f1_score(true_isNG_list, pred_isNG_list, average='macro')

# The lists are rebound here, so the confusion matrix below describes the
# test subset only.
true_isNG_list, pred_isNG_list = _cluster_level_flags(batch_quality_test)
test_f1_score = f1_score(true_isNG_list, pred_isNG_list, average='macro')

print(val_f1_score, test_f1_score)

cm = confusion_matrix(true_isNG_list, pred_isNG_list)
sns.heatmap(cm, annot=True, cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Read the preprocessed feature table again into a separate frame: `data`
# had its leading feature columns overwritten by the PCA projection, while
# the exported labels are attached to the untouched values.
data1 = pd.read_csv("./preprocess_data.csv")

In [None]:
# Start with every row labelled 0, then write each cluster's predicted row
# labels into the '위험군' column. total_label[k] belongs to the k-th row of
# `quality`.
data1['위험군'] = 0
for position, cluster_idx in enumerate(quality['배정번호']):
    rows_of_cluster = data1['배정번호'] == cluster_idx
    cluster_labels = np.array(total_label[position]).reshape(-1, 1)
    data1.loc[rows_of_cluster, '위험군'] = cluster_labels

In [None]:
# Write the results next to the notebook, without the pandas index column:
# the cluster-level table in its [train | val | test] order, and the row-level
# table carrying the predicted '위험군' column.
quality.to_csv("final_quality.csv",index=False)
data1.to_csv("final_data.csv",index=False)