In [1]:
# 구글 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 패키지 import

In [2]:
# 필요한 패키지 임포트
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import math
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import torch
warnings.filterwarnings('ignore')

In [3]:
!pip install faiss-cpu
import faiss



# 1-1. Import Data

In [4]:
data = pd.read_csv('/content/drive/MyDrive/ChecKHUMate/merge_domitory_data.csv')

# 1-2. Data preprocessing

In [5]:
data = data.set_index('user_id')

In [6]:
# 데이터가 많이 부족한 alarm, activity, birth, student_id.1(상대방), major.1(상대방), smoke.1(상대방) 열 drop

data.drop(columns = ['alarm', 'activity', 'birth', 'student_id.1', 'major.1', 'smoke.1'])

Unnamed: 0_level_0,domitory,age,student_id,gender,major,bedtime,clean_duration,smoke,alcohol,mbti
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0.0,0,1,2,2,0,0.0,0,ENTJ
2,0,3.0,3,1,2,2,1,0.0,1,ISFP
3,0,3.0,3,1,2,2,0,1.0,1,ESTJ
4,0,0.0,0,1,2,2,1,0.0,0,ISFJ
5,0,3.0,3,0,0,2,0,0.0,0,ISFJ
...,...,...,...,...,...,...,...,...,...,...
129,0,4.0,4,1,3,3,1,0.0,0,ISFP
130,4,3.0,3,0,1,3,0,0.0,1,ISFJ
131,0,2.0,3,1,2,3,2,0.0,0,INFP
132,0,3.0,3,1,2,3,0,0.0,1,INTJ


In [7]:
# 범주형 데이터 to 수치형 벡터 (one-hot encoding)

data_one_hot = {
  'domitory' : [0, 1, 2, 3, 4],
  'age' : [0, 1, 2, 3, 4],
  'student_id' : [0, 1, 2, 3, 4],
  'gender' : [0, 1],
  'major' : [0, 1, 2, 3, 4, 5],
  'bedtime' : [0, 1, 2, 3, 4],
  'clean_duration' : [0, 1, 2],
  'smoke' : [0, 1],
  'mbti' : ['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP', 'ESTP', 'ESFP', 'ENFP', 'ENTP', 'ESTJ', 'ESFJ', 'ESFJ', 'ENFJ', 'ENTJ']
}

encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(data).toarray()

# 2. Data Filtering & Modeling

## Faiss 유사도

In [8]:
# 벡터의 차원
d = encoded_features.shape[1]

# L2 거리를 사용하는 인덱스 생성
index = faiss.IndexFlatL2(d)
# 인덱스에 데이터 추가
index.add(encoded_features.astype('float32'))

# 가장 가까운 이웃의 수 (임의 설정)
k = 5

# 쿼리 벡터에 가장 가까운 이웃 검색
distances, indices = index.search(encoded_features.astype('float32'), k)

In [9]:
print(indices[0])

[ 0  3  9 51 57]


## 사용자 가중치 별 sort 함수

In [10]:
import numpy as np

class FeatureSorter:
    def __init__(self, encoded_data):
        self.encoded_data = encoded_data
        self.num_features = encoded_data.shape[1]

    def sort_features_by_weight(self, weights):
        # 가중치 배열의 크기를 feature 개수에 맞게 조정
        weights = np.pad(weights, (0, self.num_features - len(weights)), mode='constant')
        # feature들의 가중치를 계산
        weighted_sum = np.dot(self.encoded_data, weights)
        # 가중치를 기반으로 feature들을 정렬한 인덱스를 반환
        sorted_indices = np.argsort(weighted_sum)[::-1]  # 내림차순으로 정렬
        return sorted_indices

In [11]:
# 예시로 사용자가 설정한 가중치
user_weights = np.array([0.8, 0.5, 0.6, 0.3, 0.9, 0.7, 0.4, 0.2])

# FeatureSorter 클래스를 이용하여 feature를 정렬
feature_sorter = FeatureSorter(encoded_features)
sorted_indices = feature_sorter.sort_features_by_weight(user_weights)

# 정렬된 feature들의 인덱스 출력
print("Sorted feature indices:", sorted_indices)

Sorted feature indices: [  0   3  15  10   9  39  52  32  53  57  33  82 114  22  23  24  25  40
  29  14  30 124  71  62 101  54  26 126  46 112  51  65  63  80  85  69
  76  74  60  59  45  58  20 130 127  31 129  43  41  27  35  37  36  98
  38  21  49  34   1   2   4   5   6   7  47   8  11  12  13  28  17  42
  18  19  16 132  66  73 110 115 121  86  84 123 125  77 122  72  68  61
 131 128  67  70  64  81  48  44  99  75  93 107 111 108 109 118 113 116
 117 119 120 105 106  94 104  88 102 100  97  96  87  92 103  90  50  56
  89  55  95  83  79  78  91]


### KNN

In [12]:
# KNN
class KNN():
    # 초기에 k 값을 지정
    def __init__(self,k=3):
        self.k=k

    # Train: Train데이터 모두 memory에 저장
    def train(self,X,y):
        self.X=X
        self.y=y

    # train data와의 distance 계산
    # - X_test:  (N, D)
    # - X_train: (M, D)
    # - dists:   (N, M) / test point별로 train point간의 거리행렬
    def get_distance(self, X_test, method="L2"):
        N = len(X_test)
        M = len(self.X)
        X_train_t = np.tile(self.X, (N,1))
        X_test_t = np.repeat(X_test, M, axis=0)

        if method=="L2":
            distance = np.sqrt(np.sum((X_train_t-X_test_t)**2,axis=1)).reshape(N,M)
        return distance

    # 다수결 투표를 통한 label 도출
    def predict(self, X_test):
        N = len(X_test)
        y_hat = np.zeros(N)

        distance = self.get_distance(X_test, method="L2")
        arg_dist = distance.argsort()
        for i in range(N):
            row = arg_dist[i]
            k_neighbor = self.y[row[:self.k]]
            target, cnt = np.unique(k_neighbor, return_counts=True)
            y_hat[i] = target[np.argmax(cnt)]
        return y_hat

### NCF

In [13]:
# NCF(Neural Collaborative Filtering)
class NCF(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.num_users = config["num_users"]
        self.num_items = config["num_items"]
        self.latent_dim_mf = config["latent_dim_mf"]
        self.latent_dim_mlp = config["latent_dim_mlp"]

        # Input
        self.embedding_user_mlp = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim_mlp)
        self.embedding_item_mlp = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim_mlp)
        self.embedding_user_mf = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim_mf)
        self.embedding_item_mf = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim_mf)

        # Layer
        self.fc_layers = torch.nn.ModuleList()
        for idx, (in_size, out_size) in enumerate(zip(config["layers"][:-1], config["layers"][1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        # Output
        self.last_layer = torch.nn.Linear(in_features=(config["layers"][-1] + self.latent_dim_mf), out_features=1)
        self.output_layer = torch.nn.Sigmoid()

    def forward(self, user_indices, item_indices):
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)
        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)

        # GMF
        gmf_layer = torch.mul(user_embedding_mf, item_embedding_mf)

        # MLP
        mlp_concat_layer = torch.cat([user_embedding_mlp, item_embedding_mlp], dim=-1)
        for idx in range(len(self.fc_layers)):
            mlp_concat_layer = self.fc_layers[idx](mlp_concat_layer)
            mlp_concat_layer = torch.nn.ReLU()(mlp_concat_layer)

        # Concatenate
        neu_mf_layer = torch.cat([gmf_layer, mlp_concat_layer], dim=-1)

        # Output
        return self.output_layer(self.last_layer(neu_mf_layer)).view(-1)