# csm 테이블 생성

csm 테이블은 분석 가능한 모든 캐릭터 쌍에 대해 두 캐릭터 사이의 거리(값이 작을수록 서로 비슷함)를 미리 계산해
놓은 테이블입니다.


In [75]:
import ast
import calendar
import datetime
from functools import partial
import json
from operator import itemgetter
from pprint import pprint

import numpy as np
import pandas as pd

In [2]:
# Load the character attribute sheet ('Table1') into a DataFrame.
# NOTE(review): read_excel turns cells such as "nan"/"NA" into missing values
# by default; a later cell looks up the index label 'nan', so presumably one
# character id is affected by this — confirm.
chars = pd.read_excel(
  './datasets/characters-similarity.xlsx',
  sheet_name='Table1'
)

def settify(expr: str) -> set:
  """Parse a collection written as a Python literal and return it as a set.

  The text is evaluated with ``ast.literal_eval`` (literals only, no
  arbitrary code) and the elements of the result are collected into a set.
  """
  parsed = ast.literal_eval(expr)
  return {item for item in parsed}

# Convert the text cells of 'colors' and 'styles' into real sets so that set
# operations can be applied to them later on.
chars['colors'] = chars['colors'].apply(settify)
chars['styles'] = chars['styles'].apply(settify)
# Show the first rows of both converted columns as a sanity check.
print(chars['styles'].head(), chars['colors'].head(), sep='\n')

0           {쿨}
1     {액티브, 심플}
2    {심플, 엘레강스}
3         {액티브}
4          {심플}
Name: styles, dtype: object
0    {파랑색, 검정색}
1    {파랑색, 검정색}
2    {핑크색, 하양색}
3    {하양색, 빨강색}
4    {빨강색, 파랑색}
Name: colors, dtype: object


In [4]:
# Use the values of the 'id' column as row labels so that rows can be looked up
# by character id.
chars.index = chars['id']

In [6]:
# Remove the identifier and display-only columns; the columns that remain are
# the attributes compared further down.
chars = chars.drop(
  labels=[
    'id', 'name_en', 'name_ko', 'image_icon',
    'species', 'gender', 'birthday_month', 'birthday_day',
  ],
  axis='columns',
)

In [55]:
# Force every index label to be a string.
# NOTE(review): presumably this is what turns a missing id into the text label
# 'nan' that the next cell looks up — confirm.
chars.index = chars.index.astype(str)

In [57]:
# Inspect the row stored under the string label 'nan'.
chars.loc['nan']

birthday             08-24
personality            친절함
hobby                   자연
styles          {심플, 엘레강스}
colors         {노랑색, 오렌지색}
Name: nan, dtype: object

## 필요한 함수 정의

`TODAY_YEAR`를 오늘 날짜에서 직접 구하면(`datetime.date.today().year`) 2022년은 윤년이 아니기 때문에, 생일이 2월 29일인 캐릭터는 날짜를 만들 때 에러가 납니다.

실제로 몽셰르는 특수 NPC라서 지금 다루는 테이블에는 빠져 있습니다. 하지만 서비스를 시작하면 생일을 2월 29일로 입력하는 사용자가 있을 수 있으므로, 유사도는 윤년을 기준으로 계산하는 것이 좋겠습니다.


In [80]:
# Reference year in which birthdays are placed to obtain a day-of-year number.
# It is pinned to a leap year rather than datetime.date.today().year so that
# February 29 is always a valid date.
TODAY_YEAR = 2020
TODAY_LEAP = calendar.isleap(TODAY_YEAR)
# Half the length of the reference year in days (183.0 for a leap year,
# 182.5 otherwise).
DAYMOD = (366 if TODAY_LEAP else 365) / 2

def compare_simple(a, b):
  """Return the distance between two plain values: 0.0 if equal, else 1.0."""
  if a == b:
    return 0.0
  return 1.0

def compare_set(a: set, b: set):
  """Return the distance between two sets as a float in [0.0, 1.0].

  The distance is ``1 - len(a & b) / max(len(a), len(b))``: 0.0 when the
  sets are equal and 1.0 when they share no element.  Two empty sets are
  treated as identical (distance 0.0) instead of raising
  ``ZeroDivisionError``.
  """
  larger = max(len(a), len(b))
  if larger == 0:
    # Both sets are empty: nothing differs and the ratio below is undefined.
    return 0.0
  return 1.0 - len(a & b) / larger

def to_yday(bdstr: str) -> int:
  """Convert a 'month-day' birthday string into its day of the year.

  The date is placed in ``TODAY_YEAR``; January 1 maps to 1.
  """
  month_text, day_text = bdstr.split('-')
  birthday = datetime.date(TODAY_YEAR, int(month_text), int(day_text))
  new_year = datetime.date(TODAY_YEAR, 1, 1)
  return (birthday - new_year).days + 1
  
def compare_yday(a: str, b: str):
  """Return the circular distance between two birthdays divided by DAYMOD.

  Both arguments are birthday strings accepted by ``to_yday``.  The gap in
  days is measured the short way around the year.
  """
  gap = abs(to_yday(a) - to_yday(b))
  # Going the other way around the year takes DAYMOD*2 - gap days; use that
  # direction whenever the direct gap is not shorter than DAYMOD.
  if gap >= DAYMOD:
    gap = DAYMOD*2 - gap
  return gap / DAYMOD


In [81]:
# Empty square frame with the character ids as both row and column labels; the
# loop in the next cell fills it with pairwise distances.
table = pd.DataFrame(index=chars.index, columns=chars.index)
table.head()

id,admiral,agents,agnes,al,alfonso,alice,alli,amelia,anabelle,anchovy,...,wartjr,weber,wendy,whitney,willow,winnie,wolfgang,yuka,zell,zucker
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
admiral,,,,,,,,,,,...,,,,,,,,,,
agents,,,,,,,,,,,...,,,,,,,,,,
agnes,,,,,,,,,,,...,,,,,,,,,,
al,,,,,,,,,,,...,,,,,,,,,,
alfonso,,,,,,,,,,,...,,,,,,,,,,


In [82]:
# Fill the distance matrix one column at a time: column `char_id` receives the
# distance from every character (row) to the reference character `char_id`.
# The loop variable is deliberately not named `id`, which would shadow the
# builtin for the rest of the session.
for char_id in table.columns:
  ref = chars.loc[char_id]

  # Per-attribute distances between the reference character and every
  # character, one column per attribute.
  vectors = pd.DataFrame({
    'birthday': chars['birthday'].apply(partial(compare_yday, ref.birthday)),
    'hobby': chars['hobby'].apply(partial(compare_simple, ref.hobby)),
    'personality': chars['personality'].apply(
      partial(compare_simple, ref.personality)
    ),
    'colors': chars['colors'].apply(partial(compare_set, ref.colors)),
    'styles': chars['styles'].apply(partial(compare_set, ref.styles)),
  })

  # Euclidean norm of each row, computed for all rows at once.  skipna=False
  # keeps a missing component visible as NaN instead of counting it as 0.
  table[char_id] = np.sqrt((vectors**2).sum(axis=1, skipna=False))

In [83]:
# Spot-check one row: the distances stored for the character 'ike'.
table.loc['ike']

id
admiral     0.781865
agents      1.820978
agnes       2.004660
al          2.171958
alfonso     1.807540
              ...   
winnie      1.893545
wolfgang    1.547158
yuka        1.837433
zell        1.806780
zucker      1.546663
Name: ike, Length: 391, dtype: float64

In [84]:
# Preview the first rows of the filled matrix.
table.head()

id,admiral,agents,agnes,al,alfonso,alice,alli,amelia,anabelle,anchovy,...,wartjr,weber,wendy,whitney,willow,winnie,wolfgang,yuka,zell,zucker
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
admiral,0.0,1.932882,2.053227,2.074755,1.94581,2.184952,2.047219,1.625474,1.806085,2.010194,...,1.733466,1.40264,2.015677,1.737898,2.028493,1.802908,1.272209,2.040706,1.941724,1.51664
agents,1.932882,0.0,1.845209,1.611922,1.586126,2.017126,2.120592,1.958384,1.435428,1.918331,...,1.604572,1.50004,1.748659,1.851227,2.155286,1.717267,1.771019,2.002417,2.00466,1.703468
agnes,2.053227,1.845209,0.0,2.053651,1.523711,1.918331,2.015677,1.791137,1.837433,1.52276,...,1.922094,1.84291,2.098048,1.978114,1.909256,1.641924,2.157328,1.868655,2.016423,1.818739
al,2.074755,1.611922,2.053651,0.0,1.662058,1.832348,2.003289,1.811236,2.106463,1.889091,...,2.024957,1.833388,1.836385,2.007161,1.815329,1.891881,1.814695,2.05958,2.127958,1.897922
alfonso,1.94581,1.586126,1.523711,1.662058,0.0,1.844052,2.165617,2.189375,1.699432,1.131794,...,1.552136,1.230109,1.839577,1.883774,2.205214,1.93769,2.202918,2.01251,2.00003,1.325995


## 자료를 csv와 json으로 내보냅니다.


### csv

아마 필요 없겠지만 혹시 몰라서 csv로도 저장해 둡니다.


In [85]:
# Write the distance matrix to a CSV file.
table.to_csv('./datasets/csm.csv')

### json

먼저 `dict`로 바꿔서 처리를 좀 해줘야 합니다.


In [86]:
# Convert the matrix to a nested dict: csm[column_label][row_label] -> distance.
csm = table.to_dict(orient='dict')

In [87]:
# for k, v in csm['admiral'].items():
#   print(f'{repr(k)}: {v}')

# after = set(csm['admiral'].keys())
# before = set(table.index)
# print(after - before)
# print(before - after)

# Sanity check: report the type of any key that is not a string.  Printing
# nothing means every key of csm['admiral'] is a str.
for k in csm['admiral']:
  if not isinstance(k, str):
    print(type(k))


In [88]:
# Turn each character's {other_id: distance} mapping into a list of
# {'id': ..., 'distance': ...} records ordered by ascending distance.
# Only values of existing keys are replaced, so rebinding csm[k] while
# iterating over csm is safe.  No value is bound to `_`, which IPython uses
# for the last displayed result.
for k in csm:
  neighbours = csm[k]
  # Drop the character's distance to itself.
  del neighbours[k]
  csm[k] = [
    {'id': other_id, 'distance': distance}
    for other_id, distance in sorted(neighbours.items(), key=itemgetter(1))
  ]

In [89]:
# Write the nested structure to disk as a JSON document.
with open('./datasets/csmdata.json', mode='w', encoding='utf-8') as j_out:
  j_out.write(json.dumps(csm))