# csm 테이블 생성

csm 테이블은 분석 가능한 모든 캐릭터 쌍에 대해 유사성(거리)을 미리 계산해 놓은
테이블입니다. 값이 작을수록 두 캐릭터가 더 비슷합니다.


In [75]:
import ast
import calendar
import datetime
from functools import partial
import json
from operator import itemgetter
from pprint import pprint

import numpy as np
import pandas as pd

In [2]:
# Load the 'Table1' sheet of the character workbook into a DataFrame.
chars = pd.read_excel('./datasets/characters-similarity.xlsx', sheet_name='Table1')

def settify(expr: str) -> set:
  """Parse a collection written as Python-literal text and return it as a set.

  The text is evaluated with ``ast.literal_eval`` (literals only, no code is
  executed) and the resulting iterable is converted to a ``set``.
  """
  parsed = ast.literal_eval(expr)
  return set(parsed)

# Convert the text-encoded 'styles' and 'colors' cells into real sets first,
# then preview the head of each converted column.
for column in ('styles', 'colors'):
  chars[column] = chars[column].apply(settify)
for column in ('styles', 'colors'):
  print(chars[column].head())

0           {쿨}
1     {액티브, 심플}
2    {심플, 엘레강스}
3         {액티브}
4          {심플}
Name: styles, dtype: object
0    {파랑색, 검정색}
1    {파랑색, 검정색}
2    {핑크색, 하양색}
3    {하양색, 빨강색}
4    {빨강색, 파랑색}
Name: colors, dtype: object


In [4]:
# Label the rows with the values of the 'id' column so a row can be selected
# with chars.loc[<id>].
chars.index = chars['id']

In [6]:
# Remove the listed columns ('id' is already in use as the row index).
chars = chars.drop(
  [
    'id', 'name_en', 'name_ko', 'image_icon',
    'species', 'gender', 'birthday_month', 'birthday_day',
  ],
  axis='columns',
)

In [55]:
# Normalise every index label to str.
# NOTE(review): the lookup of 'nan' in the next cell suggests one id was read
# as a missing value and only becomes the label 'nan' through this cast —
# confirm against the workbook.
chars.index = chars.index.astype(str)

In [57]:
# Display the row whose index label is the string 'nan'.
chars.loc['nan']

birthday             08-24
personality            친절함
hobby                   자연
styles          {심플, 엘레강스}
colors         {노랑색, 오렌지색}
Name: nan, dtype: object

## 필요한 함수 정의


In [8]:
# Calendar facts about the current year, evaluated once when the cell runs.
TODAY_YEAR = datetime.date.today().year
TODAY_LEAP = calendar.isleap(TODAY_YEAR)
# Half the number of days in the current year (183.0 or 182.5).
DAYMOD = (366 if TODAY_LEAP else 365) / 2

def compare_simple(a, b):
  """Return 0.0 when ``a == b`` and 1.0 otherwise."""
  if a == b:
    return 0.0
  return 1.0

def compare_set(a: set, b: set):
  """Return the dissimilarity of two sets as a float between 0.0 and 1.0.

  The score is ``1 - len(a & b) / max(len(a), len(b))``: 0.0 when the sets are
  equal and 1.0 when they have no element in common. Two empty sets are
  treated as identical (0.0) instead of raising ``ZeroDivisionError``.
  """
  larger = max(len(a), len(b))
  if larger == 0:
    # Both sets are empty: nothing differs, and the ratio would be 0/0.
    return 0.0
  return 1.0 - len(a & b) / larger

def to_yday(bdstr: str, year=None) -> int:
  """Return the 1-based day of the year for a ``'MM-DD'`` string.

  Args:
    bdstr: Month and day separated by ``'-'``, e.g. ``'08-24'``.
    year: Year used to resolve the date. Defaults to ``TODAY_YEAR``.

  Returns:
    The day of the year (1 for January 1st).

  Raises:
    ValueError: If ``bdstr`` is not two integers separated by ``'-'`` or does
      not name a valid date.

  February 29th in a non-leap year is mapped to March 1st (day 60) instead of
  raising, so leap-day birthdays can be compared in any year.
  """
  if year is None:
    year = TODAY_YEAR
  m, d = map(int, bdstr.split('-'))
  if (m, d) == (2, 29) and not calendar.isleap(year):
    # datetime.date() rejects Feb 29 outside leap years; use the next day.
    m, d = 3, 1
  return datetime.date(year, m, d).timetuple().tm_yday
  
def compare_yday(a: str, b: str):
  """Return how far apart two ``'MM-DD'`` dates are around the year.

  Both strings are converted with ``to_yday``. The gap is measured the short
  way around the year (a gap of ``DAYMOD`` or more wraps to ``2*DAYMOD - gap``)
  and then divided by ``DAYMOD``.
  """
  gap = abs(to_yday(a) - to_yday(b))
  wrapped = DAYMOD*2 - gap
  # The shorter of the direct gap and the wrap-around gap.
  return min(gap, wrapped) / DAYMOD


In [59]:
# Square frame with the character ids as both row and column labels; no data
# is supplied, so every cell starts out missing.
table = pd.DataFrame(index=chars.index, columns=chars.index)
# Preview the first rows.
table.head()

id,admiral,agents,agnes,al,alfonso,alice,alli,amelia,anabelle,anchovy,...,wartjr,weber,wendy,whitney,willow,winnie,wolfgang,yuka,zell,zucker
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
admiral,,,,,,,,,,,...,,,,,,,,,,
agents,,,,,,,,,,,...,,,,,,,,,,
agnes,,,,,,,,,,,...,,,,,,,,,,
al,,,,,,,,,,,...,,,,,,,,,,
alfonso,,,,,,,,,,,...,,,,,,,,,,


In [60]:
# Fill the table one column at a time: column `char_id` receives the distance
# between the reference character `char_id` and every character in `chars`.
# (The loop variable is not called `id`, so the builtin is not shadowed.)
for char_id in table.columns:
  ref = chars.loc[char_id]

  # One dissimilarity score per feature, comparing each character against the
  # reference character.
  vectors = pd.DataFrame({
    'birthday': chars['birthday'].apply(partial(compare_yday, ref.birthday)),
    'hobby': chars['hobby'].apply(partial(compare_simple, ref.hobby)),
    'personality': chars['personality'].apply(
      partial(compare_simple, ref.personality)
    ),
    'colors': chars['colors'].apply(partial(compare_set, ref.colors)),
    'styles': chars['styles'].apply(partial(compare_set, ref.styles)),
  })

  # Euclidean norm of each row, computed in one vectorized pass instead of a
  # Python-level function call per row. skipna=False lets a NaN score
  # propagate into the distance instead of being counted as zero.
  vectors['distance'] = np.sqrt((vectors ** 2).sum(axis=1, skipna=False))

  table[char_id] = vectors['distance']

In [61]:
# Spot-check: display the row of computed distances labelled 'ike'.
table.loc['ike']

id
admiral     0.778922
agents      1.821078
agnes       2.004686
al          2.172864
alfonso     1.807566
              ...   
winnie      1.892358
wolfgang    1.545394
yuka        1.837622
zell        1.806802
zucker      1.546915
Name: ike, Length: 391, dtype: float64

In [40]:
# NOTE(review): indexing with a boolean frame keeps a cell only where the mask
# is True and blanks the rest, so this expression displays NaN in every cell
# whether or not any value is missing. `table.isnull().any(axis=None)` or
# `table.isnull().sum().sum()` would actually report missing values.
table[table.isnull()]

id,admiral,agents,agnes,al,alfonso,alice,alli,amelia,anabelle,anchovy,...,wartjr,weber,wendy,whitney,willow,winnie,wolfgang,yuka,zell,zucker
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
admiral,,,,,,,,,,,...,,,,,,,,,,
agents,,,,,,,,,,,...,,,,,,,,,,
agnes,,,,,,,,,,,...,,,,,,,,,,
al,,,,,,,,,,,...,,,,,,,,,,
alfonso,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
winnie,,,,,,,,,,,...,,,,,,,,,,
wolfgang,,,,,,,,,,,...,,,,,,,,,,
yuka,,,,,,,,,,,...,,,,,,,,,,
zell,,,,,,,,,,,...,,,,,,,,,,


## 자료를 csv와 json으로 내보냅니다.


### csv

아마 필요 없겠지만 혹시 몰라서 csv로도 저장해 둡니다.


In [62]:
# Write the full distance table, including its row labels, to a CSV file.
table.to_csv(path_or_buf='./datasets/csm.csv')

### json

먼저 `dict`로 바꾼 뒤, 캐릭터마다 자기 자신을 제외하고 거리가 가까운 순서로 정렬한 목록으로 가공합니다.


In [72]:
# Nested mapping {column label: {row label: value}}; 'dict' is the default
# orientation of DataFrame.to_dict.
csm = table.to_dict()

In [66]:
# Sanity check before serialising: print the type of every key of
# csm['admiral'] that is not a string. Nothing is printed when all keys are
# strings.
for k in csm['admiral']:
  if not isinstance(k, str):
    print(type(k))


In [73]:
# Turn each {other_id: distance} mapping into a list of
# {'id': ..., 'distance': ...} records ordered from smallest to largest
# distance, leaving out the character's distance to itself.
# Named locals are used instead of `_`, which IPython uses for the previous
# cell output.
for ref_id, distances in csm.items():
  if isinstance(distances, list):
    # Already converted (e.g. the cell was run twice); leave it untouched.
    continue
  ranked = sorted(
    ((other_id, dist) for other_id, dist in distances.items()
     if other_id != ref_id),
    key=itemgetter(1),
  )
  # Rebinding an existing key does not change the dict's size, so it is safe
  # while iterating over csm.items().
  csm[ref_id] = [
    {'id': other_id, 'distance': dist} for other_id, dist in ranked
  ]

In [77]:
# Serialise the processed mapping to a JSON file.
# NOTE(review): json.dump defaults to allow_nan=True, so a NaN distance would
# be written as the bare token NaN, which strict JSON parsers reject — confirm
# the table has no missing values before relying on this file.
with open('./datasets/csmdata.json', 'w', encoding='utf-8') as j_out:
  json.dump(csm, j_out)