# Data preprocessing

In [71]:
import re
import pandas as pd
from glob import glob


# glob() returns matches in arbitrary order. Sorting keeps every file at the
# same position across runs, so the position-based 식당ID prefix assigned
# further down stays reproducible.
files = sorted(glob('메뉴데이터/*.csv'))

def clear_menu(menu):
    """Strip annotations and noise characters from a raw menu name.

    The substitutions are applied in this order:
      1. remove parenthesised notes, e.g. ``(대)``;
      2. remove everything from the first digit to the end of the line;
      3. remove U+119E (Hangul jungseong araea) characters;
      4. remove U+00E9 (e with acute accent) characters.

    Args:
        menu: Raw menu name as a string.

    Returns:
        The cleaned menu name. Surrounding whitespace is not trimmed.
    """
    # Raw strings stop the regex escapes from being read as (invalid) string
    # escapes, which newer Python versions warn about.
    patterns = [
        (r'\([^(]*\)', ''),
        (r'[0-9].*', ''),
        ('\u119e', ''),
        ('\xe9', ''),
    ]

    for pat, change in patterns:
        menu = re.sub(pat, change, menu)
    return menu

# Load every menu CSV, normalise the restaurant-ID column name, clean the menu
# names, and stack all files into one frame.
dfs = []
for idx, file in enumerate(files):
    raw = pd.read_csv(file)
    # The ID column is named either '식당(ID)' or '식당ID' depending on the
    # file. Pick whichever is present rather than re-reading the file under a
    # bare ``except``; a file with neither column still raises KeyError.
    id_col = '식당(ID)' if '식당(ID)' in raw.columns else '식당ID'
    df = raw[['메뉴명', id_col]].rename(columns={id_col: '식당ID'})

    df['메뉴명'] = df['메뉴명'].apply(clear_menu)
    # Prefix each ID with the file's position so that equal IDs coming from
    # different files do not collide after concatenation.
    df['식당ID'] = df['식당ID'].apply(lambda x: f'{idx}_{x}')
    dfs.append(df)

df = pd.concat(dfs)

# Substring corrections for menu names, as (text to replace, replacement).
typos = [
    ('복음', '볶음'),
    ('짬뽕면', '짬뽕'),
]

# Apply the corrections in list order to every menu name.
for wrong, right in typos:
    df['메뉴명'] = df['메뉴명'].map(lambda name: name.replace(wrong, right))

# Rows whose menu name contains any of these keywords are dropped.
removals = ['추가', '사리', '넓적당면', '넙적당면', '삶은달걀', '메추리알', '쫀득치즈', '토핑', '세트']
# Build one combined pattern so the frame is filtered and re-indexed once
# instead of once per keyword; re.escape makes each keyword match literally.
removal_pattern = '|'.join(re.escape(removal) for removal in removals)
df = df[~df['메뉴명'].str.contains(removal_pattern)].reset_index(drop=True)
df = df.drop_duplicates()

# Collect the menu names of each restaurant and write one restaurant per line,
# with the names joined by single spaces.
menus = df.groupby('식당ID')['메뉴명'].apply(list)
with open('menus.txt', mode='w', encoding='utf-8') as out:
    out.writelines(' '.join(names) + '\n' for names in menus)

  df = pd.read_csv(file)[['메뉴명', '식당(ID)']]


# Fasttext 

In [173]:
import fasttext
# Train an unsupervised skip-gram fastText model with 10-dimensional vectors
# on menus.txt.
model = fasttext.train_unsupervised('menus.txt', model='skipgram', dim=10)

In [234]:
# Persist the trained model to fasttext.bin.
model.save_model('fasttext.bin')

In [None]:
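'''
Planned pipeline:
query fastText -> find the dish names in the table that contain the word ->
compute the similarity with recipe2vec -> skip the dish if the similarity is too high ->
check the next word ... -> if the recipe2vec similarity is low, add the dish as a pairing dish
'''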

# Build the recipe2combi vector dictionary

In [196]:
import joblib

# Map each recipe serial number to the result of indexing the fastText model
# with its dish name (expected to be the name's embedding vector), then save
# the mapping to disk.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
recipe = joblib.load('crawling.df')
dish_names = recipe.set_index('레시피일련번호')['요리명']
recipe2combi = {serial: model[dish] for serial, dish in dish_names.items()}
joblib.dump(recipe2combi, 'recipe2combi.dict')

['recipe2combi.dict']