In [1]:
# 필요한 패키지 설치 (코드셀 1)
!pip install konlpy
!pip install mecab-python3
!sudo apt-get install mecab libmecab-dev mecab-ipadic mecab-ipadic-utf8
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

# konlpy의 Mecab 클래스 import (코드셀 1)
from konlpy.tag import Mecab

# 사전이 설치된 경로를 찾기 (코드셀 1)
!find / -name mecab-ko-dic 2>/dev/null

# 찾아낸 사전 경로를 바탕으로 Mecab 형태소 분석기 초기화 (코드셀 1)
dicpath = '/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ko-dic'  # 찾아낸 사전 경로
mecab = Mecab(dicpath)

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0
Collecting mecab-python3
  Downloading mecab_python3-1.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (581 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.7/581.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mecab-python3
Successfully installed mecab-python3-1.0.8
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The f

In [16]:
# Set the MECABRC environment variable (if needed); done before konlpy is imported.
import os
os.environ.update(MECABRC='/etc/mecabrc')

# Standard library
import math  # math.log is used for the IDF term later in this cell
from collections import defaultdict

# Third-party
import numpy as np
import pandas as pd
from konlpy.tag import Mecab

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# CSV file path for each cooking process.
# Every file lives in the same Drive folder and is named "<process>1000_2.csv",
# so the mapping is derived from the ordered list of process names.
_recipe_dir = '/content/drive/My Drive/recipe'
_process_names = [
    '끓이기', '굽기', '데치기', '무침', '볶음', '부침', '비빔',
    '삶기', '절임', '조림', '찜', '튀김', '회',
]
cooking_processes = {
    name: f'{_recipe_dir}/{name}1000_2.csv' for name in _process_names
}

# Verb frequencies per cooking process: process_verbs[process][verb] -> count
process_verbs = defaultdict(lambda: defaultdict(int))
# One DataFrame per successfully read file. They are concatenated once after the
# loop, because calling pd.concat inside the loop re-copies all rows each time.
recipe_frames = []

# Read each process's CSV, extract the verbs from every recipe and count them
for process, path in cooking_processes.items():
    try:
        data = pd.read_csv(path, encoding='CP949')
        data['process'] = process  # add a column naming the cooking process
        data['recipe'] = data['recipe'].astype(str)  # ensure every recipe is a string
        recipe_frames.append(data)
        for recipe in data['recipe']:
            # Keep morphemes whose POS tag starts with 'VV' (verb tags,
            # including compound tags that begin with VV).
            verbs = [word for word, tag in mecab.pos(recipe) if tag.startswith('VV')]
            for verb in verbs:
                process_verbs[process][verb] += 1
    except FileNotFoundError:
        print(f"파일 {path}을(를) 찾을 수 없습니다.")
    except Exception as e:
        # Best effort: report the problem and continue with the remaining files.
        print(f"파일 {path}을(를) 읽는 중 다음과 같은 에러가 발생했습니다: {e}")

# All recipes of all processes in one frame (empty frame if nothing could be read)
all_recipes = pd.concat(recipe_frames, ignore_index=True) if recipe_frames else pd.DataFrame()

# Sum each verb's count over all cooking processes
total_verbs = defaultdict(int)
for process_counts in process_verbs.values():
    for verb, count in process_counts.items():
        total_verbs[verb] = total_verbs[verb] + count

# Total number of recipes (rows) across all loaded files
total_recipes = all_recipes.shape[0]

# Conditional probability and TF-IDF score of every verb, per cooking process
conditional_probabilities = {}
tf_idf_scores = {}
for process, verbs in process_verbs.items():
    # Number of verb occurrences counted for this process (TF denominator)
    total_verb_count = sum(verbs.values())

    process_cond = {}
    tf_scores = {}
    idf_scores = {}
    for verb, count in verbs.items():
        # Share of this verb's occurrences (over all processes) that fall in this process
        process_cond[verb] = count / total_verbs[verb]
        # Term frequency: the verb's share of all verb occurrences within this process
        tf_scores[verb] = count / total_verb_count
        # NOTE(review): total_verbs holds occurrence counts, not the number of recipes
        # containing the verb, so the ratio can drop below 1 and the log turn negative.
        # That is presumably why the product is clamped at 0 below; confirm this is
        # the intended IDF definition.
        idf_scores[verb] = math.log(total_recipes / total_verbs[verb])

    process_tf_idf = {}
    for verb, tf in tf_scores.items():
        process_tf_idf[verb] = max(tf * idf_scores[verb], 0)

    conditional_probabilities[process] = process_cond
    tf_idf_scores[process] = process_tf_idf

# Cooking processes to report on
selected_processes = ['회']  # add further process names here

# For each selected process: combine conditional probability and TF-IDF into one
# weighted score per verb and print the verbs from highest to lowest score.
for process in selected_processes:
    cond_probs = conditional_probabilities.get(process)
    tf_idf = tf_idf_scores.get(process)
    if not cond_probs or not tf_idf:
        # The process is unknown, its file failed to load, or it contained no verbs.
        # Report it and move on rather than failing with a KeyError.
        print(f"\n{process} 과정: 분석할 동사 데이터가 없습니다.")
        continue

    # Standard deviation of each metric over the verbs of this process
    cond_std = np.std(list(cond_probs.values()))
    tf_idf_std = np.std(list(tf_idf.values()))

    weighted_averages = {}
    if cond_std > 0 and tf_idf_std > 0:
        # Weight each metric by the inverse of its standard deviation
        cond_weight = 1/cond_std
        tf_idf_weight = 1/tf_idf_std
        for verb in cond_probs:
            weighted_avg = (cond_probs[verb] / cond_std +
                            tf_idf[verb] / tf_idf_std) / (cond_weight + tf_idf_weight)
            weighted_averages[verb] = weighted_avg
    else:
        # A zero spread (e.g. a single verb, or every TF-IDF score clamped to 0)
        # would make the inverse-std weights infinite and the scores NaN.
        # Fall back to equal weights, i.e. a plain mean of the two metrics.
        cond_weight = 1.0
        tf_idf_weight = 1.0
        for verb in cond_probs:
            weighted_averages[verb] = (cond_probs[verb] + tf_idf[verb]) / 2

    # Sort by weighted score, highest first
    sorted_averages = sorted(weighted_averages.items(), key=lambda x: x[1], reverse=True)

    # Print: verb, conditional probability, TF-IDF, both weights, weighted score
    print(f"\n{process} 과정의 결과:")
    for verb, weighted_avg in sorted_averages:
        print(f"{verb}, {cond_probs.get(verb, 0)}, {tf_idf.get(verb, 0)}, {cond_weight}, {tf_idf_weight}, {weighted_avg}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

회 과정의 결과:
올려, 0.10486280901928824, 0.04145246673802792, 3.6058142792439534, 306.41387410847824, 0.042189987373696586
올리, 0.0994550408719346, 0.027804252838240753, 3.6058142792439534, 306.41387410847824, 0.02863761744148447
찍, 0.12831858407079647, 0.022640044461763167, 3.6058142792439534, 306.41387410847824, 0.023869183130709043
깔, 0.09950738916256158, 0.022601532094507017, 3.6058142792439534, 306.41387410847824, 0.02349601798672775
얹, 0.11603650586701435, 0.02216879717782982, 3.6058142792439534, 306.41387410847824, 0.023260565013968247
있, 0.050809716599190285, 0.020284013913946446, 3.6058142792439534, 306.41387410847824, 0.020639055928209585
말, 0.08599779492833518, 0.018247413244785547, 3.6058142792439534, 306.41387410847824, 0.019035412532731565
썬, 0.062420382165605096, 0.018068393814083702, 3.6058142792439534, 306.41387410847824, 0.018584248254627393
뿌려, 0