In [None]:
#20살

In [7]:
import sys
import os
import json
from collections import Counter
from konlpy.tag import Okt, Komoran, Hannanum, Kkma, Mecab
import tkinter as tk
from tkinter import filedialog, messagebox, ttk, StringVar
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib import font_manager, rc
from matplotlib.ticker import FuncFormatter
from openpyxl import Workbook
import ijson
import csv

In [8]:
# 단어 통계를 위한 Counter 객체 생성
word_counter = Counter()

mecab = Mecab(dicpath=r"C:/mecab/mecab-ko-dic")

# 형태소 분석기 초기화
morpheme_analyzers = {
    "선택 없음": None,
    "Okt": Okt(),
    "Komoran": Komoran(),
    "Hannanum": Hannanum(),
    "Kkma": Kkma(),
    'Mecab': mecab
}

In [4]:
# 폰트 설정
if getattr(sys, 'frozen', False):  # 코드가 PyInstaller로 패키징된 경우
    base_path = sys._MEIPASS
else:
    base_path = os.path.dirname(__file__)

font_path = os.path.join(base_path, "fonts", "malgun.ttf")
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)

NameError: name '__file__' is not defined

In [9]:
def generate_ngrams(s, n):
    # Input: s = string, n = size of the ngram
    # Output: list of ngrams
    tokens = s.split()
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [10]:
def analyze_folder():
    global word_counter, all_sentences

    folder_path = filedialog.askdirectory()
    if not folder_path:
        return

    user_input_path = user_input.get()
    path_elements = user_input_path.split('.')

    # GUI에서 필터링할 key와 value를 가져옵니다.
    user_target_key = filter_key_entry.get()
    user_target_value = filter_value_entry.get()

    # 폴더 내 json 파일 개수를 세어 프로그레스바 최대값 설정
    total_files = len([name for name in os.listdir(folder_path) if name.endswith('.json')])
    progress_bar['maximum'] = total_files

    # 초기화
    word_counter = Counter()
    all_sentences = {'original': [], 'analyzed': []}
    reset_table()

    processed_files = 0
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)

            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                analyze_data(data, path_elements, user_target_key, user_target_value)

            # 파일 하나 처리가 끝나면 프로그레스바를 업데이트
            processed_files += 1
            progress_bar['value'] = processed_files
            root.update_idletasks()

    # 사용자 입력에서 추출 수 가져오기
    extract_count = extract_cnt_entry.get()
    if extract_count:
        n = int(extract_count)
        sorted_items = sorted(word_counter.items(), key=lambda item: item[1]['count'], reverse=True)[:n]
    else:
        sorted_items = sorted(word_counter.items(), key=lambda item: item[1]['count'], reverse=True)

    # 결과값이 있는지 확인
    if not sorted_items:
        messagebox.showinfo("Information", "조건에 해당하는 결과값이 없습니다!")
        return

    # 결과 CSV 파일 경로 설정
    word_csv_path = os.path.join(folder_path, "word_count.csv")
    sentence_csv_path = os.path.join(folder_path, "sentence_analysis.csv")

    # 단어 카운터를 CSV 파일로 저장
    with open(word_csv_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Word", "Count"])
        for word, count_data in sorted_items:
            writer.writerow([word, count_data['count']])

    # 문장 분석 결과를 CSV 파일로 저장
    with open(sentence_csv_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Original", "Analyzed"])
        for original, analyzed in zip(all_sentences['original'], all_sentences['analyzed']):
            writer.writerow([original, analyzed])

    plot_data()  # Plot the data

    messagebox.showinfo("Information", f"형태소 분석이 완료되었습니다! 결과가 '{folder_path}'에 저장되었습니다.")

In [11]:
def reset_table():
    global canvas, word_counter

    # Remove previous canvas if exists
    if canvas is not None:
        canvas.get_tk_widget().pack_forget()
        canvas = None

    # 표를 비웁니다.
    # result_tree.delete(*result_tree.get_children())

    # 단어 카운터를 초기화합니다.
    word_counter = Counter()

    # 프로그레스바를 0으로 초기화합니다.
    progress_bar['value'] = 0

In [12]:
def update_user_input(*args):
    user_input.delete(0, 'end')
    user_input.insert(0, tag_options[tag_variable.get()])


def get_selected_data():
    output_type = output_option_combobox.get()
    data = []

    if output_type == "단어":
        for item in result_tree.get_children():
            word, count = result_tree.item(item, 'values')
            data.append((word, count))
    elif output_type == "문장":
        for original, analyzed in zip(all_sentences['original'], all_sentences['analyzed']):
            data.append((original, analyzed))

    return data

In [13]:
def save_to_excel():
    selected_data = get_selected_data()

    file_path = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV files", "*.csv"), ("All Files", "*.*")])
    if file_path:
        with open(file_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)

            # 헤더 작성
            if output_option_combobox.get() == "단어":
                writer.writerow(["Word", "Count"])
            else:  # 문장
                writer.writerow(["Original", "Analyzed"])

            # 데이터 작성
            for data in selected_data:
                writer.writerow(data)

        if messagebox.askyesno("Information", "저장되었습니다. 파일을 열겠습니까?"):
            os.startfile(file_path)

In [14]:
def save_to_txt():
    selected_data = get_selected_data()

    file_path = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt"), ("All Files", "*.*")])

    if file_path:
        with open(file_path, 'w', encoding="utf-8") as f:
            if output_option_combobox.get() == "단어":
                f.write("Word\tCount\n")
                for word, count in selected_data:
                    f.write(f"{word}\t{count}\n")
            else:  # 문장
                f.write("Original\tAnalyzed\n")
                for original, analyzed in selected_data:
                    f.write(f"{original}\t{analyzed}\n")

        if messagebox.askyesno("Information", "저장되었습니다. 파일을 열겠습니까?"):
            os.startfile(file_path)

In [15]:
def only_numbers(char):
    return char.isdigit()

In [16]:
# Action: 스타일 설정
def configure_styles():
    style = ttk.Style()
    style.theme_use('clam')  # 클램 테마는 더 현대적인 느낌을 줍니다.
    style.configure('TLabel', font=('Arial', 10), background='white')
    style.configure('TEntry', font=('Arial', 10), padding=5)
    style.configure('TButton', font=('Arial', 10), padding=5)
    style.configure('TCombobox', font=('Arial', 10), padding=5)
    style.map('TCombobox', fieldbackground=[('readonly', 'white')],
              selectbackground=[('readonly', 'white')],
              selectforeground=[('readonly', 'black')])
    style.configure('TFrame', background='white')  # 프레임 배경색 설정
    style.configure('Horizontal.TProgressbar', background='#FA8072')

In [18]:
# Action: GUI 생성

root = tk.Tk()
root.title("n-gram 및 형태소 분석기 v1.1")
# 프로그램의 고정된 크기
program_width = 670
program_height = 850

# 화면의 중앙에 프로그램이 위치하도록 좌표를 계산합니다.
screen_width = root.winfo_screenwidth()
screen_height = root.winfo_screenheight()
center_x = int((screen_width - program_width) / 2)
center_y = int((screen_height - program_height) / 2)

# 프로그램의 위치와 크기를 설정합니다.
root.geometry(f'{program_width}x{program_height}+{center_x}+{center_y}')

configure_styles()

# 숫자 입력 확인을 위한 유효성 검사 커맨드 생성
vcmd = root.register(only_numbers)

# (1,1)
tag_label = tk.Label(root, text="* 태그", anchor='w')
tag_label.grid(row=0, column=0, sticky='we', padx=10, pady=5)

# (1,2)
user_input = ttk.Entry(root)
user_input.grid(row=0, column=1, sticky='ew', padx=10, pady=5)

# (1,3)
tag_menu_label = tk.Label(root, text="* 태그 선택", anchor='w')
tag_menu_label.grid(row=0, column=2, sticky='we', padx=10, pady=5)

# (1,4)
tag_variable = StringVar(root)
tag_variable.trace("w", update_user_input)
tag_options = {
    "신문 말뭉치": "document.paragraph.form",
    "일상 대화 말뭉치": "document.utterance.form",
    "직접 입력": "",
}
tag_menu = ttk.Combobox(root, textvariable=tag_variable, values=list(tag_options.keys()), state='readonly')
tag_menu.grid(row=0, column=3, sticky='ew', padx=10, pady=5)
tag_menu.set("직접 입력")

# (2,1)
ngram_cnt_label = tk.Label(root, text="* n-gram 사이즈", anchor='w')
ngram_cnt_label.grid(row=1, column=0, sticky='we', padx=10, pady=5)

# (2,2)
ngram_cnt_entry = ttk.Entry(root, validate="key", validatecommand=(vcmd, '%S'))
ngram_cnt_entry.grid(row=1, column=1, sticky='ew', padx=10, pady=5)
ngram_cnt_entry.insert(0, "1")

# (2,3)
tag_menu_label = tk.Label(root, text="* 형태소 분석기", anchor='w')
tag_menu_label.grid(row=1, column=2, sticky='we', padx=10, pady=5)

# (2,4)
morpheme_analyzer = StringVar(root)
morpheme_menu = ttk.Combobox(root, textvariable=morpheme_analyzer, values=list(morpheme_analyzers.keys()), state='readonly')
morpheme_menu.grid(row=1, column=3, sticky='ew', padx=10, pady=5)
morpheme_menu.set("선택 없음")

# (3,1)
graph_cnt_label = tk.Label(root, text="* 그래프 표출수(최대20)", anchor='w')
graph_cnt_label.grid(row=2, column=0, sticky='we', padx=10, pady=5)

# (3,2)
graph_cnt_entry = ttk.Entry(root, validate="key", validatecommand=(vcmd, '%S'))
graph_cnt_entry.grid(row=2, column=1, sticky='ew', padx=10, pady=5)
graph_cnt_entry.insert(0, "10")

# (3,3)
extract_cnt_label = tk.Label(root, text="표 표출수", anchor='w')
extract_cnt_label.grid(row=2, column=2, sticky='we', padx=10, pady=5)

# (3,4)
extract_cnt_entry = ttk.Entry(root, validate="key", validatecommand=(vcmd, '%S'))
extract_cnt_entry.grid(row=2, column=3, sticky='ew', padx=10, pady=5)

# (4,1)
filter_key_label = tk.Label(root, text="필터키", anchor='w')
filter_key_label.grid(row=3, column=0, sticky='we', padx=10, pady=5)

# (4,2)
filter_key_entry = ttk.Entry(root)
filter_key_entry.grid(row=3, column=1, sticky='ew', padx=10, pady=5)

# (4,3)
tag_menu_label = tk.Label(root, text="필터값", anchor='w')
tag_menu_label.grid(row=3, column=2, sticky='we', padx=10, pady=5)

# (4,4)
filter_value_entry = ttk.Entry(root)
filter_value_entry.grid(row=3, column=3, sticky='ew', padx=10, pady=5)

# (5,1)
concordance_label = tk.Label(root, text="콘코던스 단어(|로 구분)", anchor='w')
concordance_label.grid(row=4, column=0, sticky='we', padx=10, pady=5)

# (5,2)
concordance_entry = ttk.Entry(root)
concordance_entry.grid(row=4, column=1, sticky='ew', padx=10, pady=5)

# (5,3)
exclude_words_label = tk.Label(root, text="제외 단어(|로 구분)", anchor='w')
exclude_words_label.grid(row=4, column=2, sticky='we', padx=10, pady=5)

# (5,4)
exclude_words_entry = ttk.Entry(root)
exclude_words_entry.grid(row=4, column=3, sticky='ew', padx=10, pady=5)

# (6,1)
linguistic_unit_label = tk.Label(root, text="언어 단위 선택", anchor='w')
linguistic_unit_label.grid(row=5, column=0, sticky='we', padx=10, pady=5)

# (6,2)
linguistic_unit_variable = StringVar(root)
linguistic_unit_variable.trace("w", update_user_input)
linguistic_unit_options = {
    "형태소": "",
    "단위": "",
    "어절": ""}
linguistic_unit_menu = ttk.Combobox(root, textvariable=linguistic_unit_variable, values=list(linguistic_unit_options.keys()), state='readonly')
linguistic_unit_menu.grid(row=5, column=1, sticky='ew', padx=10, pady=5)
linguistic_unit_menu.set("직접 입력")

# (6,3)

window_label = tk.Label(root, text="window 사이즈", anchor='w')
window_label.grid(row=5, column=2, sticky='we', padx=10, pady=5)

# (6,4)
window_entry = ttk.Entry(root)
window_entry.grid(row=5, column=3, sticky='ew', padx=10, pady=5)

# (7,1)
folder_button = ttk.Button(root, text="폴더 선택 및 분석", command=analyze_folder)
folder_button.grid(row=6, column=0, sticky='ew', padx=10, pady=5, columnspan=2)

# (7,2)
reset_button = ttk.Button(root, text="결과창 리셋", command=reset_table)
reset_button.grid(row=6, column=2, sticky='ew', padx=10, pady=5, columnspan=2)

# (8,1)
progress_bar = ttk.Progressbar(root, orient="horizontal", length=200, mode="determinate")
progress_bar.grid(row=7, column=0, columnspan=4, sticky='we')

# (9,1)
graph_frame = ttk.Frame(root, height=500)  # height를 설정해 줍니다.
graph_frame.grid(row=8, column=0, columnspan=4, sticky='ew')  # sticky를 'ew'로 변경합니다.

root.grid_columnconfigure(1, weight=1)
root.grid_rowconfigure(6, weight=1)
root.grid_rowconfigure(7, weight=1)

root.mainloop()