## 데이터 저장

In [1]:
import pandas as pd
import pickle
import os

In [2]:
def load_and_save_csv_files(directory):
    """Convert every ``*_Q.csv`` / ``*_R.csv`` file in *directory* to a pickle.

    Each matching CSV is read with pandas (UTF-8) and the resulting DataFrame
    is handed to ``save_data_to_file`` with a path in the same directory whose
    name is the CSV's base name plus ``.pkl``.

    Args:
        directory: Path of the folder to scan (top level only).
    """
    for filename in os.listdir(directory):
        # str.endswith accepts a tuple, so one call covers both suffixes.
        if not filename.endswith(('_Q.csv', '_R.csv')):
            continue
        csv_path = os.path.join(directory, filename)
        df = pd.read_csv(csv_path, encoding='utf-8')
        # Swap only the trailing extension; str.replace would also rewrite
        # any '.csv' that happens to appear earlier in the file name.
        stem, _ext = os.path.splitext(filename)
        pkl_path = os.path.join(directory, stem + '.pkl')
        save_data_to_file(df, pkl_path)

def save_data_to_file(data, filename):
    """Serialize *data* with pickle and write it to *filename* in binary mode."""
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(data)

# Process every matching CSV file inside the '카테고리별' folder
load_and_save_csv_files('카테고리별')

## 데이터 불러오기

In [3]:
def load_data_from_file(filename):
    """Open *filename* in binary mode and return the object unpickled from it."""
    with open(filename, 'rb') as source:
        return pickle.load(source)

def load_q_pkl_files(directory):
    """Load every file in *directory* whose name ends with ``_Q.pkl``.

    Returns a dict that maps each matching file's joined path
    (``directory`` + file name) to the object loaded from it.
    """
    q_paths = (
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('_Q.pkl')
    )
    return {path: load_data_from_file(path) for path in q_paths}


In [4]:
def load_r_pkl_files(directory):
    """Load every file in *directory* whose name ends with ``_R.pkl``.

    Returns a dict that maps each matching file's joined path
    (``directory`` + file name) to the object loaded from it.
    """
    r_paths = (
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('_R.pkl')
    )
    return {path: load_data_from_file(path) for path in r_paths}


## 불필요한 문자 제거

In [7]:
import re

def remove_text(texts_dict, column_name='답변'):
    """Strip unwanted characters from a text column of every DataFrame.

    For each value in the column, a lowercase letter followed by ``/`` and a
    whitespace character is replaced by a single space, and then every
    character that is not a word character, whitespace, ``?``, ``.`` or ``!``
    is replaced by a space. Missing values are turned into empty strings
    first so the regex substitution never receives a non-string NaN.

    The DataFrames are modified in place and also returned.

    Args:
        texts_dict: Mapping of file path -> DataFrame.
        column_name: Column to clean; defaults to the answer column '답변'.

    Returns:
        Dict with the same keys for every entry that was a DataFrame holding
        ``column_name``; other entries are reported via ``print`` and skipped.
    """
    # Compile once instead of re-resolving the patterns for every row.
    tag_pattern = re.compile(r'[a-z]/\s')
    junk_pattern = re.compile(r'[^\w\s?.!가-힣]')

    processed_texts = {}
    for file_path, df in texts_dict.items():
        if not (isinstance(df, pd.DataFrame) and column_name in df.columns):
            print(f"Error: Data in {file_path} is not in expected format or '{column_name}' column is missing.")
            continue
        cleaned = df[column_name].fillna('')
        df[column_name] = cleaned.apply(
            lambda text: junk_pattern.sub(' ', tag_pattern.sub(' ', text))
        )
        processed_texts[file_path] = df
    return processed_texts

In [8]:
# Load every *_R.pkl object from the '카테고리별' folder and run remove_text on the result.
all_R_data = load_r_pkl_files('카테고리별')
processed_texts_R = remove_text(all_R_data)

In [9]:
def remove_text(texts_dict, column_name='질문'):
    """Strip unwanted characters from a text column of every DataFrame.

    Missing values are replaced by empty strings. Then, for each value, a
    lowercase letter followed by ``/`` and a whitespace character is replaced
    by a single space, and every character that is not a word character,
    whitespace, ``?``, ``.`` or ``!`` is replaced by a space.

    The DataFrames are modified in place and also returned.

    Args:
        texts_dict: Mapping of file path -> DataFrame.
        column_name: Column to clean; defaults to the question column '질문'.

    Returns:
        Dict with the same keys for every entry that was a DataFrame holding
        ``column_name``; other entries are reported via ``print`` and skipped.
    """
    # Compile once instead of re-resolving the patterns for every row.
    tag_pattern = re.compile(r'[a-z]/\s')
    junk_pattern = re.compile(r'[^\w\s?.!가-힣]')

    processed_texts = {}
    for file_path, df in texts_dict.items():
        if not (isinstance(df, pd.DataFrame) and column_name in df.columns):
            # Name the column that was actually checked, not a generic 'text'.
            print(f"Error: Data in {file_path} is not in expected format or '{column_name}' column is missing.")
            continue
        cleaned = df[column_name].fillna('')
        df[column_name] = cleaned.apply(
            lambda text: junk_pattern.sub(' ', tag_pattern.sub(' ', text))
        )
        processed_texts[file_path] = df
    return processed_texts

In [10]:
# Load every *_Q.pkl object from the '카테고리별' folder and run remove_text on the result.
all_Q_data = load_q_pkl_files('카테고리별')
processed_texts_Q = remove_text(all_Q_data)

## 띄어쓰기 교정

In [12]:
from pykospacing import Spacing

def correct_spacing(texts_dict, column_name):
    """Run the PyKoSpacing ``Spacing`` model over one column of each DataFrame.

    String cells in ``column_name`` are passed through a ``Spacing`` instance;
    non-string cells are left as they are. The DataFrames are updated in
    place and collected into the returned dict under their original keys.
    Entries that are not DataFrames, or lack the column, are reported via
    ``print`` and left out of the result.
    """
    spacer = Spacing()

    def fix_cell(value):
        # Only strings can be fed to the spacing model.
        if isinstance(value, str):
            return spacer(value)
        return value

    result = {}
    for path, frame in texts_dict.items():
        usable = isinstance(frame, pd.DataFrame) and column_name in frame.columns
        if not usable:
            print(f"Error: Data in {path} does not have the '{column_name}' column.")
            continue
        frame[column_name] = frame[column_name].apply(fix_cell)
        result[path] = frame
    return result

# Apply spacing correction to the '질문' (question) column of the question data
corrected_texts_Q = correct_spacing(processed_texts_Q, '질문')
# Apply spacing correction to the '답변' (answer) column of the answer data
corrected_texts_R = correct_spacing(processed_texts_R, '답변')






In [50]:
pip install git+https://github.com/haven-jeon/PyKoSpacing.git

Collecting git+https://github.com/haven-jeon/PyKoSpacing.git
  Cloning https://github.com/haven-jeon/PyKoSpacing.git to c:\users\sugye\appdata\local\temp\pip-req-build-jhsl0yp1
  Resolved https://github.com/haven-jeon/PyKoSpacing.git to commit 103ff614a2edf6df87d289ccaf99822b9afa9a42
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting tensorflow==2.15.1 (from pykospacing==0.5)
  Downloading tensorflow-2.15.1-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting argparse>=1.4.0 (from pykospacing==0.5)
  Downloading argparse-1.4.0-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting tensorflow-intel==2.15.1 (from tensorflow==2.15.1->pykospacing==0.5)
  Downloading tensorflow_intel-2.15.1-cp311-cp311-win_amd64.whl.metadata (4.9 kB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow-intel==2.15.1->tensorflow==2.15.1->pykospacing==0.5)
  Downloading tensorboard-2.15.2-py3-none-any.whl.metadata (1.7 kB)
Collecting tensorflow-estimat

  Running command git clone --filter=blob:none --quiet https://github.com/haven-jeon/PyKoSpacing.git 'C:\Users\sugye\AppData\Local\Temp\pip-req-build-jhsl0yp1'


## 맞춤법 교정

In [61]:
import pickle
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Use the first CUDA device when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Load the "j5ng/et5-typos-corrector" model (moved onto `device`) and its tokenizer.
model = T5ForConditionalGeneration.from_pretrained("j5ng/et5-typos-corrector").to(device)
tokenizer = T5Tokenizer.from_pretrained("j5ng/et5-typos-corrector")

ModuleNotFoundError: No module named 'torch'