### 지식인 파일 분리 및 저장

In [19]:
import os
import pandas as pd

def read_csv_files(directory):
    data_frames = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                file_path = os.path.join(root, file)
                df = pd.read_csv(file_path, encoding='utf-8')
                data_frames.append((df, file))
    return data_frames

def split_and_save_csv(df, category, save_directory):
    
    # 필요한 열이 존재하는지 확인
    if all(column in df.columns for column in ['카테고리', '키워드', '질문', '답변']):
        # 질문과 답변 분리
        df_q = df[['카테고리', '키워드', '질문']]
        df_r = df[['카테고리', '키워드', '답변']]
        
        # 저장할 파일 경로 생성
        q_file_path = os.path.join(save_directory, f"{category}_Q.csv")
        r_file_path = os.path.join(save_directory, f"{category}_R.csv")
        
        # 분리된 DataFrame 저장
        df_q.to_csv(q_file_path, index=False, encoding='utf-8')
        df_r.to_csv(r_file_path, index=False, encoding='utf-8')
        
        print(f"Saved {q_file_path} and {r_file_path}")
    else:
        print(f"Skipping {category}: Required columns are not present")

def process_all_csv_files(directory):
    data_frames = read_csv_files(directory)
    for df, file in data_frames:
        category = os.path.splitext(file)[0]
        split_and_save_csv(df, category, directory)

directory = '카테고리별'  # 카테고리별 파일들이 저장된 디렉토리
process_all_csv_files(directory)

Saved 카테고리별\TV_Q.csv and 카테고리별\TV_R.csv
Skipping TV_Q: Required columns are not present
Skipping TV_R: Required columns are not present
Saved 카테고리별\날씨_Q.csv and 카테고리별\날씨_R.csv
Skipping 날씨_Q: Required columns are not present
Skipping 날씨_R: Required columns are not present
Saved 카테고리별\쇼핑_Q.csv and 카테고리별\쇼핑_R.csv
Skipping 쇼핑_Q: Required columns are not present
Skipping 쇼핑_R: Required columns are not present
Saved 카테고리별\안부 일상 대화_Q.csv and 카테고리별\안부 일상 대화_R.csv
Skipping 안부 일상 대화_Q: Required columns are not present
Skipping 안부 일상 대화_R: Required columns are not present
Saved 카테고리별\정치 경제_Q.csv and 카테고리별\정치 경제_R.csv
Skipping 정치 경제_Q: Required columns are not present
Skipping 정치 경제_R: Required columns are not present
Saved 카테고리별\취미_Q.csv and 카테고리별\취미_R.csv
Skipping 취미_Q: Required columns are not present
Skipping 취미_R: Required columns are not present


### '답변' 칼럼 nan 값 삭제

In [20]:
def remove_nan_from_r_files(directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('_R.csv'):
                file_path = os.path.join(root, file)
                
                # CSV 파일 읽기
                df = pd.read_csv(file_path, encoding='utf-8')
                
                # '답변' 열에 NaN 값이 있는 행 제거
                df_cleaned = df.dropna(subset=['답변'])
                
                # 원래 파일에 덮어쓰기
                df_cleaned.to_csv(file_path, index=False, encoding='utf-8')
                
                print(f"Processed {file_path}")

directory = '카테고리별'  # _R.csv 파일들이 저장된 디렉토리
remove_nan_from_r_files(directory)


Processed 카테고리별\TV_R.csv
Processed 카테고리별\날씨_R.csv
Processed 카테고리별\쇼핑_R.csv
Processed 카테고리별\안부 일상 대화_R.csv
Processed 카테고리별\정치 경제_R.csv
Processed 카테고리별\취미_R.csv


In [22]:
null_answers = data[data['답변'].isnull()]
null_answers

Unnamed: 0,카테고리,키워드,답변


In [23]:
data.shape

(15673, 3)