# 1. 라이브러리 설치

In [3]:
!pip install azure-storage-blob python-dotenv



# 2. import

In [4]:
import os
import json
import concurrent.futures
from datetime import datetime, timedelta
from collections import defaultdict
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv

# 3. env 로드

In [5]:
# Read the .env file so its entries become visible through the environment.
load_dotenv()
connection_string = os.environ.get("AZURE_BLOB_CONN_STR")
# Sanity-check the connection string (debugging aid): stop here when it is
# missing or empty.
if connection_string is None or connection_string == "":
    raise ValueError("AZURE_BLOB_CONN_STR가 설정되지 않았습니다. .env 파일 확인하세요.")

# 4. BatchDataProcessor 클래스 정의

In [6]:
class BatchDataProcessor:
    def __init__(self, connection_string, container_name="processed-data"):
        self.connection_string = connection_string
        self.container_name = container_name
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        self.day_mapping = {0:"monday",1:"tuesday",2:"wednesday",3:"thursday",4:"friday",5:"saturday",6:"sunday"}

    def setup_container(self):
        try:
            container_client = self.blob_service_client.get_container_client(self.container_name)
            container_client.create_container()
            print(f"Container '{self.container_name}' created successfully.")
        except Exception as e:
            if "ContainerAlreadyExists" in str(e):
                print(f"Container '{self.container_name}' already exists.")
            else:
                print(f"Error creating container: {e}")

    def parse_filename(self, filename):
        name_without_ext = filename.replace('.json', '')
        parts = name_without_ext.split('_')
        if len(parts) == 3:
            return {'learner_id': parts[0], 'test_id': parts[1], 'item_id': parts[2]}
        return None

    def load_json_file(self, file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            return None

    def map_timestamp_to_current(self, original_timestamp):
        try:
            original_dt = datetime.strptime(original_timestamp, "%Y-%m-%d %H:%M:%S")
            current_date = datetime.now().date()
            mapped_dt = datetime.combine(current_date, original_dt.time())
            original_weekday = original_dt.weekday()
            current_weekday = datetime.now().weekday()
            day_diff = original_weekday - current_weekday
            if day_diff != 0:
                mapped_dt += timedelta(days=day_diff)
            return mapped_dt.strftime("%Y-%m-%d %H:%M:%S")
        except Exception as e:
            print(f"Error mapping timestamp {original_timestamp}: {e}")
            return original_timestamp

    def group_responses_into_batches(self, responses):
        batches = defaultdict(lambda: defaultdict(list))
        for response in responses:
            test_id = response.get('testID')
            timestamp = response.get('Timestamp')
            if not test_id or not timestamp:
                continue
            try:
                dt = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
                hour_key = dt.strftime("%Y-%m-%d_%H")
                batches[test_id][hour_key].append(response)
            except Exception as e:
                print(f"Error processing timestamp {timestamp}: {e}")
        return batches

    def create_batch_data(self, test_id, hour_key, responses):
        if not responses:
            return None
        latest_response = max(responses, key=lambda x: x.get('Timestamp', ''))
        latest_timestamp = latest_response.get('Timestamp')
        if not latest_timestamp:
            return None
        try:
            latest_dt = datetime.strptime(latest_timestamp, "%Y-%m-%d %H:%M:%S")
            mapped_timestamp = self.map_timestamp_to_current(latest_timestamp)
            mapped_dt = datetime.strptime(mapped_timestamp, "%Y-%m-%d %H:%M:%S")
            day_of_week = self.day_mapping[latest_dt.weekday()]
            batch_data = {
                "batchId": f"{test_id}_{latest_dt.strftime('%Y%m%d%H%M%S')}",
                "testId": test_id,
                "latestTimestamp": latest_timestamp,
                "mappedTimestamp": mapped_timestamp,
                "dayOfWeek": day_of_week,
                "hour": latest_dt.hour,
                "itemCount": len(responses),
                "responses": responses
            }
            return batch_data, day_of_week, latest_dt.hour
        except Exception as e:
            print(f"Error creating batch data: {e}")
            return None

    def process_grade_folder(self, grade_path):
        grade_name = os.path.basename(grade_path)
        print(f"Processing {grade_name}...")
        all_responses = []
        for root, dirs, files in os.walk(grade_path):
            if "문항정오답표" in root:
                for filename in files:
                    if filename.endswith('.json'):
                        file_path = os.path.join(root, filename)
                        file_info = self.parse_filename(filename)
                        if file_info:
                            data = self.load_json_file(file_path)
                            if data:
                                all_responses.append(data)
        print(f"{grade_name}: Loaded {len(all_responses)} responses")
        batches = self.group_responses_into_batches(all_responses)
        uploaded_count = 0
        for test_id, hour_groups in batches.items():
            for hour_key, responses in hour_groups.items():
                if len(responses) >= 6:
                    batch_info = self.create_batch_data(test_id, hour_key, responses[:6])
                    if batch_info:
                        batch_data, day_of_week, hour = batch_info
                        blob_name = f"{grade_name}/batches/{day_of_week}/hour-{hour:02d}/{batch_data['batchId']}.json"
                        success = self.upload_to_blob(blob_name, batch_data)
                        if success:
                            uploaded_count += 1
        print(f"{grade_name}: Uploaded {uploaded_count} batches")
        return uploaded_count

    def upload_to_blob(self, blob_name, data):
        try:
            blob_client = self.blob_service_client.get_blob_client(container=self.container_name, blob=blob_name)
            json_data = json.dumps(data, ensure_ascii=False, indent=2)
            blob_client.upload_blob(json_data, overwrite=True)
            print(f"Uploaded: {blob_name}")
            return True
        except Exception as e:
            print(f"Error uploading {blob_name}: {e}")
            return False

    def process_all_grades_parallel(self, data_root_path):
        grade_folders = [os.path.join(data_root_path, item) for item in os.listdir(data_root_path)
                         if os.path.isdir(os.path.join(data_root_path, item)) and "학년" in item]
        print(f"Found grade folders: {[os.path.basename(f) for f in grade_folders]}")
        self.setup_container()
        total_uploaded = 0
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            future_to_grade = {executor.submit(self.process_grade_folder, grade_path): grade_path
                               for grade_path in grade_folders}
            for future in concurrent.futures.as_completed(future_to_grade):
                grade_path = future_to_grade[future]
                try:
                    uploaded_count = future.result()
                    total_uploaded += uploaded_count
                    print(f"Completed: {os.path.basename(grade_path)} - {uploaded_count} batches")
                except Exception as e:
                    print(f"Error processing {grade_path}: {e}")
        print(f"Total uploaded batches: {total_uploaded}")
        return total_uploaded

# 5. 실행

In [7]:
# Run the whole pipeline: build a processor from the connection string loaded
# earlier and process every grade folder found under the data root.
data_root_path = "./data"  # data root path
processor = BatchDataProcessor(connection_string)
# Left as a bare expression so the notebook shows the returned batch total.
processor.process_all_grades_parallel(data_root_path)

Found grade folders: ['7학년', '8학년', '9학년']
Container 'processed-data' created successfully.
Processing 7학년...
Processing 8학년...
Processing 9학년...
9학년: Loaded 20282 responses
Uploaded: 9학년/batches/wednesday/hour-00/A090000001_20200304004546.json
Uploaded: 9학년/batches/tuesday/hour-04/A090000001_20200303042701.json
Uploaded: 9학년/batches/wednesday/hour-03/A090000001_20200304035252.json
Uploaded: 9학년/batches/friday/hour-07/A090000001_20200522073949.json
Uploaded: 9학년/batches/thursday/hour-02/A090000001_20200305021818.json
Uploaded: 9학년/batches/thursday/hour-03/A090000001_20200416033835.json
Uploaded: 9학년/batches/thursday/hour-08/A090000001_20201217082014.json
Uploaded: 9학년/batches/wednesday/hour-01/A090000001_20200304014623.json
Uploaded: 9학년/batches/saturday/hour-07/A090000001_20200229073549.json
Uploaded: 9학년/batches/wednesday/hour-20/A090000001_20200205202337.json
Uploaded: 9학년/batches/saturday/hour-17/A090000001_20200229172436.json
Uploaded: 9학년/batches/friday/hour-08/A090000001_2020060

12118