In [13]:
import os
import dataclasses
import datetime

from pprint import PrettyPrinter

# Shorthand pretty-printer with a 4-space indent. The name `pprint` here is the
# bound method of one PrettyPrinter instance, not the standard `pprint` module.
pprint = PrettyPrinter(indent=4).pprint

@dataclasses.dataclass
class TextMessage :
    """One text-message record parsed from a KISA-delimited dump.

    A record spans one or more raw lines:

    * the first line is
      ``[(KISA:SOL)]<time1>[(KISA)]<time2>[(KISA)]<label>[(KISA)]<content...>``;
    * any lines strictly between the first and the last are appended to the
      content unchanged (newlines included);
    * the last line is ``<title>[(KISA:EOL)]``.

    Both timestamps are parsed with the format ``%Y%m%d%H%M``.

    The hand-written ``__init__`` takes the raw lines. ``dataclass`` does not
    replace an ``__init__`` defined in the class body, so the decorator only
    contributes the field list, ``__repr__`` and ``__eq__``.
    """
    title : str                = None   # text of the last line, before the EOL marker
    label : str                = None   # third field of the first line
    content : str              = None   # fourth field of the first line + middle lines
    time1 : datetime.datetime  = None   # first timestamp field of the first line
    time2 : datetime.datetime  = None   # second timestamp field of the first line

    def __init__(self, input_txt_lines) :
        """Build the message from the raw lines of one record.

        Args:
            input_txt_lines: list of str, first line carrying the SOL marker
                and last line carrying the EOL marker.
        """
        self.parseRawInput(input_txt_lines)

    def parseRawInput(self, input_txt_lines) :
        """Fill every field from the raw lines of one record.

        Args:
            input_txt_lines: list of str, as described in the class docstring.

        Raises:
            IndexError: if ``input_txt_lines`` is empty.
            ValueError: if the first line has fewer than four
                ``[(KISA)]``-separated fields, or a timestamp does not match
                ``%Y%m%d%H%M``.
        """
        # maxsplit=3: only the first three separators delimit fields, so a
        # message body that itself contains "[(KISA)]" stays intact instead of
        # breaking the four-way unpacking.
        time_str1, time_str2, self.label, first_content = input_txt_lines[0].split("[(KISA)]", 3)
        self.time1 = datetime.datetime.strptime(time_str1.removeprefix("[(KISA:SOL)]"), "%Y%m%d%H%M")
        self.time2 = datetime.datetime.strptime(time_str2, "%Y%m%d%H%M")
        # Middle lines belong to the body; join once rather than growing the
        # string line by line.
        self.content = first_content + "".join(input_txt_lines[1:-1])
        # Drop the newline separately from the marker so the last record of a
        # file that lacks a trailing newline still loses its EOL marker.
        self.title = input_txt_lines[-1].removesuffix("\n").removesuffix("[(KISA:EOL)]")


def parseDataFile(file_path, encoding=None) :
    """Parse one data file into a list of ``TextMessage`` objects.

    The first line of the file is skipped. Every remaining line containing
    ``[(KISA:SOL)]`` is taken as the start of a record and every line
    containing ``[(KISA:EOL)]`` as the end of one; the i-th start line is
    paired with the i-th end line and the inclusive slice between them is
    handed to ``TextMessage``.

    Args:
        file_path: path of the file to read.
        encoding: text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, which is what the function used
            before this parameter existed; pass e.g. ``"utf-8"`` to make the
            read independent of the machine's locale.

    Returns:
        list of ``TextMessage``, in file order.

    Raises:
        AssertionError: if the number of start lines and end lines differ.
    """
    with open(file_path, "r", encoding=encoding) as fp :
        data_raw = fp.readlines()[1:]

    # Line indices (relative to data_raw) of the record boundaries.
    SOL_IDX_LIST = [
        line_idx for line_idx, string in enumerate(data_raw)
        if "[(KISA:SOL)]" in string
    ]
    EOL_IDX_LIST = [
        line_idx for line_idx, string in enumerate(data_raw)
        if "[(KISA:EOL)]" in string
    ]
    assert len(SOL_IDX_LIST) == len(EOL_IDX_LIST), """
        number of SOL lines and EOL lines are different.
        check if data is valid
    """
    # +1 because the EOL line itself is part of the record.
    return [
        TextMessage(input_txt_lines = data_raw[
            text_data_start_line_idx:text_data_end_line_idx + 1
        ])
        for text_data_start_line_idx, text_data_end_line_idx
        in zip(SOL_IDX_LIST, EOL_IDX_LIST)
    ]

In [14]:
# --------------------
# Configuration: position, in the sorted list of data files, of the file to parse.
DATA_FILE_IDX = 0
# --------------------

# Data files in the working directory: names that END in ".csv" (a substring
# test would also pick up e.g. "notes_csv.txt" or "x.csv.bak"), skipping
# "._"-prefixed entries (presumably macOS sidecar files — confirm).
DATA_FILE_NAMES = sorted(
    name for name in os.listdir("./")
    if name.endswith(".csv") and not name.startswith("._")
)

file_path = DATA_FILE_NAMES[DATA_FILE_IDX]

print(file_path)

text_message_list = parseDataFile(file_path)

20220911_GBL.csv


In [15]:
# Parse every discovered data file and pool all messages into one flat list.
ll = []
for fn in DATA_FILE_NAMES :
    ll.extend(parseDataFile(fn))

# Show only the first ten records to keep the cell output readable.
pprint(ll[:10])

[   TextMessage(title='SMS/-', label='도박', content='비행기  신규첫 3+2 10+5 20+7 30+10 50+15 100+30  *bit.ly/3AZST3y 쿄드쟈동\n', time1=datetime.datetime(2022, 8, 31, 2, 44), time2=datetime.datetime(2022, 8, 31, 0, 4)),
    TextMessage(title='SMS/-', label='도박', content='[국제발신]비행기  신규첫 3+2 10+5 20+7 30+10 50+15 100+30  *bit.ly/3AZST3y 쿄드쟈동\n', time1=datetime.datetime(2022, 8, 31, 6, 28), time2=datetime.datetime(2022, 8, 31, 0, 9)),
    TextMessage(title='MMS/null', label='성인', content='[Web발신](광고)(주)서울파이낸셜아직도 망설이시나요?아직도 제자리이신가요?악재로 인한 흔들리는 시장 !  서울중앙점은 확실한 분석을 통하여 이겨내고 성공을 안겨드립니다. 성공이 눈앞에 보여지는지관망만 해주셔도 좋습니다. 급하게 진행 안하셔도 좋습니다. ▼ 혜택받기 ▼bit.ly/SEOULMIDDLES금주 현황22일 : 2,270,000원23일 : 2,107,500원24일 : 2,110,000원25일 : 2,335,000원26일 : 2,047,500원27일 : 2,120,500원28일 : 진행 중▼ 혜택받기 ▼bit.ly/SEOULMIDDLES무료거부 0808701121\n\n', time1=datetime.datetime(2022, 8, 31, 8, 24), time2=datetime.datetime(2022, 8, 31, 8, 24)),
    TextMessage(title='MMS/보이스캐디 캐디복지몰', label='불법대출', content="[Web발신]보이스캐디 APL™ 서비스 지원 골프장 250개 

In [27]:
# Label of every parsed message, in the same order as `ll`.
label_list = [message.label for message in ll]

# Count messages per label in a single pass. Calling list.count() once per
# distinct label would rescan the whole list each time, and iterating a set
# would make the key order vary between runs; here keys appear in first-seen order.
label_dict = {}
for label in label_list :
    label_dict[label] = label_dict.get(label, 0) + 1
label_dict

{'비스팸': 527,
 '부동산': 78,
 '게임': 150,
 '성인': 1179,
 '운세': 8,
 '대리운전': 263,
 '통신가입': 292,
 '기타': 16969,
 '금융': 10,
 '유흥업소': 86,
 '주식': 84,
 '계좌임대': 1,
 '의약품': 302,
 '스미싱': 5,
 '불법대출': 1042,
 '도박': 18312,
 '확인불가': 23}