### 1. data load

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
#  data load
df = pd.read_csv("./data/2019_3분기보고서_01_재무상태표_연결_20200617.txt", sep = "\t", encoding = "cp949")

### 2. 연속형변수 변경 dtype 변경

In [2]:
def str_to_float(data):
    import numpy as np
    
    data["당기 3분기말"] = data["당기 3분기말"].str.replace(",", "")
    data["전기말"] = data["전기말"].str.replace(",", "")
#     data["전전기말"] = data["전전기말"].str.replace(",", "")
    data["당기 3분기말"] = data["당기 3분기말"].astype(np.float32)
    return data

df = str_to_float(df)

### 3. 불필요한 변수 제거

In [3]:
# 필요 없는 변수 제거
def delete_col(data):
#     data.drop("통화", axis = 1, inplace = True)
#     data.drop("Unnamed: 15", axis = 1, inplace = True)
    data.drop("전기말", axis = 1, inplace = True)
    data.drop("전전기말", axis = 1, inplace = True)
    data.drop("재무제표종류", axis = 1, inplace = True)
    return data

df = delete_col(df)

### 4. 항목명 변수 처리

In [4]:
def duplication_check(data):
    # 항목코드 중복 확인
    entity_코드 = []
    entity_명 = []
    without_entity_코드 = []
    without_entity_명 = []

    # 각 리스트에 코드 및 코드명 append(소문자 처리)
    for idx in range(len(data)):
        if "entity" in data["항목코드"][idx]:
            entity_코드.append(data["항목코드"][idx].lower())
            entity_명.append(data["항목명"][idx].lower())
        else:
            without_entity_코드.append(data["항목코드"][idx].lower())
            without_entity_명.append(data["항목명"][idx].lower())

    # !entity 코드 딕셔너리
    dic_without_entity = {re.findall("_\w{1,}", without_entity_코드[idx])[0].lower():[] for idx in range(len(without_entity_코드))}

    for i, j in zip(without_entity_코드, without_entity_명):
        dic_without_entity[re.findall("_\w{1,}", i)[0].lower()].append(j)

    check_without_entity = {i:len(set(j)) for i, j in dic_without_entity.items()}
    
    # entity 코드 딕셔너리
    dic_entity = {entity_코드[idx]:[] for idx in range(len(entity_코드))}
    
    for i, j in zip(entity_코드, entity_명):
        dic_entity[i].append(j)
        
    check_entity = {i:len(set(j)) for i, j in dic_entity.items()}
    
    return entity_코드, entity_명, without_entity_코드, without_entity_명, dic_without_entity, dic_entity

entity_코드, entity_명, without_entity_코드, without_entity_명, dic_without_entity, dic_entity = duplication_check(df)

### 5. 공시된 항목명, 항목코드 가져오기(금융감독원 재무제표 양식)

In [5]:
def extract_element_id(data):
    element_idx_lst_entity = []
    element_idx_lst_without_entity = []
    
    for idx in range(len(data)):
        if "entity" in data["항목코드"][idx]:
            element_idx_lst_entity.append(data["항목코드"][idx])
        else:
            element_idx_lst_without_entity.append(data["항목코드"][idx])
            
    entity = sorted(set(element_idx_lst_entity), reverse = True)
    without_entity = sorted(set(element_idx_lst_without_entity), reverse = True)
    
    return entity, without_entity

a, b = extract_element_id(df)

elementid = pd.read_excel("./data/재무제표양식.xlsx", encoding = "utf-8", sheet_name = "BS1")
# '한글 Label', 'Element ID'
df_element = elementid[['한글 Label', 'Element ID']]
df_element = df_element[df_element["Element ID"].notnull()]

df_element.columns = df_element.columns.str.replace(" ", "_")
lst_element = df_element["Element_ID"].unique()

re_lst_element = []
for i in lst_element:
    a = re.findall("_\w{1,}", i)[0].lower()
    re_lst_element.append(a)
 
dic_element = {i:[] for i in re_lst_element}
for i, j in zip(re_lst_element, df_element["한글_Label"]):
    dic_element[i].append(j)

### 5-1. 항목명 entity 값 처리

In [6]:
# 항목명 전치리

# 1) _가진 항목명 추출
lst_ = []
for i in range(len(df)):
    if "_" in df.loc[i, "항목명"]:
        lst_.append(i)
        
def preprocessing(x):
    a = x.replace(" ", "")
    a = a.replace("_", "")
    a = a.replace("[", "")
    a = a.replace("]", "")
    a = a.replace("(", "")
    a = a.replace(")", "")
    a = a.replace(".", "")
    a = re.sub("[ⅠⅡⅢⅣⅤIII]", "", a)
    a = re.sub("[1-9]", "", a)
    
    
    return a

df["항목명"] = df["항목명"].agg(preprocessing)

lst_idx = []
for i in range(len(df)):
    if "총계" in df["항목명"][i] and "entity" in df["항목코드"][i]:
        lst_idx.append(i)
        
for i in lst_idx:
    if df["항목명"][i] == '부채총계':
        df["항목코드"][i] = "ifrs-full_Liabilities"
        
    elif df["항목명"][i] == '자본과부채의총계':
        df["항목코드"][i] = "ifrs-full_EquityAndLiabilities"
        
    elif df["항목명"][i] == '자산총계':
        df["항목코드"][i] = "ifrs-full_Assets"
        
    else:
        df["항목코드"][i] = "ifrs-full_Equity"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### 6. entity 제외 항목명 변수 전처리

In [7]:
lst_idx = []
for idx in range(len(df)):
    try:
        if "entity" not in df["항목코드"][idx]:
            df["항목명"][idx] = dic_element[re.findall("_\w{1,}", df["항목코드"][idx])[0].lower()][0]
    except Exception as e:
        lst_idx.append(idx)
        
        
elementid = pd.read_excel("./data/재무제표양식.xlsx", encoding = "utf-8", sheet_name = "BS1")
# '한글 Label', 'Element ID'
df_element = elementid[['한글 Label', 'Element ID']]
df_element = df_element[df_element["Element ID"].notnull()]

df_element.columns = df_element.columns.str.replace(" ", "_")
lst_element = df_element["Element_ID"].unique()

re_lst_element = []
for i in lst_element:
    a = re.findall("_\w{1,}", i)[0].lower()
    re_lst_element.append(a)
 
dic_elementid = {(((i.replace("_", "")).replace(" ", "")).replace("[", "")).replace("]", ""):j 
                 for i, j in zip(df_element["한글_Label"], df_element["Element_ID"])}

idx_entity = []
for idx in range(len(df)):
    if "entity" in df.loc[idx, "항목코드"]:
        idx_entity.append(idx)

# entity 포함 항목코드 항목코드 전처리
no_preprocessing_entity_idx = []
for idx in idx_entity:
    try:
        df.loc[idx, "항목코드"] = dic_elementid[df.loc[idx, "항목명"]]

    except Exception as e:
        no_preprocessing_entity_idx.append(idx)
        
# 한번더 전처리
for idx in range(len(df)):
    if "entity" not in df["항목코드"][idx]:
        df["항목명"][idx] = dic_element[re.findall("_\w{1,}", df["항목코드"][idx])[0].lower()][0]
        

# 항목명이 같은 entity 변수 전처리
lst = []
for idx in range(len(df)):
    if "entity" in df.loc[idx, "항목코드"]:
        lst.append(idx)
        
no_preprocessing = []
for idx in lst:
    try:
        df.loc[idx, "항목코드"] = dic_elementid[df.loc[idx, "항목명"]]
    except Exception as e:
        no_preprocessing.append(idx)
        
        
# 새 기준 양식 불러오기
new_criterion = pd.read_excel("./data/개선코드표.xlsx", encoding = "utf-8", sheet_name = "2")
new_criterion

dic_re_code = {re.findall("_\w{1,}",i)[0]:j for i,j in zip(new_criterion["항목코드"], new_criterion["개선항목코드"])}
dic_re_name = {re.findall("_\w{1,}",i)[0]:j for i,j in zip(new_criterion["항목코드"], new_criterion["개선항목명"])}

# 새기준 양식으로 항목명 
for idx in range(len(df)):
    if "entity" not in df.loc[idx, "항목코드"]:
        df.loc[idx, "항목명"] = dic_re_name[re.findall("_\w{1,}", df.loc[idx, "항목코드"])[0]]
        df.loc[idx, "항목코드"] = dic_re_code[re.findall("_\w{1,}", df.loc[idx, "항목코드"])[0]]
        
# entity 항목코드 전처리
entity_re_name = {i:j for i,j in zip(new_criterion["개선항목코드"], new_criterion["개선항목명"])}

new_entity = pd.read_excel("./data/19.1~3Q.entity 추출.xlsx", encoding = "utf-8", sheet_name="19.3Q.BS.entity.추출대상")

pre_entity = {i:j for i, j in zip(new_entity["항목코드"], new_entity["적정코드"])}

can_idx = []
er =[]
for idx in no_preprocessing:
    try:
        if pre_entity[df.loc[idx, "항목코드"]]:
            can_idx.append(idx)
    except Exception as e:
        er.append(idx)
        
for idx in can_idx:
    
    df.loc[idx, "항목코드"] = pre_entity[df.loc[idx, "항목코드"]]        
    df.loc[idx,"항목명"] = entity_re_name[df.loc[idx, "항목코드"]]
        
        
# 필요 없는 변수 제거
index = df[df["항목명"] == "지우기"].index
df.drop(index, inplace = True)
df.reset_index(drop = True, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### 7. 정제된 데이터 프레임 만들기

In [8]:
# DB 테이블에 적용할 D.F
# 회사이름 추출
lst_company = df["회사명"].unique()
element_value = list(new_criterion[new_criterion["개선항목명"] != "지우기"]["개선항목명"].unique())
    
# columns
col = list(df.columns[0:9]) + element_value
check_col = list(dic_element.keys())

#  해당년도 데이터 행길이
company_dic = {i:[] for i in lst_company}
day = []

for i in lst_company:
    company_dic[i].append(df[df["회사명"] == i]["결산기준일"].unique())
    day.append(len(df[df["회사명"] == i]["결산기준일"].unique()))
    
row = sum(day)

bon = np.zeros([row,len(col)]) + np.NAN

# D.F
last_df = pd.DataFrame(bon, columns = col)

# 합쳐야 할 변수 
sum_lst = list(new_criterion["개선항목명"].value_counts().head(10)[1:9].index)
last_df[sum_lst] = 0

### 8. 데이터 입력

In [9]:
dic_standard = {i:list(df[df["회사명"] == i]["결산기준일"].unique()) for i in lst_company}
last_company = []
last_standard  = []
for key, value in dic_standard.items():
    for index in range(len(value)):
        last_company.append(key)
        last_standard.append(value[index])
            
    
cnt = 0

    
for idx in range(len(last_company)):

    a = df[(df["회사명"] == last_company[idx]) & (df["결산기준일"] == last_standard[idx])]
    a.reset_index(drop = True, inplace = True)
    for i in range(9):
        last_df.iloc[cnt, i] = a.iloc[0, i]

    for idx3 in range(len(a)):
        if "entity" not in a["항목코드"][idx3]:
            loc = a["항목명"][idx3]
            if loc in sum_lst:
                if np.isnan(a["당기 3분기말"][idx3]) == False:
                    last_df.loc[cnt, loc] += a["당기 3분기말"][idx3]
                else:
                    if last_df.loc[cnt, loc] == 0:
                        last_df.loc[cnt, loc] = a["당기 3분기말"][idx3]
            else:
                last_df.loc[cnt, loc] = a["당기 3분기말"][idx3]
    cnt += 1    

In [10]:
df2019 = pd.read_csv("./data/분기_반기.csv", encoding = "euc-kr")

In [11]:
col2019 = df2019.columns
col = last_df.columns


last_df.columns = col2019

In [12]:
real_df = pd.concat([df2019, last_df]).reset_index(drop = True)

In [13]:
real_df.to_csv("./data/분기_반기.csv", encoding = "euc=kr", index = False)