### 1.data load

In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

#  data load
df2019 = pd.read_csv("../data/2019_사업보고서_01_재무상태표_연결_20200623.txt", sep = "\t", encoding = "cp949")

### 2. 불필요한 변수 제거

In [142]:
# 필요 없는 변수 제거
def delete_col(data):
#     data.drop("통화", axis = 1, inplace = True)
    data.drop("Unnamed: 15", axis = 1, inplace = True)
    data.drop("전기", axis = 1, inplace = True)
    data.drop("전전기", axis = 1, inplace = True)
    data.drop("재무제표종류", axis = 1, inplace = True)
    return data

df2019 = delete_col(df2019)

### 3. 연속형변수(당기, 전기, 전전기)  dtype 변경

In [143]:
def str_to_float(data):
    import numpy as np
    
    data["당기"] = data["당기"].str.replace(",", "")
#     data["전기"] = data["전기"].str.replace(",", "")
#     data["전전기"] = data["전전기"].str.replace(",", "")
    data["당기"] = data["당기"].astype(np.float32)
    return data

df2019 = str_to_float(df2019)

### 4. 항목명 변수 처리

In [144]:
def duplication_check(data):
    # 항목코드 중복 확인
    entity_코드 = []
    entity_명 = []
    without_entity_코드 = []
    without_entity_명 = []

    # 각 리스트에 코드 및 코드명 append(소문자 처리)
    for idx in range(len(data)):
        if "entity" in data["항목코드"][idx]:
            entity_코드.append(data["항목코드"][idx].lower())
            entity_명.append(data["항목명"][idx].lower())
        else:
            without_entity_코드.append(data["항목코드"][idx].lower())
            without_entity_명.append(data["항목명"][idx].lower())

    # !entity 코드 딕셔너리
    dic_without_entity = {re.findall("_\w{1,}", without_entity_코드[idx])[0].lower():[] for idx in range(len(without_entity_코드))}

    for i, j in zip(without_entity_코드, without_entity_명):
        dic_without_entity[re.findall("_\w{1,}", i)[0].lower()].append(j)

    check_without_entity = {i:len(set(j)) for i, j in dic_without_entity.items()}
    
    # entity 코드 딕셔너리
    dic_entity = {entity_코드[idx]:[] for idx in range(len(entity_코드))}
    
    for i, j in zip(entity_코드, entity_명):
        dic_entity[i].append(j)
        
    check_entity = {i:len(set(j)) for i, j in dic_entity.items()}
    
    return entity_코드, entity_명, without_entity_코드, without_entity_명, dic_without_entity, dic_entity

entity_코드, entity_명, without_entity_코드, without_entity_명, dic_without_entity, dic_entity = duplication_check(df2019)

### 5. 공시된 항목명, 항목코드 가져오기(금융감독원 재무제표 양식)

In [145]:
def extract_element_id(data):
    element_idx_lst_entity = []
    element_idx_lst_without_entity = []
    
    for idx in range(len(data)):
        if "entity" in data["항목코드"][idx]:
            element_idx_lst_entity.append(data["항목코드"][idx])
        else:
            element_idx_lst_without_entity.append(data["항목코드"][idx])
            
    entity = sorted(set(element_idx_lst_entity), reverse = True)
    without_entity = sorted(set(element_idx_lst_without_entity), reverse = True)
    
    return entity, without_entity

a, b = extract_element_id(df2019)
pd.Series(b).to_csv("../2019항목명.csv", encoding = "euc-kr", index = False)

elementid = pd.read_excel("../data/재무제표양식.xlsx", encoding = "utf-8", sheet_name = "BS1")
# '한글 Label', 'Element ID'
df_element = elementid[['한글 Label', 'Element ID']]
df_element = df_element[df_element["Element ID"].notnull()]

df_element.columns = df_element.columns.str.replace(" ", "_")
lst_element = df_element["Element_ID"].unique()

re_lst_element = []
for i in lst_element:
    a = re.findall("_\w{1,}", i)[0].lower()
    re_lst_element.append(a)
 
dic_element = {i:[] for i in re_lst_element}
for i, j in zip(re_lst_element, df_element["한글_Label"]):
    dic_element[i].append(j)

### 5-1. 항목명 entity 값 처리(ver_1)

In [146]:
# 항목명 전치리

# 1) _가진 항목명 추출
lst_ = []
for i in range(len(df2019)):
    if "_" in df2019.loc[i, "항목명"]:
        lst_.append(i)
        
def preprocessing(x):
    a = x.replace(" ", "")
    a = a.replace("_", "")
    a = a.replace("[", "")
    a = a.replace("]", "")
    a = a.replace("(", "")
    a = a.replace(")", "")
    a = a.replace(".", "")
    a = re.sub("[ⅠⅡⅢⅣⅤIII]", "", a)
    a = re.sub("[1-9]", "", a)
    
    
    return a

df2019["항목명"] = df2019["항목명"].agg(preprocessing)

lst_idx = []
for i in range(len(df2019)):
    if "총계" in df2019["항목명"][i] and "entity" in df2019["항목코드"][i]:
        lst_idx.append(i)
        
for i in lst_idx:
    if df2019["항목명"][i] == '부채총계':
        df2019["항목코드"][i] = "ifrs-full_Liabilities"
        
    elif df2019["항목명"][i] == '자본과부채의총계':
        df2019["항목코드"][i] = "ifrs-full_EquityAndLiabilities"
        
    elif df2019["항목명"][i] == '자산총계':
        df2019["항목코드"][i] = "ifrs-full_Assets"
        
    else:
        df2019["항목코드"][i] = "ifrs-full_Equity"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### 6. entity 제외 항목명 변수 전처리

In [147]:
for idx in range(len(df2019)):
    if "entity" not in df2019["항목코드"][idx]:
        df2019["항목명"][idx] = dic_element[re.findall("_\w{1,}", df2019["항목코드"][idx])[0].lower()][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### 6-1. entity 변수 전처리

In [148]:
elementid = pd.read_excel("../data/재무제표양식.xlsx", encoding = "utf-8", sheet_name = "BS1")
# '한글 Label', 'Element ID'
df_element = elementid[['한글 Label', 'Element ID']]
df_element = df_element[df_element["Element ID"].notnull()]

df_element.columns = df_element.columns.str.replace(" ", "_")
lst_element = df_element["Element_ID"].unique()

re_lst_element = []
for i in lst_element:
    a = re.findall("_\w{1,}", i)[0].lower()
    re_lst_element.append(a)
 
dic_elementid = {(((i.replace("_", "")).replace(" ", "")).replace("[", "")).replace("]", ""):j 
                 for i, j in zip(df_element["한글_Label"], df_element["Element_ID"])}

idx_entity = []
for idx in range(len(df2019)):
    if "entity" in df2019.loc[idx, "항목코드"]:
        idx_entity.append(idx)

# entity 포함 항목코드 항목코드 전처리
no_preprocessing_entity_idx = []
for idx in idx_entity:
    try:
        df2019.loc[idx, "항목코드"] = dic_elementid[df2019.loc[idx, "항목명"]]

    except Exception as e:
        no_preprocessing_entity_idx.append(idx)
        
# 한번더 전처리
for idx in range(len(df2019)):
    if "entity" not in df2019["항목코드"][idx]:
        df2019["항목명"][idx] = dic_element[re.findall("_\w{1,}", df2019["항목코드"][idx])[0].lower()][0]
        

# 항목명이 같은 entity 변수 전처리
lst = []
for idx in range(len(df2019)):
    if "entity" in df2019.loc[idx, "항목코드"]:
        lst.append(idx)
        
no_preprocessing = []
for idx in lst:
    try:
        df2019.loc[idx, "항목코드"] = dic_elementid[df2019.loc[idx, "항목명"]]
    except Exception as e:
        no_preprocessing.append(idx)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### 6-2. 새로운 기준 입히기

In [149]:
# 새 기준 양식 불러오기
new_criterion = pd.read_excel("../data/개선코드표.xlsx", encoding = "utf-8", sheet_name = "2")
new_criterion

dic_re_code = {re.findall("_\w{1,}",i)[0]:j for i,j in zip(new_criterion["항목코드"], new_criterion["개선항목코드"])}
dic_re_name = {re.findall("_\w{1,}",i)[0]:j for i,j in zip(new_criterion["항목코드"], new_criterion["개선항목명"])}

# 새기준 양식으로 항목명 
for idx in range(len(df2019)):
    if "entity" not in df2019.loc[idx, "항목코드"]:
        df2019.loc[idx, "항목명"] = dic_re_name[re.findall("_\w{1,}", df2019.loc[idx, "항목코드"])[0]]
        df2019.loc[idx, "항목코드"] = dic_re_code[re.findall("_\w{1,}", df2019.loc[idx, "항목코드"])[0]]
        
# 필요 없는 변수 제거
index = df2019[df2019["항목명"] == "지우기"].index
df2019.drop(index, inplace = True)
df2019.reset_index(drop = True, inplace = True)

### 7. 정제된 데이터 프레임 만들기 

In [150]:
# DB 테이블에 적용할 D.F
# 회사이름 추출
lst_company = df2019["회사명"].unique()
element_value = list(new_criterion[new_criterion["개선항목명"] != "지우기"]["개선항목명"].unique())

# columns
col = list(df2019.columns[0:9]) + element_value
check_col = list(dic_element.keys())

#  해당년도 데이터 행길이
company_dic = {i:[] for i in lst_company}
day = []

for i in lst_company:
    company_dic[i].append(df2019[df2019["회사명"] == i]["결산기준일"].unique())
    day.append(len(df2019[df2019["회사명"] == i]["결산기준일"].unique()))
    
row = sum(day)

bon = np.zeros([row,len(col)]) + np.NAN

# D.F
last_df = pd.DataFrame(bon, columns = col)

# 합쳐야 할 변수 
sum_lst = list(new_criterion["개선항목명"].value_counts().head(10)[1:9].index)

### 8. 데이터 입력

In [151]:
dic_standard = {i:list(df2019[df2019["회사명"] == i]["결산기준일"].unique()) for i in lst_company}
last_company = []
last_standard  = []
for key, value in dic_standard.items():
    for index in range(len(value)):
        last_company.append(key)
        last_standard.append(value[index])
        
cnt = 0

for idx in range(len(last_company)):

    a = df2019[(df2019["회사명"] == last_company[idx]) & (df2019["결산기준일"] == last_standard[idx])]
    a.reset_index(drop = True, inplace = True)
    for i in range(9):
        last_df.iloc[cnt, i] = a.iloc[0, i]

    for idx3 in range(len(a)):
        if "entity" not in a["항목코드"][idx3]:
            loc = a["항목명"][idx3]
            if loc in sum_lst:
                last_df.loc[cnt, loc] += a["당기"][idx3]
            else:
                last_df.loc[cnt, loc] = a["당기"][idx3]
    cnt += 1

### 9. 2018데이터 입력

In [160]:
# 2018년 변수 처리
df = pd.read_csv("../data/2019_사업보고서_01_재무상태표_연결_20200623.txt", sep = "\t", encoding = "cp949")

def str_to_float(data):
    import numpy as np
    
    data["당기"] = data["당기"].str.replace(",", "")
    data["전기"] = data["전기"].str.replace(",", "")
    data["전전기"] = data["전전기"].str.replace(",", "")
    data["당기"] = data["당기"].astype(np.float32)
    data["전기"] = data["전기"].astype(np.float32)
    data["전전기"] = data["전전기"].astype(np.float32)
    return data

df = str_to_float(df)

# 필요 없는 변수 제거
def delete_col(data):
#     data.drop("통화", axis = 1, inplace = True)
    data.drop("Unnamed: 15", axis = 1, inplace = True)
    data.drop("당기", axis = 1, inplace = True)
    data.drop("전전기", axis = 1, inplace = True)
    data.drop("재무제표종류", axis = 1, inplace = True)
    return data

df = delete_col(df)

# 항목명 변수 전처리
# 항목명 전치리

# 1) _가진 항목명 추출
lst_ = []
for i in range(len(df)):
    if "_" in df.loc[i, "항목명"]:
        lst_.append(i)
        
def preprocessing(x):
    a = x.replace(" ", "")
    a = a.replace("_", "")
    a = a.replace("[", "")
    a = a.replace("]", "")
    a = a.replace("(", "")
    a = a.replace(")", "")
    a = a.replace(".", "")
    a = re.sub("[ⅠⅡⅢⅣⅤIII]", "", a)
    a = re.sub("[1-9]", "", a)
    
    
    return a

df["항목명"] = df["항목명"].agg(preprocessing)

lst_idx = []
for i in range(len(df)):
    if "총계" in df["항목명"][i] and "entity" in df["항목코드"][i]:
        lst_idx.append(i)
        
for i in lst_idx:
    if df["항목명"][i] == '부채총계':
        df["항목코드"][i] = "ifrs-full_Liabilities"
        
    elif df["항목명"][i] == '자본과부채의총계':
        df["항목코드"][i] = "ifrs-full_EquityAndLiabilities"
        
    elif df["항목명"][i] == '자산총계':
        df["항목코드"][i] = "ifrs-full_Assets"
        
    else:
        df["항목코드"][i] = "ifrs-full_Equity"
        
# entity 제외 항목명 변수처리
for idx in range(len(df)):
    if "entity" not in df["항목코드"][idx]:
        df["항목명"][idx] = dic_element[re.findall("_\w{1,}", df["항목코드"][idx])[0].lower()][0]

# entity 변수 전처리
elementid = pd.read_excel("../data/재무제표양식.xlsx", encoding = "utf-8", sheet_name = "BS1")
# '한글 Label', 'Element ID'
df_element = elementid[['한글 Label', 'Element ID']]
df_element = df_element[df_element["Element ID"].notnull()]

df_element.columns = df_element.columns.str.replace(" ", "_")
lst_element = df_element["Element_ID"].unique()

re_lst_element = []
for i in lst_element:
    a = re.findall("_\w{1,}", i)[0].lower()
    re_lst_element.append(a)
 
dic_elementid = {(((i.replace("_", "")).replace(" ", "")).replace("[", "")).replace("]", ""):j 
                 for i, j in zip(df_element["한글_Label"], df_element["Element_ID"])}

idx_entity = []
for idx in range(len(df)):
    if "entity" in df.loc[idx, "항목코드"]:
        idx_entity.append(idx)

# entity 포함 항목코드 항목코드 전처리
no_preprocessing_entity_idx = []
for idx in idx_entity:
    try:
        df.loc[idx, "항목코드"] = dic_elementid[df.loc[idx, "항목명"]]

    except Exception as e:
        no_preprocessing_entity_idx.append(idx)
        
# 한번더 전처리
for idx in range(len(df)):
    if "entity" not in df["항목코드"][idx]:
        df["항목명"][idx] = dic_element[re.findall("_\w{1,}", df["항목코드"][idx])[0].lower()][0]

        

# 항목명이 같은 entity 변수 전처리
lst = []
for idx in range(len(df)):
    if "entity" in df.loc[idx, "항목코드"]:
        lst.append(idx)
        
no_preprocessing = []
for idx in lst:
    try:
        df.loc[idx, "항목코드"] = dic_elementid[df.loc[idx, "항목명"]]
    except Exception as e:
        no_preprocessing.append(idx)
        

# 새기준 양식으로 항목명 
for idx in range(len(df)):
    if "entity" not in df.loc[idx, "항목코드"]:
        df.loc[idx, "항목명"] = dic_re_name[re.findall("_\w{1,}", df.loc[idx, "항목코드"])[0]]
        df.loc[idx, "항목코드"] = dic_re_code[re.findall("_\w{1,}", df.loc[idx, "항목코드"])[0]]
        
# 필요 없는 변수 제거
index = df[df["항목명"] == "지우기"].index
df.drop(index, inplace = True)
df.reset_index(drop = True, inplace = True)


# DB 테이블에 적용할 D.F
# 회사이름 추출
lst_company = df["회사명"].unique()
element_value = list(new_criterion[new_criterion["개선항목명"] != "지우기"]["개선항목명"].unique())

# columns
col = list(df.columns[0:9]) + element_value
check_col = list(dic_element.keys())

#  해당년도 데이터 행길이
company_dic = {i:[] for i in lst_company}
day = []

for i in lst_company:
    company_dic[i].append(df[df["회사명"] == i]["결산기준일"].unique())
    day.append(len(df[df["회사명"] == i]["결산기준일"].unique()))
    
row = sum(day)

bon = np.zeros([row,len(col)]) + np.NAN

# D.F
add_df = pd.DataFrame(bon, columns = col)

# 합쳐야 할 변수 
sum_lst = list(new_criterion["개선항목명"].value_counts().head(10)[1:9].index)


# 정제된 데이터프레임 생성
dic_standard = {i:list(df[df["회사명"] == i]["결산기준일"].unique()) for i in lst_company}
last_company = []
last_standard  = []
for key, value in dic_standard.items():
    for index in range(len(value)):
        last_company.append(key)
        last_standard.append(value[index])
        

cnt = 0
for idx in range(len(last_company)):

    a = df[(df["회사명"] == last_company[idx]) & (df["결산기준일"] == last_standard[idx])]
    a.reset_index(drop = True, inplace = True)
    for i in range(9):
        add_df.iloc[cnt, i] = a.iloc[0, i]

    for idx3 in range(len(a)):
        if "entity" not in a["항목코드"][idx3]:
            loc = a["항목명"][idx3]
            if loc in sum_lst:
                add_df.loc[cnt, loc] += a["전기"][idx3]
            else:
                add_df.loc[cnt, loc] = a["전기"][idx3]
    cnt += 1
    

add_df["결산기준일"] = add_df["결산기준일"].str.replace("2019", "2018")

last_df = pd.concat([last_df, add_df]).reset_index(drop = True)

In [157]:
add_df

Unnamed: 0,종목코드,회사명,시장구분,업종,업종명,결산월,결산기준일,보고서종류,통화,재무상태표 [abstract],...,퇴직급여운용자산/퇴직연금운용자산(부채),퇴직보험예치금(부채),이연법인세부채,기타비유동부채,비유동배출부채,부채총계,자본 [abstract],이익잉여금(결손금),자본총계,자본과부채총계
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1687,,,,,,,,,,,...,,,,,,,,,,
1688,,,,,,,,,,,...,,,,,,,,,,
1689,,,,,,,,,,,...,,,,,,,,,,
1690,,,,,,,,,,,...,,,,,,,,,,


In [156]:
last_df.shape

(1692, 201)

In [None]:
last_df

### 10. 2017 데이터 입력

In [162]:
# 2018년 변수 처리
df = pd.read_csv("../data/2019_사업보고서_01_재무상태표_연결_20200623.txt", sep = "\t", encoding = "cp949")

def str_to_float(data):
    import numpy as np
    
    data["당기"] = data["당기"].str.replace(",", "")
    data["전기"] = data["전기"].str.replace(",", "")
    data["전전기"] = data["전전기"].str.replace(",", "")
    data["당기"] = data["당기"].astype(np.float32)
    data["전기"] = data["전기"].astype(np.float32)
    data["전전기"] = data["전전기"].astype(np.float32)
    return data

df = str_to_float(df)

# 필요 없는 변수 제거
def delete_col(data):
#     data.drop("통화", axis = 1, inplace = True)
    data.drop("Unnamed: 15", axis = 1, inplace = True)
    data.drop("당기", axis = 1, inplace = True)
    data.drop("전기", axis = 1, inplace = True)
    data.drop("재무제표종류", axis = 1, inplace = True)
    return data

df = delete_col(df)

# 항목명 변수 전처리
# 항목명 전치리

# 1) _가진 항목명 추출
lst_ = []
for i in range(len(df)):
    if "_" in df.loc[i, "항목명"]:
        lst_.append(i)
        
def preprocessing(x):
    a = x.replace(" ", "")
    a = a.replace("_", "")
    a = a.replace("[", "")
    a = a.replace("]", "")
    a = a.replace("(", "")
    a = a.replace(")", "")
    a = a.replace(".", "")
    a = re.sub("[ⅠⅡⅢⅣⅤIII]", "", a)
    a = re.sub("[1-9]", "", a)
    
    
    return a

df["항목명"] = df["항목명"].agg(preprocessing)

lst_idx = []
for i in range(len(df)):
    if "총계" in df["항목명"][i] and "entity" in df["항목코드"][i]:
        lst_idx.append(i)
        
for i in lst_idx:
    if df["항목명"][i] == '부채총계':
        df["항목코드"][i] = "ifrs-full_Liabilities"
        
    elif df["항목명"][i] == '자본과부채의총계':
        df["항목코드"][i] = "ifrs-full_EquityAndLiabilities"
        
    elif df["항목명"][i] == '자산총계':
        df["항목코드"][i] = "ifrs-full_Assets"
        
    else:
        df["항목코드"][i] = "ifrs-full_Equity"
        
# entity 제외 항목명 변수처리
for idx in range(len(df)):
    if "entity" not in df["항목코드"][idx]:
        df["항목명"][idx] = dic_element[re.findall("_\w{1,}", df["항목코드"][idx])[0].lower()][0]

# entity 변수 전처리
elementid = pd.read_excel("../data/재무제표양식.xlsx", encoding = "utf-8", sheet_name = "BS1")
# '한글 Label', 'Element ID'
df_element = elementid[['한글 Label', 'Element ID']]
df_element = df_element[df_element["Element ID"].notnull()]

df_element.columns = df_element.columns.str.replace(" ", "_")
lst_element = df_element["Element_ID"].unique()

re_lst_element = []
for i in lst_element:
    a = re.findall("_\w{1,}", i)[0].lower()
    re_lst_element.append(a)
 
dic_elementid = {(((i.replace("_", "")).replace(" ", "")).replace("[", "")).replace("]", ""):j 
                 for i, j in zip(df_element["한글_Label"], df_element["Element_ID"])}

idx_entity = []
for idx in range(len(df)):
    if "entity" in df.loc[idx, "항목코드"]:
        idx_entity.append(idx)

# entity 포함 항목코드 항목코드 전처리
no_preprocessing_entity_idx = []
for idx in idx_entity:
    try:
        df.loc[idx, "항목코드"] = dic_elementid[df.loc[idx, "항목명"]]

    except Exception as e:
        no_preprocessing_entity_idx.append(idx)
        
# 한번더 전처리
for idx in range(len(df)):
    if "entity" not in df["항목코드"][idx]:
        df["항목명"][idx] = dic_element[re.findall("_\w{1,}", df["항목코드"][idx])[0].lower()][0]

        

# 항목명이 같은 entity 변수 전처리
lst = []
for idx in range(len(df2019)):
    if "entity" in df.loc[idx, "항목코드"]:
        lst.append(idx)
        
no_preprocessing = []
for idx in lst:
    try:
        df.loc[idx, "항목코드"] = dic_elementid[df.loc[idx, "항목명"]]
    except Exception as e:
        no_preprocessing.append(idx)
        

# 새기준 양식으로 항목명 
for idx in range(len(df)):
    if "entity" not in df.loc[idx, "항목코드"]:
        df.loc[idx, "항목명"] = dic_re_name[re.findall("_\w{1,}", df.loc[idx, "항목코드"])[0]]
        df.loc[idx, "항목코드"] = dic_re_code[re.findall("_\w{1,}", df.loc[idx, "항목코드"])[0]]
        
# 필요 없는 변수 제거
index = df[df["항목명"] == "지우기"].index
df.drop(index, inplace = True)
df.reset_index(drop = True, inplace = True)


# DB 테이블에 적용할 D.F
# 회사이름 추출
lst_company = df["회사명"].unique()
element_value = list(new_criterion[new_criterion["개선항목명"] != "지우기"]["개선항목명"].unique())

# columns
col = list(df.columns[0:9]) + element_value
check_col = list(dic_element.keys())

#  해당년도 데이터 행길이
company_dic = {i:[] for i in lst_company}
day = []

for i in lst_company:
    company_dic[i].append(df[df["회사명"] == i]["결산기준일"].unique())
    day.append(len(df2019[df["회사명"] == i]["결산기준일"].unique()))
    
row = sum(day)

bon = np.zeros([row,len(col)]) + np.NAN

# D.F
add_df = pd.DataFrame(bon, columns = col)

sum_lst = list(new_criterion["개선항목명"].value_counts().head(10)[1:9].index)


# 정제된 데이터프레임 생성
dic_standard = {i:list(df[df["회사명"] == i]["결산기준일"].unique()) for i in lst_company}
last_company = []
last_standard  = []
for key, value in dic_standard.items():
    for index in range(len(value)):
        last_company.append(key)
        last_standard.append(value[index])
            
cnt = 0
for idx in range(len(last_company)):

    a = df[(df["회사명"] == last_company[idx]) & (df["결산기준일"] == last_standard[idx])]
    a.reset_index(drop = True, inplace = True)
    for i in range(9):
        add_df.iloc[cnt, i] = a.iloc[0, i]

    for idx3 in range(len(a)):
        if "entity" not in a["항목코드"][idx3]:
            loc = a["항목명"][idx3]
            if loc in sum_lst:
                add_df.loc[cnt, loc] += a["전전기"][idx3]
            else:
                add_df.loc[cnt, loc] = a["전전기"][idx3]
    cnt += 1

add_df["결산기준일"] = add_df["결산기준일"].str.replace("2019", "2017")

last_df = pd.concat([last_df, add_df]).reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [163]:
last_df

Unnamed: 0,종목코드,회사명,시장구분,업종,업종명,결산월,결산기준일,보고서종류,통화,재무상태표 [abstract],...,퇴직급여운용자산/퇴직연금운용자산(부채),퇴직보험예치금(부채),이연법인세부채,기타비유동부채,비유동배출부채,부채총계,자본 [abstract],이익잉여금(결손금),자본총계,자본과부채총계
0,[060310],3S,코스닥시장상장법인,292.0,특수 목적용 기계 제조업,3.0,2019-03-31,사업보고서,KRW,,...,,,,,,1.895754e+10,,-3.167278e+10,3.437168e+10,5.332922e+10
1,[095570],AJ네트웍스,유가증권시장상장법인,763.0,산업용 기계 및 장비 임대업,12.0,2019-12-31,사업보고서,KRW,,...,,,9.811366e+08,5.636559e+09,,1.455914e+12,,1.987718e+11,3.473412e+11,1.803255e+12
2,[006840],AK홀딩스,유가증권시장상장법인,649.0,기타 금융업,12.0,2019-12-31,사업보고서,KRW,,...,,,3.628085e+10,1.021370e+10,,2.898026e+12,,4.964832e+11,1.429939e+12,4.327965e+12
3,[054620],APS홀딩스,코스닥시장상장법인,649.0,기타 금융업,12.0,2019-12-31,사업보고서,KRW,,...,,,3.651808e+09,1.749345e+08,,8.292250e+10,,7.342969e+11,2.147857e+11,2.977083e+11
4,[265520],AP시스템,코스닥시장상장법인,292.0,특수 목적용 기계 제조업,12.0,2019-12-31,사업보고서,KRW,,...,,,,,,3.068430e+11,,4.531593e+10,1.141229e+11,4.209659e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5071,[069260],휴켐스,유가증권시장상장법인,204.0,기타 화학제품 제조업,12.0,2017-12-31,사업보고서,KRW,,...,,,2.344200e+08,0.000000e+00,,2.827643e+11,,4.087101e+11,5.988149e+11,8.815793e+11
5072,[010240],흥국,코스닥시장상장법인,292.0,특수 목적용 기계 제조업,12.0,2017-12-31,사업보고서,KRW,,...,,,1.959252e+09,,,2.277970e+10,,4.743681e+10,5.525575e+10,7.803545e+10
5073,[189980],흥국에프엔비,코스닥시장상장법인,112.0,비알코올음료 및 얼음 제조업,12.0,2017-12-31,사업보고서,KRW,,...,,,,,,1.281639e+10,,2.903353e+10,5.741013e+10,7.022653e+10
5074,[003280],흥아해운,유가증권시장상장법인,501.0,해상 운송업,12.0,2017-12-31,사업보고서,KRW,,...,,,2.155102e+10,0.000000e+00,,7.374264e+11,,4.906098e+10,1.359827e+11,8.734092e+11


In [None]:
add_df

In [164]:
last_df.to_csv("../data/2017_2019_without_entity.csv", encoding = "euc-kr", index = False)