In [1]:
from tqdm import tqdm
import warnings
# Blanket filter: from this point on every warning, of any category, is ignored.
# Keep in mind that this also hides deprecation and fallback notices from the libraries used below.
warnings.filterwarnings('ignore')

import numpy as np

import pandas as pd
# Pandas display configuration.
#
# Columns: never truncate, so wide frames show every column.
# Rows: keep the pandas default limit (long frames are shown truncated).
#   The reset is kept so the default is restored even if an earlier run in
#   the same kernel had changed it. A previous `set_option('display.max_rows',
#   None)` that was immediately undone by this reset has been removed because
#   it had no effect.
pd.set_option('display.max_columns', None)
pd.reset_option('display.max_rows')

# Render floats with four digits after the decimal point.
pd.set_option('display.float_format', '{:.4f}'.format)

import platform
import seaborn as sns

import matplotlib.pyplot as plt

# Pick the Korean font by operating system; any other OS keeps matplotlib's default font.
system_name = platform.system()
if system_name == 'Darwin':  # macOS
    print('apple gothic')

font_name = {'Darwin': 'AppleGothic', 'Windows': 'NanumGothic'}.get(system_name)

# Apply the font only when one was selected above.
if font_name is not None:
    plt.rcParams['font.family'] = font_name

# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

## 데이터 로드

In [2]:
def _read_split(name):
    """Load one pre-split dataset file.

    Reads ``../data/dataset/<name>.csv`` with the '거래소코드' (exchange code)
    column forced to object dtype, then returns the frame indexed by that column.
    """
    frame = pd.read_csv(f'../data/dataset/{name}.csv', dtype={'거래소코드' : 'object'})
    return frame.set_index('거래소코드')


# Full frames, then the feature / target splits (read in the same order as before).
train, test = _read_split('train'), _read_split('test')
X_train, X_test = _read_split('X_train'), _read_split('X_test')
y_train, y_test = _read_split('y_train'), _read_split('y_test')

# Displayed as the cell result: shapes of the four split frames.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1187, 224), (198, 224), (1187, 1), (198, 1))

In [3]:
# Preview the first two rows of the feature frame (all columns are shown).
X_train.head(2)

Unnamed: 0_level_0,회사명,회계년도,상장일,상장폐지일,결산년도,상장년도,market,자산,유형자산,유동자산,재고자산,자본,매출액,매출원가,판매비와 관리비(물류원가 등 포함),기타(영업)비용,당기제조원가,급여,퇴직급여,상품매출원가,제품매출원가,감가상각비,대손상각비,개발비상각,기타무형자산상각비,외환차익1,외화환산이익1,외환차손1,외화환산손실1,외환차익2,외화환산이익2,외환차손2,외환환산손실2,무형자산,건설중인자산,토지,자본잉여금,이익잉여금(결손금),미처분이익잉여금(결손금),기타임의적립금,(연차배당),(중간배당),배당금지급(-),단기차입금,매입채무 및 기타유동채무,매출채권 및 기타유동채권,비유동자산,사채,유동금융자산,유동부채,유동성장기부채,장기매입채무 및 기타비유동채무,장기매출채권 및 기타비유동채권,장기차입금,현금및현금성자산,영업활동으로 인한 현금흐름,유형자산의 증가,무형자산의 증가,연구개발비,종업원수,비유동부채,부채,유형자산의증가,유형자산의감소,(투자활동으로 인한 현금유출액),투자활동으로 인한 현금유입액,자본금,관계기업 등 지분관련 투자자산,영업손익,이자보상배율,년도차,자산_전기,자산증가율,유형자산_전기,유형자산증가율,유동자산_전기,유동자산증가율,재고자산_전기,재고자산증가율,자본_전기,자본증가율,매출액_전기,매출액증가율,자산평균,유형자산평균,무형자산_전기,무형자산평균,건설중인자산_전기,건설중인자산평균,토지_전기,토지평균,자본평균,총자산회전율,감가상각율,배당률,판관비체크,순외환손익대비매출액,매출원가대비매출액,당기총제조비용,재고조정중의고정비,고정비,총비용,변동비,변동비대비매출액,고정비대비매출액,인건비대비매출액,인건비대비영업총비용,1인당매출액,1인당인건비,재고자산평균,비유동자산_전기,비유동자산평균,매출채권 및 기타유동채권평균,장기매출채권 및 기타비유동채권평균,매입채무 및 기타유동채무평균,장기매입채무 및 기타비유동채무평균,주주이익(버핏),순운전자본(민식),유무형자산,영업현금흐름대비투하자본,ROTCE현금흐름대체,유형자산대비현금흐름,유무형자산대비현금흐름,매출대비고정자산,단기차입금_전기,단기차입금평균,장기차입금_전기,장기차입금평균,유동비율,현금비율,당좌비율,자기자본비율,비유동비율,부채비율,유동부채비율,비유동부채비율,비유동장기적합률,차입금의존도,매출채권대비매입채무,순운전자본,순운전자본대비총자본,차입금대비매출액,매출채권대비매입채무_before,영업활동현금흐름 대 총자산,영업활동현금흐름 대 매출액,투자안정성비율,영업활동현금흐름 대 투자활동현금지출,경영자산회전율분모,매출채권,매입채무,차입금,경영자산회전율분모_전기,매출채권_전기,매입채무_전기,차입금_전기,부채_전기,자본금_전기,영업활동현금흐름 대 총부채,자기자본회전율,자본금회전율,경영자산회전율,비유동자산회전율,유형자산회전율,재고자산회전율,매출채권회전율,매입채무회전율,차입금 대 매출액,시가총액,상장주식수,배당수익률,DATE,DGS10,DGS1,DGS6MO,T10Y2Y,WTI_oil,Dubai_oil,realGDP_usa,real_PCE,Core_PCE,CPI_sticky,Core_CPI,manuf_PMI(R),non_manuf_PMI(R),manuf_GB,non_manuf_GB,Equip_inv,TB_rtn(1y),TB_rtn(10y),IR_sm,DXY,CB_spread(AA-),CB_spread(BBB-),경제고통지수,경상수지,수입금액지수,수입물량지수,수출금액지수,수출물량지수,미국수입금액,중국수입금액,미국수출금액,중국수출금액,실업률,실업률_증감,소비자물가상승률,근원물가상승률,근원물가상승률_식품에너지제외,GDP_growth,CLI(경기선행),CCI(경기동행),NSI(=뉴스심리지수),기업실사BSI_실적,기업실사BSI_전망,year,month,매출액정상영업이익률,매출액순이익률,매출액총이익률,총자본순이익률,총자본사업이익률,총자본정상영업이익률,자기자본정상영업이익률,자기자본순이익률
거래소코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1
40,케이알모터스(주),2012/12,1976/05/25,,2012,1976,1,119327267.0,55810192.0,61684604.0,24763334.0,60797171.0,97926654.0,92177389.0,11948008.0,0.0,0.0,2579833.0,350950.0,0.0,0.0,35985.0,1248799.0,0.0,650714.0,155998.0,0.0,470896.0,1310924.0,0.0,0.0,0.0,0.0,1118203.0,0.0,42542501.0,3884892.0,7235571.0,12088682.0,0.0,0.0,0.0,-0.0,10401663.0,28964045.0,26201155.0,57642663.0,0.0,100000.0,43762954.0,0.0,407646.0,342768.0,0.0,10236751.0,-4565749.0,937241.0,42700.0,0.0,249.0,14767142.0,58530096.0,937241.0,21084.0,992041.0,464863.0,59670690.0,0.0,-6198743.0,-6.47,1.0,133337774.0,-10.5075,57761033.0,-3.3774,73022140.0,-15.5262,24384487.0,1.5536,68764654.0,-11.5866,127720215.0,-23.3272,126332520.5,56785612.5,1804381.0,1461292.0,0.0,0.0,42542501.0,42542501.0,64780912.5,0.7752,0.0972,-0.0,True,-13.6075,94.129,14842806.0,7635041.5537,11035931.0537,104125397.0,93089465.9463,95.0604,11.2696,2.9928,24.5295,393280.0,10361.0,24573910.5,60315635.0,58979149.0,26201155.0,342768.0,28964045.0,407646.0,-4858991.0,51117833.5,58246904.5,0.0,0.0,0.0,0.0,0.0,10304725.0,10353194.0,0.0,0.0,140.9516,23.3914,23.6199,50.9499,94.8114,96.2711,71.9819,24.2892,76.2829,8.7169,90.3725,17921650.0,15.0189,10.5724,90.3725,0.0,0.0,0.0,0.0,119327267.0,26543923.0,29371691.0,10401663.0,133337774.0,33214201.0,35842471.0,10304725.0,64573121.0,59670690.0,0.0,1.5117,1.6411,0.7752,1.6604,1.7245,3.985,3.2774,3.0032,10.5724,58954641.226,119341379.0,-0.0,2012-12-01,1.7274,0.1675,0.1282,1.4629,94.2008,3.3083,2.2891,1.3673,1.85,-139.727,0.7093,50.7,56.1,1,1,-9.4,3.1237,3.4437,5.22,2.5615,0.6325,6.2021,4.5,1.9324,-5.6,-1.7,-2.4,-1.1,-16.6,-6.3,-1.0,2.7,3.1,-0.1,1.4,1.1,1.3,0.0198,-0.1,-0.5,1.87,-14,-17,2012,12,-6.33,-8.41,5.87,-6.52,-4.72,-4.91,-9.57,-10.99
50,(주)경방,2012/12,1956/03/03,,2012,1956,1,1335039930.0,324064120.0,123685731.0,52907652.0,620704205.0,333620804.0,260837041.0,65529821.0,0.0,0.0,5063975.0,283246.0,0.0,0.0,242763.0,290815.0,0.0,84337.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4596438.0,11609300.0,230042409.0,16168530.0,597426197.0,-53218658.0,51600000.0,0.0,0.0,-0.0,99503292.0,38888501.0,41988837.0,1211354199.0,0.0,2446000.0,164949423.0,18312473.0,91596372.0,902347.0,297503511.0,11322343.0,17789061.0,27830790.0,0.0,1826.0,579.0,549386302.0,714335725.0,27830790.0,1268120.0,30797266.0,3099391.0,10400000.0,766920.0,7253943.0,0.36,1.0,1352324332.0,-1.2781,308454057.0,5.0607,138673371.0,-10.8079,72996830.0,-27.5206,629704391.0,-1.4293,348136369.0,-4.1695,1343682131.0,316259088.5,5316133.0,4956285.5,1268081.0,6438690.5,230340909.0,230191659.0,625204298.0,0.2483,0.1195,-0.0,True,0.0,78.1837,70634279.0,7199452.0056,10490977.5056,326366862.0,315875884.4944,94.6811,3.1446,1.6028,8.16,576202.0,8746.0,62952241.0,1213650961.0,1212502580.0,41988837.0,902347.0,38888501.0,91596372.0,-9714629.0,105843425.0,321215374.0,0.0417,0.0135,0.0562,0.0554,0.0147,151249110.0,125376201.0,52825075.0,175164293.0,74.984,6.8641,8.347,46.4933,195.158,115.0847,26.5746,88.5102,103.5265,31.1091,32.8706,-41263692.0,0.0,90.0845,32.8706,1.3325,5.3321,66.9702,64.2254,1322663710.0,42891184.0,130484873.0,397006803.0,1350177669.0,45580150.0,133117379.0,204074185.0,722619941.0,10400000.0,2.4759,0.5336,32.0789,0.2496,0.2752,1.0549,5.2996,7.5419,2.5312,90.0845,186368000.0,2080000.0,-0.0,2012-12-01,1.7274,0.1675,0.1282,1.4629,94.2008,3.3083,2.2891,1.3673,1.85,-139.727,0.7093,50.7,56.1,1,1,-9.4,3.1237,3.4437,5.22,2.5615,0.6325,6.2021,4.5,1.9324,-5.6,-1.7,-2.4,-1.1,-16.6,-6.3,-1.0,2.7,3.1,-0.1,1.4,1.1,1.3,0.0198,-0.1,-0.5,1.87,-14,-17,2012,12,2.17,-2.37,21.82,-0.59,0.56,0.54,1.16,-1.26


## 불필요 컬럼 제외

In [6]:
# Company name / fiscal period columns. Not part of the drop list built below.
cols_info = ['회사명', '회계년도']

# Listing-date, year and calendar columns.
cols_info_drop = [
    '상장일', '상장폐지일', '결산년도', '상장년도',
    '년도차', 'year', 'month', 'DATE',
]

# Individually named columns to discard.
cols_drop = [
    '판관비체크', '매출채권대비매입채무_before', '배당률', '차입금 대 매출액',
]

# NOTE(review): presumably the column the label is derived from — confirm.
cols_labeling = ['이자보상배율']

# Columns selected by name suffix: '전기' (prior period) and '평균' (average).
cols_before = list(filter(lambda name: name.endswith('전기'), X_train.columns))
cols_mean = list(filter(lambda name: name.endswith('평균'), X_train.columns))

# Everything that is removed from the frames as a non-feature column.
cols_nonfeatures = [
    *cols_info_drop,
    *cols_drop,
    *cols_labeling,
    *cols_before,
    *cols_mean,
]

In [7]:
# Remove the non-candidate-feature columns from every frame.
#
# Caveat: DataFrame.drop raises KeyError for labels that are not present, so
# this cell fails if it is executed again after the columns are already gone;
# reload the data first when re-running.
def _without_columns(frame, unwanted):
    """Return a new frame equal to ``frame`` minus the ``unwanted`` columns."""
    return frame.drop(labels=unwanted, axis='columns')


train = _without_columns(train, cols_nonfeatures)
test = _without_columns(test, cols_nonfeatures)

X_train = _without_columns(X_train, cols_nonfeatures)
X_test = _without_columns(X_test, cols_nonfeatures)

In [21]:
# Collect the columns of `train` that contain exactly one distinct value.
# nunique() does not count NaN by default, so a column holding a single value
# plus missing entries is also listed here.
cols_count1 = [name for name in train.columns if train[name].nunique() == 1]

cols_count1

['외환차익2', '외화환산이익2', '외환차손2', '외환환산손실2', 'non_manuf_GB']

In [None]:
# Remove the single-valued columns (found on `train`) from all four frames.
# As with any drop, a second execution raises KeyError once the columns are gone.
train = train.drop(cols_count1, axis='columns')
test = test.drop(cols_count1, axis='columns')

X_train = X_train.drop(cols_count1, axis='columns')
X_test = X_test.drop(cols_count1, axis='columns')