# 결과정리2 - 키워드만

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure matplotlib to use a Korean-capable font so Hangul text renders in plots.
from matplotlib import font_manager, rc
# NOTE(review): hard-coded Windows font location — presumably needs changing on other platforms.
font_path = "C:/Windows/Fonts/malgun.ttf"
# Resolve the font file to its family name, which is what rc('font', family=...) expects.
font = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font)

import warnings 
warnings.filterwarnings('ignore')
import os 
import re
import FinanceDataReader as fdr
import time

# Limit displayed column width to 70 characters.
# pandas matches option keys as regular expressions, so 'display.max.colwidth'
# resolves to the 'display.max_colwidth' option.
pd.set_option('display.max.colwidth', 70)

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, f1_score, roc_auc_score ,accuracy_score, precision_score, recall_score, confusion_matrix
import joblib 
from sklearn.preprocessing import MinMaxScaler

# Unfitted classifier instances created with default parameters.
# NOTE(review): none of these names is referenced again in the visible part of this notebook.
rfc = RandomForestClassifier()
xgb = XGBClassifier()
cat = CatBoostClassifier()
models = [rfc,xgb,cat]

In [3]:
import FinanceDataReader as fdr

# 함수

## 전처리 함수

In [4]:
# Listing table used by corp_code(); it is read with the first CSV column as index
# and must provide the 'Name' and 'Symbol' columns.
df_kospi = pd.read_csv('./data/recent_kospi_list.csv',index_col=0)
# Return the ticker code for a company name
def corp_code(corp_name):
    """Look up the ticker symbol of ``corp_name`` in the module-level ``df_kospi`` table.

    Args:
        corp_name: Company name, compared for exact equality against the
            ``'Name'`` column of ``df_kospi``.

    Returns:
        The ``'Symbol'`` value of the first matching row, converted to ``str``
        and left-padded with zeros to six characters.

    Raises:
        IndexError: If no row of ``df_kospi`` has ``Name == corp_name``. The
            exception type is the same one the bare ``.iloc[0]`` lookup raised
            before; only the message is now explicit.
    """
    symbols = df_kospi.loc[df_kospi['Name'] == corp_name, 'Symbol']
    if symbols.empty:
        raise IndexError(f"corp_code: no row in df_kospi with Name == {corp_name!r}")

    # Symbols may have been parsed as integers by read_csv, which drops leading zeros.
    return str(symbols.iloc[0]).zfill(6)

# Price data
def stock_price(code,bgn_date= '2016-01-01',end_date= '2022-03-31'):
    """Fetch price data for ``code`` and index it by a column named '날짜'.

    Args:
        code: Ticker code passed straight to ``fdr.DataReader``.
        bgn_date: Start of the requested range (default '2016-01-01').
        end_date: End of the requested range (default '2022-03-31').

    Returns:
        The frame returned by ``fdr.DataReader`` with its index moved into a
        column, the 'Date' column renamed to '날짜', and '날짜' set as the index.
    """
    raw_prices = fdr.DataReader(code, bgn_date, end_date)
    return (
        raw_prices
        .reset_index()
        .rename(columns={'Date': '날짜'})
        .set_index('날짜', drop=True)
    )

# merge
def merge(df_count,df_p):
    """Attach the 'Close' column of ``df_p`` to ``df_count`` by index.

    The join is a right join on the two indexes, so every row of ``df_p`` is
    kept and rows with no counterpart in ``df_count`` get NaN in its columns.
    'Close' ends up as the last column of the result.
    """
    close_prices = df_p['Close']
    return df_count.merge(
        close_prices,
        how='right',
        left_index=True,
        right_index=True,
    )

from sklearn.preprocessing import MinMaxScaler

def mscaler(df):
    """Return a min-max scaled copy of ``df`` indexed by a ``DatetimeIndex``.

    Every column is scaled independently with a freshly fitted
    ``MinMaxScaler`` (default settings). The scaler is fitted on the whole
    frame that is passed in.

    Unlike the earlier version, the caller's frame is left untouched: the
    index conversion is applied to the copy rather than to ``df`` itself.

    Args:
        df: Frame whose index can be converted by ``pd.DatetimeIndex`` and
            whose columns are all numeric.

    Returns:
        A new DataFrame with the same columns, scaled values, and a
        ``DatetimeIndex``.
    """
    df_scaled = df.copy()
    df_scaled.index = pd.DatetimeIndex(df_scaled.index)

    col_list = df_scaled.columns
    scaler = MinMaxScaler()
    df_scaled[col_list] = scaler.fit_transform(df_scaled[col_list])

    return df_scaled

def add_target(df_merge,window_size, period_rate):
    """Label each row by whether the last column rose enough ``window_size`` rows later.

    For row ``i`` the return is
    ``(last_col[i + window_size] - last_col[i]) / last_col[i]``; the label is 1
    when that return is ``>= period_rate`` and 0 otherwise (including when the
    return is NaN). The final ``window_size`` rows have no future value and are
    dropped from the result.

    Args:
        df_merge: Frame whose LAST column holds the price used for the return
            (``merge`` in this notebook puts 'Close' there).
        window_size: Look-ahead distance in rows; must be >= 0.
        period_rate: Minimum return for a positive label.

    Returns:
        A new DataFrame (a copy, so ``df_merge`` is not modified) holding the
        first ``len(df_merge) - window_size`` rows plus an integer 'target'
        column.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    # Explicit row count: the old ``iloc[:-window_size]`` slice silently became
    # empty for window_size == 0 and then failed on the length mismatch.
    n_rows = max(len(df_merge) - window_size, 0)

    price = df_merge.iloc[:, -1].to_numpy(dtype=float)
    base = price[:n_rows]
    future = price[window_size:window_size + n_rows]

    # A zero or NaN base price yields inf/NaN exactly as the scalar loop did;
    # only the floating-point warnings are silenced here.
    with np.errstate(divide='ignore', invalid='ignore'):
        earning_rate = (future - base) / base

    # Copy before adding the column so we never write into a slice of df_merge.
    df_model = df_merge.iloc[:n_rows, :].copy()
    df_model['target'] = (earning_rate >= period_rate).astype(np.int64)

    return df_model

def feature_visualization(save_path,num,feature_names=None):
    """Plot and return the ``num`` largest feature importances of a saved model.

    Args:
        save_path: Path of a model file readable by ``joblib.load``; the loaded
            object must expose ``feature_importances_``.
            NOTE(review): joblib files are pickle-based — only load files you trust.
        num: Number of top features to keep.
        feature_names: Labels for the importances, in the same order as the
            model's features. When omitted, the notebook-global
            ``x_test.columns`` is used, which is what this function always did
            before; that global must then exist and match the model.

    Returns:
        A Series of the ``num`` highest importances, sorted descending and
        indexed by feature name.
    """
    model = joblib.load(save_path)
    # Importances come back as a plain array
    ft_importance_values = model.feature_importances_

    if feature_names is None:
        # Backward-compatible fallback to the global defined by the evaluation cell.
        feature_names = x_test.columns

    # Wrap in a Series so sorting and plotting keep the names attached
    ft_series = pd.Series(ft_importance_values, index=feature_names)
    ft_top = ft_series.sort_values(ascending=False)[:num]

    # Plot
    plt.figure(figsize=(8,6))
    # The title used to say "Top 20" regardless of ``num``.
    plt.title(f'Feature Importance Top {num}')
    sns.barplot(x=ft_top, y=ft_top.index)
    plt.show()

    return ft_top

## test_result()

In [5]:
def test_result(month,period_rate):
    """Score every saved model of one (month, period_rate) setting on the test window.

    For each keyword-frequency CSV in './data/데이터_뉴스키워드빈도/' whose name
    (file name minus its last four characters) matches the prefix before the
    first '_' of a model file in
    './data/machine_model3_{month}개월_{period_rate}/', the function rebuilds
    the labelled dataset (price download, scaling, merge, target) and evaluates
    each matching model on the rows from the test start date onward.

    Args:
        month: Horizon in months; the look-ahead window is ``21 * month`` rows
            (presumably trading days — confirm).
        period_rate: Return threshold passed to ``add_target``; also part of
            the model directory name.

    Returns:
        DataFrame with one row per (company, model) and the columns
        '회사이름', '모델이름', 'accuracy', 'precision', 'recall', 'roc_auc',
        '모델주소'. ``roc_auc`` is 0 when ``roc_auc_score`` raises ``ValueError``
        (e.g. the test labels contain a single class).
    """
    window_size = 21*month

    # Test window start dates as fixed in this notebook.
    test_stdate = '2021-07-01' if month == 3 else '2021-04-01'

    path = './data/데이터_뉴스키워드빈도/'
    file_list = os.listdir(path)

    models_path = f'./data/machine_model3_{month}개월_{period_rate}/'
    saved_model_list = os.listdir(models_path)

    # Group model files by company prefix once, instead of rescanning the
    # whole directory listing for every company. Listing order is preserved.
    models_by_corp = {}
    for save_model in saved_model_list:
        models_by_corp.setdefault(save_model.split('_')[0], []).append(save_model)

    columns = ['회사이름', '모델이름', 'accuracy', 'precision', 'recall', 'roc_auc', '모델주소']
    rows = []

    for file in file_list:
        # Strips the last four characters, i.e. assumes a '.csv' suffix.
        corp_name = file[:-4]
        corp_models = models_by_corp.get(corp_name)
        if not corp_models:
            continue

        code = corp_code(corp_name)
        df_p = stock_price(code)

        file_path = os.path.join(path,file)
        df_count = pd.read_csv(file_path,index_col=0)
        df_count.index = pd.DatetimeIndex(df_count.index)
        # Last keyword column: features are every column up to and including it.
        last_col = df_count.columns[-1]

        df_count = mscaler(df_count)
        df_merge = merge(df_count,df_p)
        df_model = add_target(df_merge,window_size=window_size, period_rate= period_rate)

        x_test = df_model.loc[test_stdate:,:last_col]
        y_test = df_model.loc[test_stdate:,'target']

        for save_model in corp_models:
            model_path = os.path.join(models_path,save_model)
            # Load the fitted model
            model = joblib.load(model_path)
            model_name = model.__class__.__name__

            pred = model.predict(x_test)
            accuracy = accuracy_score(y_test, pred)

            proba = model.predict_proba(x_test)[:,1]
            try:
                roc_auc = roc_auc_score(y_test, proba)
            except ValueError:
                # AUC is undefined here; keep the notebook's 0 sentinel but no
                # longer hide unrelated errors behind a bare except.
                roc_auc = 0

            precision = precision_score(y_test, pred)
            recall = recall_score(y_test, pred)

            rows.append({
                '회사이름': corp_name,
                '모델이름': model_name,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'roc_auc': roc_auc,
                '모델주소': model_path,
            })

    df_result = pd.DataFrame(rows, columns=columns)

    return df_result

# 모델결과 보기

## 3개월 5%

In [5]:
# Load the stored validation-set scores for the 63-day (3-month) / 5% models and display them.
result_3_5 = pd.read_csv('./data/model_result_val/machine_model3_63일_0.05.csv',index_col=0)
result_3_5

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
0,AJ네트웍스,CatBoostClassifier,0.298387,0.571429,0.045455,0.400884,./data/machine_model3_3개월_0.05/AJ네트웍스_CatBoostClassifier.pkl
1,CJ CGV,RandomForestClassifier,0.540323,0.560000,0.444444,0.531876,./data/machine_model3_3개월_0.05/CJ CGV_RandomForestClassifier.pkl
2,CJ CGV,XGBClassifier,0.508065,0.527778,0.301587,0.583008,./data/machine_model3_3개월_0.05/CJ CGV_XGBClassifier.pkl
3,CJ제일제당,RandomForestClassifier,0.572581,0.615385,0.271186,0.544329,./data/machine_model3_3개월_0.05/CJ제일제당_RandomForestClassifier.pkl
4,CJ제일제당,XGBClassifier,0.548387,0.578947,0.186441,0.582790,./data/machine_model3_3개월_0.05/CJ제일제당_XGBClassifier.pkl
...,...,...,...,...,...,...,...
440,휠라홀딩스,RandomForestClassifier,0.516129,0.520325,0.984615,0.472099,./data/machine_model3_3개월_0.05/휠라홀딩스_RandomForestClassifier.pkl
441,휠라홀딩스,CatBoostClassifier,0.524194,0.524590,0.984615,0.484876,./data/machine_model3_3개월_0.05/휠라홀딩스_CatBoostClassifier.pkl
442,휴스틸,RandomForestClassifier,0.241935,0.666667,0.061856,0.526919,./data/machine_model3_3개월_0.05/휴스틸_RandomForestClassifier.pkl
443,휴스틸,XGBClassifier,0.233871,0.625000,0.051546,0.533792,./data/machine_model3_3개월_0.05/휴스틸_XGBClassifier.pkl


In [9]:
# Inline evaluation of the saved 3-month / 5% models on the test window.
# It follows the same steps as test_result(3, 0.05), but also prints each
# matched company and leaves the train/val/test splits in the notebook namespace.
month = 3
window_size = 21*month
period_rate= 0.05

path = './data/데이터_뉴스키워드빈도/'
file_list = os.listdir(path)

models_path = './data/machine_model3_3개월_0.05/'
saved_model_list = os.listdir(models_path)
# Company prefix (text before the first '_') of every saved model file.
m_corp_lst = [save_model.split('_')[0] for save_model in saved_model_list]

corp_list =[]
model_list = []
accu_list = []
precision_list = []
recall_list = []
roc_list = []
# One path per evaluated model. Previously a single `model_path` name was used
# both for this list and for the per-model string, so the last path ended up
# in every row of the result.
model_path_list = []

for file in file_list:
    # Strips the last four characters, i.e. assumes a '.csv' suffix.
    corp_name = file[:-4]
    if corp_name not in m_corp_lst:
        continue

    code = corp_code(corp_name)
    df_p = stock_price(code)

    file_path = os.path.join(path,file)
    df_count = pd.read_csv(file_path,index_col=0)
    df_count.index = pd.DatetimeIndex(df_count.index)
    last_col = df_count.columns[-1]

    df_count = mscaler(df_count)

    df_merge = merge(df_count,df_p)

    df_model = add_target(df_merge,window_size=window_size, period_rate= period_rate)

    # Date-based splits; only the test split is used below, the others are
    # kept because later cells may read them from the notebook namespace.
    x_train = df_model.loc['2016-05-02':'2020-12-31',:last_col]
    y_train = df_model.loc['2016-05-02':'2020-12-31','target']

    x_val = df_model.loc['2021-01-04':'2021-07-01',:last_col]
    y_val = df_model.loc['2021-01-04':'2021-07-01','target']

    x_test = df_model.loc['2021-07-01':,:last_col]
    y_test = df_model.loc['2021-07-01':,'target']

    for save_model in saved_model_list:
        m_corp = save_model.split('_')[0]
        if m_corp != corp_name:
            continue

        print(m_corp)
        model_path = os.path.join(models_path,save_model)
        # Load the fitted model
        model = joblib.load(model_path)
        model_name = model.__class__.__name__
        pred = model.predict(x_test)

        # Collect the scores
        corp_list.append(corp_name)
        model_list.append(model_name)
        accuracy = accuracy_score(y_test, pred)
        accu_list.append(accuracy)

        proba = model.predict_proba(x_test)[:,1]
        try:
            roc_auc = roc_auc_score(y_test, proba)
        except ValueError:
            # AUC undefined (e.g. a single class in y_test): keep the 0 sentinel.
            roc_auc = 0

        roc_list.append(roc_auc)

        precision = precision_score(y_test, pred)
        precision_list.append(precision)

        recall = recall_score(y_test, pred)
        recall_list.append(recall)

        model_path_list.append(model_path)

df_result = pd.DataFrame({'회사이름':corp_list, '모델이름':model_list,'accuracy':accu_list,
                  'precision':precision_list,'recall':recall_list,'roc_auc':roc_list,'모델주소':model_path_list})

df_result.to_csv(f'./data/model_result_test/machine_model3_{window_size}일_{period_rate}.csv')

AJ네트웍스
CJ CGV
CJ CGV
CJ제일제당
CJ제일제당
DB
DB
DB
DB하이텍
DB하이텍
DB하이텍
DI동일
DI동일
DI동일
DSR
E1
GS글로벌
GS글로벌
GS글로벌
HDC
HDC
HMM
HMM
HMM
HSD엔진
KG케미칼
KG케미칼
KG케미칼
KISCO홀딩스
KISCO홀딩스
KISCO홀딩스
KSS해운
KSS해운
KT
KT
KT
KTcs
KTcs
LG
LG
LG유플러스
LG유플러스
LG이노텍
LG이노텍
LG이노텍
LIG넥스원
LIG넥스원
LIG넥스원
LS네트웍스
LS네트웍스
LS네트웍스
MH에탄올
NAVER
NAVER
NAVER
NPC
NPC
S-Oil
S-Oil
S-Oil
SBS
SBS
SBS
SGC에너지
SGC에너지
SGC에너지
SG세계물산
SJM
SKC
SKC
SKC
SK가스
SK가스
SK가스
SK네트웍스
SK네트웍스
SK텔레콤
SK텔레콤
SPC삼립
STX엔진
STX엔진
TCC스틸
강남제비스코
강남제비스코
경동나비엔
계룡건설
계룡건설
계룡건설
계양전기
계양전기
고려아연
고려아연
고려제강
광명전기
광명전기
광명전기
광전자
광전자
국도화학
국도화학
국도화학
금강공업
금강공업
금강공업
금양
금양
금호건설
금호건설
금호타이어
넥센
넥센
넥센타이어
넥센타이어
넥센타이어
농심
농심
대동
대동
대동
대우건설
대우건설
대우건설
대웅제약
대원전선
대원전선
대원전선
대원화성
대원화성
대창
대창
대창
대창단조
대창단조
대한전선
대한전선
대한제강
대한제강
대한제강
대한제분
대한제분
대한항공
대한항공
대한항공
덕성
덕성
도화엔지니어링
동국제강
동국제강
동국제강
동부건설
동부건설
동부건설
동아쏘시오홀딩스
동아지질
동아지질
동양
동양
동양
동양철관
동양철관
동원F&B
동일산업
동일산업
동일산업
두산
두산
두산
디아이씨
디아이씨
디아이씨
롯데관광개발
롯데관광개발
롯데관광개발
롯데정밀화학
롯데정밀화학
롯데정밀화학
롯데칠성
롯데칠성
롯데칠성
모나미
무림페이퍼
무학
무학
미원상사
미원상사
미원상사
백산
백산
범양건영
범양건영
벽산
벽산
벽산
보령
보령
비상교육
비상교육
사

In [6]:
# Score the 3-month / 5% models on the test window, then display only the rows
# whose precision is above 0.5 and not exactly 1.
df_result = test_result(3,0.05)
df_result[(df_result['precision']>0.5) &(df_result['precision'] != 1)]

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
8,DB하이텍,CatBoostClassifier,0.603306,0.605769,0.9,0.477871,./data/machine_model3_3개월_0.05/DB하이텍_CatBoostClassifier.pkl
9,DB하이텍,RandomForestClassifier,0.479339,0.559322,0.471429,0.464986,./data/machine_model3_3개월_0.05/DB하이텍_RandomForestClassifier.pkl
10,DB하이텍,XGBClassifier,0.46281,0.540984,0.471429,0.463725,./data/machine_model3_3개월_0.05/DB하이텍_XGBClassifier.pkl
24,HSD엔진,RandomForestClassifier,0.495868,0.504348,0.935484,0.494806,./data/machine_model3_3개월_0.05/HSD엔진_RandomForestClassifier.pkl
42,LG이노텍,CatBoostClassifier,0.603306,0.75,0.734043,0.475374,./data/machine_model3_3개월_0.05/LG이노텍_CatBoostClassifier.pkl
43,LG이노텍,RandomForestClassifier,0.545455,0.76,0.606383,0.500197,./data/machine_model3_3개월_0.05/LG이노텍_RandomForestClassifier.pkl
44,LG이노텍,XGBClassifier,0.561983,0.746988,0.659574,0.455871,./data/machine_model3_3개월_0.05/LG이노텍_XGBClassifier.pkl
76,SK텔레콤,CatBoostClassifier,0.61157,0.529412,0.1875,0.466039,./data/machine_model3_3개월_0.05/SK텔레콤_CatBoostClassifier.pkl
81,TCC스틸,XGBClassifier,0.305785,0.619048,0.146067,0.459621,./data/machine_model3_3개월_0.05/TCC스틸_XGBClassifier.pkl
96,광전자,CatBoostClassifier,0.545455,0.75,0.101695,0.528431,./data/machine_model3_3개월_0.05/광전자_CatBoostClassifier.pkl


In [7]:
# Persist the full (unfiltered) test scores for the 63-day / 5% setting.
df_result.to_csv(f'./data/model_result_test/machine_model3_63일_0.05.csv')

## 3개월 10%

In [8]:
# Load the stored validation-set scores for the 63-day (3-month) / 10% models and display them.
result_3_10 = pd.read_csv('./data/model_result_val/machine_model3_63일_0.1.csv',index_col=0)
result_3_10

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
0,AJ네트웍스,RandomForestClassifier,0.379032,0.666667,0.050633,0.411533,./data/machine_model3_3개월_0.1/AJ네트웍스_RandomForestClassifier.pkl
1,DB,RandomForestClassifier,0.524194,0.578947,0.619718,0.489636,./data/machine_model3_3개월_0.1/DB_RandomForestClassifier.pkl
2,DB,XGBClassifier,0.564516,0.626866,0.591549,0.550757,./data/machine_model3_3개월_0.1/DB_XGBClassifier.pkl
3,DB,CatBoostClassifier,0.508065,0.569444,0.577465,0.458278,./data/machine_model3_3개월_0.1/DB_CatBoostClassifier.pkl
4,E1,RandomForestClassifier,0.306452,0.700000,0.077778,0.395752,./data/machine_model3_3개월_0.1/E1_RandomForestClassifier.pkl
...,...,...,...,...,...,...,...
243,효성,CatBoostClassifier,0.298387,0.852941,0.261261,0.448371,./data/machine_model3_3개월_0.1/효성_CatBoostClassifier.pkl
244,휠라홀딩스,RandomForestClassifier,0.508065,0.538462,0.222222,0.513271,./data/machine_model3_3개월_0.1/휠라홀딩스_RandomForestClassifier.pkl
245,휴스틸,RandomForestClassifier,0.258065,0.750000,0.031915,0.489539,./data/machine_model3_3개월_0.1/휴스틸_RandomForestClassifier.pkl
246,휴스틸,XGBClassifier,0.266129,0.800000,0.042553,0.508688,./data/machine_model3_3개월_0.1/휴스틸_XGBClassifier.pkl


In [9]:
# Score the 3-month / 10% models on the test window, then display only the rows
# whose precision is above 0.5 and not exactly 1.
df_result = test_result(3,0.1)
df_result[(df_result['precision']>0.5) &(df_result['precision'] != 1)]

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
40,노루페인트,XGBClassifier,0.694215,0.6,0.153846,0.573014,./data/machine_model3_3개월_0.1/노루페인트_XGBClassifier.pkl
43,대성홀딩스,CatBoostClassifier,0.14876,0.65,0.119266,0.26682,./data/machine_model3_3개월_0.1/대성홀딩스_CatBoostClassifier.pkl
44,대성홀딩스,RandomForestClassifier,0.14876,0.65,0.119266,0.279052,./data/machine_model3_3개월_0.1/대성홀딩스_RandomForestClassifier.pkl
45,대성홀딩스,XGBClassifier,0.14876,0.65,0.119266,0.276758,./data/machine_model3_3개월_0.1/대성홀딩스_XGBClassifier.pkl
67,두산,RandomForestClassifier,0.53719,0.509804,0.45614,0.543037,./data/machine_model3_3개월_0.1/두산_RandomForestClassifier.pkl
68,두산,XGBClassifier,0.578512,0.578947,0.385965,0.635417,./data/machine_model3_3개월_0.1/두산_XGBClassifier.pkl
198,한국항공우주,XGBClassifier,0.669421,0.583333,0.166667,0.463532,./data/machine_model3_3개월_0.1/한국항공우주_XGBClassifier.pkl
220,한화에어로스페이스,CatBoostClassifier,0.735537,0.6,0.176471,0.534145,./data/machine_model3_3개월_0.1/한화에어로스페이스_CatBoostClassifier.pkl
221,한화에어로스페이스,RandomForestClassifier,0.743802,0.636364,0.205882,0.514875,./data/machine_model3_3개월_0.1/한화에어로스페이스_RandomForestClassifier.pkl


In [10]:
# Persist the full (unfiltered) test scores for the 63-day / 10% setting.
df_result.to_csv(f'./data/model_result_test/machine_model3_63일_0.1.csv')

## 3개월 15%

In [11]:
# Load the stored validation-set scores for the 63-day (3-month) / 15% models and display them.
result_3_15 = pd.read_csv('./data/model_result_val/machine_model3_63일_0.15.csv',index_col=0)
result_3_15

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
0,AJ네트웍스,RandomForestClassifier,0.475806,0.666667,0.030303,0.416928,./data/machine_model3_3개월_0.15/AJ네트웍스_RandomForestClassifier.pkl
1,DB,XGBClassifier,0.524194,0.508475,0.500000,0.525000,./data/machine_model3_3개월_0.15/DB_XGBClassifier.pkl
2,DSR,CatBoostClassifier,0.435484,0.666667,0.028169,0.531358,./data/machine_model3_3개월_0.15/DSR_CatBoostClassifier.pkl
3,E1,CatBoostClassifier,0.475806,0.800000,0.058824,0.453782,./data/machine_model3_3개월_0.15/E1_CatBoostClassifier.pkl
4,GS글로벌,CatBoostClassifier,0.588710,0.666667,0.075472,0.632740,./data/machine_model3_3개월_0.15/GS글로벌_CatBoostClassifier.pkl
...,...,...,...,...,...,...,...
129,효성,RandomForestClassifier,0.379032,0.789474,0.170455,0.556503,./data/machine_model3_3개월_0.15/효성_RandomForestClassifier.pkl
130,효성,XGBClassifier,0.354839,0.833333,0.113636,0.406408,./data/machine_model3_3개월_0.15/효성_XGBClassifier.pkl
131,효성,CatBoostClassifier,0.298387,0.666667,0.022727,0.420297,./data/machine_model3_3개월_0.15/효성_CatBoostClassifier.pkl
132,휴스틸,XGBClassifier,0.338710,0.714286,0.058824,0.497436,./data/machine_model3_3개월_0.15/휴스틸_XGBClassifier.pkl


In [12]:
# Score the 3-month / 15% models on the test window, then display only the rows
# whose precision is above 0.5 and not exactly 1.
df_result = test_result(3,0.15)
df_result[(df_result['precision']>0.5) &(df_result['precision'] != 1)]

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
11,LG이노텍,CatBoostClassifier,0.371901,0.75,0.142857,0.457046,./data/machine_model3_3개월_0.15/LG이노텍_CatBoostClassifier.pkl
12,LG이노텍,RandomForestClassifier,0.380165,0.8,0.142857,0.456725,./data/machine_model3_3개월_0.15/LG이노텍_RandomForestClassifier.pkl
13,LG이노텍,XGBClassifier,0.371901,0.681818,0.178571,0.48713,./data/machine_model3_3개월_0.15/LG이노텍_XGBClassifier.pkl
14,LIG넥스원,XGBClassifier,0.347107,0.75,0.037037,0.560185,./data/machine_model3_3개월_0.15/LIG넥스원_XGBClassifier.pkl
23,TCC스틸,XGBClassifier,0.330579,0.571429,0.04878,0.43621,./data/machine_model3_3개월_0.15/TCC스틸_XGBClassifier.pkl
32,대웅제약,RandomForestClassifier,0.876033,0.666667,0.125,0.607143,./data/machine_model3_3개월_0.15/대웅제약_RandomForestClassifier.pkl
45,두산,CatBoostClassifier,0.603306,0.666667,0.040816,0.480159,./data/machine_model3_3개월_0.15/두산_CatBoostClassifier.pkl
46,두산,RandomForestClassifier,0.619835,0.588235,0.204082,0.486678,./data/machine_model3_3개월_0.15/두산_RandomForestClassifier.pkl
47,두산,XGBClassifier,0.619835,0.565217,0.265306,0.60941,./data/machine_model3_3개월_0.15/두산_XGBClassifier.pkl
51,디티알오토모티브,CatBoostClassifier,0.173554,0.666667,0.019802,0.491584,./data/machine_model3_3개월_0.15/디티알오토모티브_CatBoostClassifier.pkl


In [13]:
# Persist the full (unfiltered) test scores for the 63-day / 15% setting.
df_result.to_csv(f'./data/model_result_test/machine_model3_63일_0.15.csv')

## 6개월 5%

In [14]:
# Load the stored validation-set scores for the 126-day (6-month) / 5% models and display them.
result_6_5 = pd.read_csv('./data/model_result_val/machine_model3_126일_0.05.csv',index_col=0)
result_6_5

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
0,BGF,RandomForestClassifier,0.583333,0.555556,0.535714,0.641741,./data/machine_model3_6개월_0.05/BGF_RandomForestClassifier.pkl
1,BGF,XGBClassifier,0.650000,0.666667,0.500000,0.583705,./data/machine_model3_6개월_0.05/BGF_XGBClassifier.pkl
2,BGF,CatBoostClassifier,0.616667,0.600000,0.535714,0.615513,./data/machine_model3_6개월_0.05/BGF_CatBoostClassifier.pkl
3,CJ CGV,RandomForestClassifier,0.483333,0.703704,0.452381,0.517857,./data/machine_model3_6개월_0.05/CJ CGV_RandomForestClassifier.pkl
4,CJ CGV,XGBClassifier,0.483333,0.739130,0.404762,0.650794,./data/machine_model3_6개월_0.05/CJ CGV_XGBClassifier.pkl
...,...,...,...,...,...,...,...
297,효성ITX,XGBClassifier,0.450000,0.666667,0.058824,0.484163,./data/machine_model3_6개월_0.05/효성ITX_XGBClassifier.pkl
298,효성ITX,CatBoostClassifier,0.466667,0.750000,0.088235,0.514706,./data/machine_model3_6개월_0.05/효성ITX_CatBoostClassifier.pkl
299,후성,RandomForestClassifier,0.716667,0.829268,0.772727,0.784801,./data/machine_model3_6개월_0.05/후성_RandomForestClassifier.pkl
300,후성,XGBClassifier,0.766667,0.916667,0.750000,0.803267,./data/machine_model3_6개월_0.05/후성_XGBClassifier.pkl


In [15]:
# Score the 6-month / 5% models on the test window, then display only the rows
# whose precision is above 0.5 and not exactly 1.
df_result = test_result(6,0.05)
df_result[(df_result['precision']>0.5) &(df_result['precision'] != 1)]

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
46,TCC스틸,CatBoostClassifier,0.206612,0.96,0.201681,0.317227,./data/machine_model3_6개월_0.05/TCC스틸_CatBoostClassifier.pkl
47,TCC스틸,RandomForestClassifier,0.264463,0.96875,0.260504,0.338235,./data/machine_model3_6개월_0.05/TCC스틸_RandomForestClassifier.pkl
48,TCC스틸,XGBClassifier,0.264463,0.96875,0.260504,0.321429,./data/machine_model3_6개월_0.05/TCC스틸_XGBClassifier.pkl
67,고려아연,CatBoostClassifier,0.231405,0.666667,0.082474,0.452749,./data/machine_model3_6개월_0.05/고려아연_CatBoostClassifier.pkl
68,고려아연,RandomForestClassifier,0.305785,0.809524,0.175258,0.408505,./data/machine_model3_6개월_0.05/고려아연_RandomForestClassifier.pkl
69,고려아연,XGBClassifier,0.289256,0.73913,0.175258,0.462414,./data/machine_model3_6개월_0.05/고려아연_XGBClassifier.pkl
102,대한방직,CatBoostClassifier,0.958678,0.966667,0.991453,0.379274,./data/machine_model3_6개월_0.05/대한방직_CatBoostClassifier.pkl
103,대한방직,RandomForestClassifier,0.950413,0.966387,0.982906,0.377137,./data/machine_model3_6개월_0.05/대한방직_RandomForestClassifier.pkl
104,대한방직,XGBClassifier,0.950413,0.966387,0.982906,0.38141,./data/machine_model3_6개월_0.05/대한방직_XGBClassifier.pkl
112,동부건설,CatBoostClassifier,0.636364,0.666667,0.580645,0.685621,./data/machine_model3_6개월_0.05/동부건설_CatBoostClassifier.pkl


In [16]:
# Persist the full (unfiltered) test scores for the 126-day / 5% setting.
df_result.to_csv(f'./data/model_result_test/machine_model3_126일_0.05.csv')

## 6개월 10%

In [37]:
# Load the stored validation-set scores for the 126-day (6-month) / 10% models and display them.
result_6_10 = pd.read_csv('./data/model_result_val/machine_model3_126일_0.1.csv',index_col=0)
result_6_10

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
0,BYC,RandomForestClassifier,0.133333,0.833333,0.089286,0.294643,./data/machine_model3_6개월_0.1/BYC_RandomForestClassifier.pkl
1,BYC,CatBoostClassifier,0.116667,0.714286,0.089286,0.294643,./data/machine_model3_6개월_0.1/BYC_CatBoostClassifier.pkl
2,DB,XGBClassifier,0.833333,0.980392,0.847458,0.830508,./data/machine_model3_6개월_0.1/DB_XGBClassifier.pkl
3,DI동일,RandomForestClassifier,0.416667,0.960000,0.413793,0.448276,./data/machine_model3_6개월_0.1/DI동일_RandomForestClassifier.pkl
4,DI동일,XGBClassifier,0.416667,0.960000,0.413793,0.387931,./data/machine_model3_6개월_0.1/DI동일_XGBClassifier.pkl
...,...,...,...,...,...,...,...
186,혜인,XGBClassifier,0.566667,0.714286,0.172414,0.633482,./data/machine_model3_6개월_0.1/혜인_XGBClassifier.pkl
187,혜인,CatBoostClassifier,0.533333,0.555556,0.172414,0.615128,./data/machine_model3_6개월_0.1/혜인_CatBoostClassifier.pkl
188,황금에스티,RandomForestClassifier,0.066667,0.666667,0.035088,0.327485,./data/machine_model3_6개월_0.1/황금에스티_RandomForestClassifier.pkl
189,황금에스티,CatBoostClassifier,0.066667,0.666667,0.035088,0.327485,./data/machine_model3_6개월_0.1/황금에스티_CatBoostClassifier.pkl


In [18]:
# Score the 6-month / 10% models on the test window, then display only the rows
# whose precision is above 0.5 and not exactly 1.
df_result = test_result(6,0.1)
df_result[(df_result['precision']>0.5) &(df_result['precision'] != 1)]

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
0,BYC,CatBoostClassifier,0.363636,0.583333,0.088608,0.475889,./data/machine_model3_6개월_0.1/BYC_CatBoostClassifier.pkl
37,TCC스틸,CatBoostClassifier,0.206612,0.956522,0.188034,0.331197,./data/machine_model3_6개월_0.1/TCC스틸_CatBoostClassifier.pkl
38,TCC스틸,RandomForestClassifier,0.206612,0.92,0.196581,0.339744,./data/machine_model3_6개월_0.1/TCC스틸_RandomForestClassifier.pkl
39,TCC스틸,XGBClassifier,0.190083,0.913043,0.179487,0.326923,./data/machine_model3_6개월_0.1/TCC스틸_XGBClassifier.pkl
77,동부건설,CatBoostClassifier,0.661157,0.588235,0.425532,0.589707,./data/machine_model3_6개월_0.1/동부건설_CatBoostClassifier.pkl
128,이수화학,CatBoostClassifier,0.487603,0.666667,0.031746,0.52942,./data/machine_model3_6개월_0.1/이수화학_CatBoostClassifier.pkl
129,이수화학,RandomForestClassifier,0.487603,0.571429,0.063492,0.532704,./data/machine_model3_6개월_0.1/이수화학_RandomForestClassifier.pkl
131,일진머티리얼즈,CatBoostClassifier,0.818182,0.951923,0.853448,0.282759,./data/machine_model3_6개월_0.1/일진머티리얼즈_CatBoostClassifier.pkl
132,일진머티리얼즈,RandomForestClassifier,0.818182,0.951923,0.853448,0.251724,./data/machine_model3_6개월_0.1/일진머티리얼즈_RandomForestClassifier.pkl
133,일진머티리얼즈,XGBClassifier,0.826446,0.952381,0.862069,0.324138,./data/machine_model3_6개월_0.1/일진머티리얼즈_XGBClassifier.pkl


In [19]:
# Persist the full (unfiltered) test scores for the 126-day / 10% setting.
df_result.to_csv(f'./data/model_result_test/machine_model3_126일_0.1.csv')

## 6개월 15%

In [22]:
# Load the stored validation-set scores for the 126-day (6-month) / 15% models and display them.
result_6_15 = pd.read_csv('./data/model_result_val/machine_model3_126일_0.15.csv',index_col=0)
result_6_15

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
0,BYC,RandomForestClassifier,0.150000,0.571429,0.076923,0.352163,./data/machine_model3_6개월_0.15/BYC_RandomForestClassifier.pkl
1,BYC,CatBoostClassifier,0.150000,0.571429,0.076923,0.347356,./data/machine_model3_6개월_0.15/BYC_CatBoostClassifier.pkl
2,DB,RandomForestClassifier,0.833333,0.923077,0.888889,0.500000,./data/machine_model3_6개월_0.15/DB_RandomForestClassifier.pkl
3,DB,XGBClassifier,0.800000,0.920000,0.851852,0.537037,./data/machine_model3_6개월_0.15/DB_XGBClassifier.pkl
4,DB,CatBoostClassifier,0.833333,0.923077,0.888889,0.478395,./data/machine_model3_6개월_0.15/DB_CatBoostClassifier.pkl
...,...,...,...,...,...,...,...
155,혜인,RandomForestClassifier,0.550000,0.538462,0.250000,0.559152,./data/machine_model3_6개월_0.15/혜인_RandomForestClassifier.pkl
156,혜인,XGBClassifier,0.583333,0.714286,0.178571,0.666295,./data/machine_model3_6개월_0.15/혜인_XGBClassifier.pkl
157,혜인,CatBoostClassifier,0.550000,0.555556,0.178571,0.608259,./data/machine_model3_6개월_0.15/혜인_CatBoostClassifier.pkl
158,황금에스티,RandomForestClassifier,0.133333,0.666667,0.037736,0.566038,./data/machine_model3_6개월_0.15/황금에스티_RandomForestClassifier.pkl


In [23]:
# Score the 6-month / 15% models on the test window, then display only the rows
# whose precision is above 0.5 and not exactly 1.
df_result = test_result(6,0.15)
df_result[(df_result['precision']>0.5) &(df_result['precision'] != 1)]

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
37,TCC스틸,CatBoostClassifier,0.198347,0.952381,0.172414,0.293103,./data/machine_model3_6개월_0.15/TCC스틸_CatBoostClassifier.pkl
38,TCC스틸,RandomForestClassifier,0.206612,0.916667,0.189655,0.289655,./data/machine_model3_6개월_0.15/TCC스틸_RandomForestClassifier.pkl
39,TCC스틸,XGBClassifier,0.173554,0.863636,0.163793,0.256897,./data/machine_model3_6개월_0.15/TCC스틸_XGBClassifier.pkl
112,이수화학,RandomForestClassifier,0.504132,0.571429,0.065574,0.509973,./data/machine_model3_6개월_0.15/이수화학_RandomForestClassifier.pkl
116,일진머티리얼즈,CatBoostClassifier,0.528926,0.859649,0.5,0.531943,./data/machine_model3_6개월_0.15/일진머티리얼즈_CatBoostClassifier.pkl
117,일진머티리얼즈,RandomForestClassifier,0.727273,0.803738,0.877551,0.565439,./data/machine_model3_6개월_0.15/일진머티리얼즈_RandomForestClassifier.pkl
118,일진머티리얼즈,XGBClassifier,0.702479,0.803922,0.836735,0.535936,./data/machine_model3_6개월_0.15/일진머티리얼즈_XGBClassifier.pkl
122,진양산업,CatBoostClassifier,0.14876,0.875,0.06422,0.753823,./data/machine_model3_6개월_0.15/진양산업_CatBoostClassifier.pkl
123,진양산업,RandomForestClassifier,0.14876,0.8,0.073394,0.691131,./data/machine_model3_6개월_0.15/진양산업_RandomForestClassifier.pkl
124,진양산업,XGBClassifier,0.140496,0.777778,0.06422,0.758028,./data/machine_model3_6개월_0.15/진양산업_XGBClassifier.pkl


In [24]:
# Persist the full (unfiltered) test scores for the 126-day / 15% setting.
df_result.to_csv(f'./data/model_result_test/machine_model3_126일_0.15.csv')

# 3월 30일 투자(시연용)

In [12]:
# Pick one model per company from the stored 63-day / 5% test scores:
# keep rows with 0.5 < precision != 1, rank by precision (highest first),
# and keep only the best-ranked row of each company.
df_result = pd.read_csv('./data/model_result_test/machine_model3_63일_0.05.csv', index_col=0)
df_result = (
    df_result.loc[(df_result['precision'] > 0.5) & (df_result['precision'] != 1)]
    .sort_values(by='precision', ascending=False)
    .drop_duplicates(subset='회사이름')
)
df_result

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
43,LG이노텍,RandomForestClassifier,0.545455,0.76,0.606383,0.500197,./data/machine_model3_3개월_0.05/LG이노텍_RandomForestClassifier.pkl
199,사조동아원,XGBClassifier,0.809917,0.75,0.12,0.627917,./data/machine_model3_3개월_0.05/사조동아원_XGBClassifier.pkl
96,광전자,CatBoostClassifier,0.545455,0.75,0.101695,0.528431,./data/machine_model3_3개월_0.05/광전자_CatBoostClassifier.pkl
322,진양산업,XGBClassifier,0.115702,0.714286,0.045455,0.440909,./data/machine_model3_3개월_0.05/진양산업_XGBClassifier.pkl
407,한화에어로스페이스,CatBoostClassifier,0.661157,0.666667,0.177778,0.466959,./data/machine_model3_3개월_0.05/한화에어로스페이스_CatBoostClassifier.pkl
187,백산,XGBClassifier,0.545455,0.666667,0.135593,0.54743,./data/machine_model3_3개월_0.05/백산_XGBClassifier.pkl
166,두산,XGBClassifier,0.586777,0.636364,0.538462,0.588736,./data/machine_model3_3개월_0.05/두산_XGBClassifier.pkl
81,TCC스틸,XGBClassifier,0.305785,0.619048,0.146067,0.459621,./data/machine_model3_3개월_0.05/TCC스틸_XGBClassifier.pkl
8,DB하이텍,CatBoostClassifier,0.603306,0.605769,0.9,0.477871,./data/machine_model3_3개월_0.05/DB하이텍_CatBoostClassifier.pkl
304,일신방직,CatBoostClassifier,0.46281,0.6,0.045455,0.520937,./data/machine_model3_3개월_0.05/일신방직_CatBoostClassifier.pkl


## 투자 종목 반환 함수: text_invest_result

In [6]:
def text_invest_result(month,period_rate,date):
    """Return the companies whose best saved model predicts class 1 on ``date``.

    The evaluation table for the given horizon and ``period_rate`` is loaded,
    restricted to models with precision above 0.5 (rows with precision exactly
    1 are excluded), and reduced to the highest-precision model per company.
    Each remaining model is then given that company's min-max-scaled
    news-keyword counts for ``date`` and asked for a prediction.

    Args:
        month: Horizon in months; ``month * 21`` is the day count used in the
            results file name.
        period_rate: Rate that appears in the results file name (e.g. 0.05).
        date: Date label to predict for. It must exist in the price index
            returned by ``stock_price``, whose default range ends on
            2022-03-31.

    Returns:
        tuple: ``(text, names)`` where ``names`` is the list of selected
        company names (in descending precision order) and ``text`` is the
        same names, each followed by a newline.

    Raises:
        KeyError: If ``date`` is not in the merged keyword/price index.
        FileNotFoundError: If the results CSV, a keyword CSV or a model file
            is missing.
    """
    window_size = month * 21
    df_result = pd.read_csv(
        f'./data/model_result_test/machine_model3_{window_size}일_{period_rate}.csv',
        index_col=0,
    )
    df_result = df_result[(df_result['precision'] > 0.5) & (df_result['precision'] != 1)]
    # Sorting first makes drop_duplicates keep the highest-precision model.
    df_result = (
        df_result.sort_values(by='precision', ascending=False)
        .drop_duplicates(subset='회사이름')
    )

    # Directory holding one news-keyword frequency CSV per company.
    path = './data/데이터_뉴스키워드빈도/'

    invest_lst = []
    # Company names are unique after drop_duplicates, so each row already
    # carries the model path for that company (column '모델주소').
    for corp_name, model_path in zip(df_result['회사이름'], df_result['모델주소']):
        code = corp_code(corp_name)
        df_p = stock_price(code)

        df_count = pd.read_csv(os.path.join(path, corp_name + '.csv'), index_col=0)
        df_count.index = pd.DatetimeIndex(df_count.index)
        df_count = mscaler(df_count)
        last_col = df_count.columns[-1]

        df_merge = merge(df_count, df_p)
        # Feature row for the requested date: keyword columns only; the merged
        # 'Close' column sits after last_col and is therefore excluded.
        x_invest = np.array(df_merge.loc[date, :last_col]).reshape(1, -1)

        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load model files produced by this project.
        model = joblib.load(model_path)
        if model.predict(x_invest)[0] == 1:
            invest_lst.append(corp_name)

    a = ''.join(f'{name}\n' for name in invest_lst)
    return a, invest_lst

### 3개월 5%

In [16]:
# Companies selected for 2022-03-30 by the 3-month / period_rate=0.05 models.
text_invest_result(3,0.05,'2022-03-30')

('LG이노텍\n롯데칠성\n영원무역\nHSD엔진\n', ['LG이노텍', '롯데칠성', '영원무역', 'HSD엔진'])

In [79]:
# Look up the zero-padded 6-digit ticker code for LG이노텍.
code = corp_code('LG이노텍')

In [None]:
# Fetch price data for `code` starting 2022-03-30.
# NOTE(review): the end bound is the month-only string '2022-06' — confirm how
# FinanceDataReader interprets it.
df_p = fdr.DataReader(code,'2022-03-30','2022-06')

In [6]:
# 63-day / 0.05 results: precision above 0.5 (but not exactly 1), best model
# per company, highest precision first.
df_result = (
    pd.read_csv('./data/model_result_test/machine_model3_63일_0.05.csv', index_col=0)
    .loc[lambda d: d['precision'].gt(0.5) & d['precision'].ne(1)]
    .sort_values(by='precision', ascending=False)
    .drop_duplicates(subset='회사이름')
)
df_result

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
43,LG이노텍,RandomForestClassifier,0.545455,0.76,0.606383,0.500197,./data/machine_model3_3개월_0.05/LG이노텍_RandomForestClassifier.pkl
199,사조동아원,XGBClassifier,0.809917,0.75,0.12,0.627917,./data/machine_model3_3개월_0.05/사조동아원_XGBClassifier.pkl
96,광전자,CatBoostClassifier,0.545455,0.75,0.101695,0.528431,./data/machine_model3_3개월_0.05/광전자_CatBoostClassifier.pkl
322,진양산업,XGBClassifier,0.115702,0.714286,0.045455,0.440909,./data/machine_model3_3개월_0.05/진양산업_XGBClassifier.pkl
407,한화에어로스페이스,CatBoostClassifier,0.661157,0.666667,0.177778,0.466959,./data/machine_model3_3개월_0.05/한화에어로스페이스_CatBoostClassifier.pkl
187,백산,XGBClassifier,0.545455,0.666667,0.135593,0.54743,./data/machine_model3_3개월_0.05/백산_XGBClassifier.pkl
166,두산,XGBClassifier,0.586777,0.636364,0.538462,0.588736,./data/machine_model3_3개월_0.05/두산_XGBClassifier.pkl
81,TCC스틸,XGBClassifier,0.305785,0.619048,0.146067,0.459621,./data/machine_model3_3개월_0.05/TCC스틸_XGBClassifier.pkl
8,DB하이텍,CatBoostClassifier,0.603306,0.605769,0.9,0.477871,./data/machine_model3_3개월_0.05/DB하이텍_CatBoostClassifier.pkl
304,일신방직,CatBoostClassifier,0.46281,0.6,0.045455,0.520937,./data/machine_model3_3개월_0.05/일신방직_CatBoostClassifier.pkl


### 3개월 10%

In [73]:
# Companies selected for 2022-03-30 by the 3-month / period_rate=0.1 models.
text_invest_result(3,0.1,'2022-03-30')

'대성홀딩스\n'

In [7]:
# 63-day / 0.1 results: precision above 0.5 (but not exactly 1), best model
# per company, highest precision first.
df_result = (
    pd.read_csv('./data/model_result_test/machine_model3_63일_0.1.csv', index_col=0)
    .loc[lambda d: d['precision'].gt(0.5) & d['precision'].ne(1)]
    .sort_values(by='precision', ascending=False)
    .drop_duplicates(subset='회사이름')
)
df_result

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
43,대성홀딩스,CatBoostClassifier,0.14876,0.65,0.119266,0.26682,./data/machine_model3_3개월_0.1/대성홀딩스_CatBoostClassifier.pkl
221,한화에어로스페이스,RandomForestClassifier,0.743802,0.636364,0.205882,0.514875,./data/machine_model3_3개월_0.1/한화에어로스페이스_RandomForestClassifier.pkl
40,노루페인트,XGBClassifier,0.694215,0.6,0.153846,0.573014,./data/machine_model3_3개월_0.1/노루페인트_XGBClassifier.pkl
198,한국항공우주,XGBClassifier,0.669421,0.583333,0.166667,0.463532,./data/machine_model3_3개월_0.1/한국항공우주_XGBClassifier.pkl
68,두산,XGBClassifier,0.578512,0.578947,0.385965,0.635417,./data/machine_model3_3개월_0.1/두산_XGBClassifier.pkl


### 3개월 15%

In [17]:
# 63-day / 0.15 results: precision above 0.5 (but not exactly 1), best model
# per company, highest precision first.
df_result = (
    pd.read_csv('./data/model_result_test/machine_model3_63일_0.15.csv', index_col=0)
    .loc[lambda d: d['precision'].gt(0.5) & d['precision'].ne(1)]
    .sort_values(by='precision', ascending=False)
    .drop_duplicates(subset='회사이름')
)
df_result

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
112,코리아써키트,CatBoostClassifier,0.297521,0.909091,0.106383,0.454492,./data/machine_model3_3개월_0.15/코리아써키트_CatBoostClassifier.pkl
145,후성,CatBoostClassifier,0.628099,0.833333,0.102041,0.592404,./data/machine_model3_3개월_0.15/후성_CatBoostClassifier.pkl
12,LG이노텍,RandomForestClassifier,0.380165,0.8,0.142857,0.456725,./data/machine_model3_3개월_0.15/LG이노텍_RandomForestClassifier.pkl
14,LIG넥스원,XGBClassifier,0.347107,0.75,0.037037,0.560185,./data/machine_model3_3개월_0.15/LIG넥스원_XGBClassifier.pkl
32,대웅제약,RandomForestClassifier,0.876033,0.666667,0.125,0.607143,./data/machine_model3_3개월_0.15/대웅제약_RandomForestClassifier.pkl
45,두산,CatBoostClassifier,0.603306,0.666667,0.040816,0.480159,./data/machine_model3_3개월_0.15/두산_CatBoostClassifier.pkl
51,디티알오토모티브,CatBoostClassifier,0.173554,0.666667,0.019802,0.491584,./data/machine_model3_3개월_0.15/디티알오토모티브_CatBoostClassifier.pkl
89,아세아시멘트,XGBClassifier,0.702479,0.666667,0.054054,0.534266,./data/machine_model3_3개월_0.15/아세아시멘트_XGBClassifier.pkl
23,TCC스틸,XGBClassifier,0.330579,0.571429,0.04878,0.43621,./data/machine_model3_3개월_0.15/TCC스틸_XGBClassifier.pkl


In [74]:
# Companies selected for 2022-03-30 by the 3-month / period_rate=0.15 models.
text_invest_result(3,0.15,'2022-03-30')

'LG이노텍\n'

### 6개월 5%

In [75]:
# Companies selected for 2022-03-30 by the 6-month / period_rate=0.05 models.
text_invest_result(6,0.05,'2022-03-30')

'일진머티리얼즈\n후성\n대한방직\n진양산업\n한국가스공사\n한세실업\n'

In [8]:
# 126-day / 0.05 results: precision above 0.5 (but not exactly 1), best model
# per company, highest precision first.
df_result = (
    pd.read_csv('./data/model_result_test/machine_model3_126일_0.05.csv', index_col=0)
    .loc[lambda d: d['precision'].gt(0.5) & d['precision'].ne(1)]
    .sort_values(by='precision', ascending=False)
    .drop_duplicates(subset='회사이름')
)
df_result

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
197,일진머티리얼즈,CatBoostClassifier,0.900826,0.990909,0.908333,0.366667,./data/machine_model3_6개월_0.05/일진머티리얼즈_CatBoostClassifier.pkl
300,후성,RandomForestClassifier,0.661157,0.973333,0.651786,0.818452,./data/machine_model3_6개월_0.05/후성_RandomForestClassifier.pkl
47,TCC스틸,RandomForestClassifier,0.264463,0.96875,0.260504,0.338235,./data/machine_model3_6개월_0.05/TCC스틸_RandomForestClassifier.pkl
102,대한방직,CatBoostClassifier,0.958678,0.966667,0.991453,0.379274,./data/machine_model3_6개월_0.05/대한방직_CatBoostClassifier.pkl
209,진양산업,RandomForestClassifier,0.190083,0.944444,0.149123,0.625313,./data/machine_model3_6개월_0.05/진양산업_RandomForestClassifier.pkl
68,고려아연,RandomForestClassifier,0.305785,0.809524,0.175258,0.408505,./data/machine_model3_6개월_0.05/고려아연_RandomForestClassifier.pkl
196,이수화학,CatBoostClassifier,0.413223,0.7,0.093333,0.547536,./data/machine_model3_6개월_0.05/이수화학_CatBoostClassifier.pkl
112,동부건설,CatBoostClassifier,0.636364,0.666667,0.580645,0.685621,./data/machine_model3_6개월_0.05/동부건설_CatBoostClassifier.pkl
233,한국가스공사,RandomForestClassifier,0.512397,0.625,0.362319,0.532051,./data/machine_model3_6개월_0.05/한국가스공사_RandomForestClassifier.pkl
255,한세실업,RandomForestClassifier,0.528926,0.592593,0.258065,0.581875,./data/machine_model3_6개월_0.05/한세실업_RandomForestClassifier.pkl


### 6개월 10%

In [76]:
# Companies selected for 2022-03-30 by the 6-month / period_rate=0.1 models.
text_invest_result(6,0.1,'2022-03-30')

'진양산업\n'

In [9]:
# 126-day / 0.1 results: precision above 0.5 (but not exactly 1), best model
# per company, highest precision first.
df_result = (
    pd.read_csv('./data/model_result_test/machine_model3_126일_0.1.csv', index_col=0)
    .loc[lambda d: d['precision'].gt(0.5) & d['precision'].ne(1)]
    .sort_values(by='precision', ascending=False)
    .drop_duplicates(subset='회사이름')
)
df_result

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
37,TCC스틸,CatBoostClassifier,0.206612,0.956522,0.188034,0.331197,./data/machine_model3_6개월_0.1/TCC스틸_CatBoostClassifier.pkl
133,일진머티리얼즈,XGBClassifier,0.826446,0.952381,0.862069,0.324138,./data/machine_model3_6개월_0.1/일진머티리얼즈_XGBClassifier.pkl
141,진양산업,RandomForestClassifier,0.165289,0.923077,0.107143,0.797619,./data/machine_model3_6개월_0.1/진양산업_RandomForestClassifier.pkl
190,후성,XGBClassifier,0.132231,0.833333,0.045872,0.574541,./data/machine_model3_6개월_0.1/후성_XGBClassifier.pkl
128,이수화학,CatBoostClassifier,0.487603,0.666667,0.031746,0.52942,./data/machine_model3_6개월_0.1/이수화학_CatBoostClassifier.pkl
77,동부건설,CatBoostClassifier,0.661157,0.588235,0.425532,0.589707,./data/machine_model3_6개월_0.1/동부건설_CatBoostClassifier.pkl
0,BYC,CatBoostClassifier,0.363636,0.583333,0.088608,0.475889,./data/machine_model3_6개월_0.1/BYC_CatBoostClassifier.pkl


### 6개월 15%

In [18]:
# 126-day / 0.15 results: precision above 0.5 (but not exactly 1), best model
# per company, highest precision first.
df_result = (
    pd.read_csv('./data/model_result_test/machine_model3_126일_0.15.csv', index_col=0)
    .loc[lambda d: d['precision'].gt(0.5) & d['precision'].ne(1)]
    .sort_values(by='precision', ascending=False)
    .drop_duplicates(subset='회사이름')
)
df_result

Unnamed: 0,회사이름,모델이름,accuracy,precision,recall,roc_auc,모델주소
37,TCC스틸,CatBoostClassifier,0.198347,0.952381,0.172414,0.293103,./data/machine_model3_6개월_0.15/TCC스틸_CatBoostClassifier.pkl
122,진양산업,CatBoostClassifier,0.14876,0.875,0.06422,0.753823,./data/machine_model3_6개월_0.15/진양산업_CatBoostClassifier.pkl
116,일진머티리얼즈,CatBoostClassifier,0.528926,0.859649,0.5,0.531943,./data/machine_model3_6개월_0.15/일진머티리얼즈_CatBoostClassifier.pkl
112,이수화학,RandomForestClassifier,0.504132,0.571429,0.065574,0.509973,./data/machine_model3_6개월_0.15/이수화학_RandomForestClassifier.pkl


In [77]:
# Companies selected for 2022-03-30 by the 6-month / period_rate=0.15 models.
text_invest_result(6,0.15,'2022-03-30')

'진양산업\n일진머티리얼즈\n'

# 모델 성능 확인(2022-01-03)

In [28]:
# Companies selected for 2022-01-05 by the 3-month / period_rate=0.05 models.
text_invest_result(3,0.05,'2022-01-05')

('한화에어로스페이스\n두산\nTCC스틸\nDB하이텍\n신원\n일진머티리얼즈\n롯데칠성\nHSD엔진\n',
 ['한화에어로스페이스', '두산', 'TCC스틸', 'DB하이텍', '신원', '일진머티리얼즈', '롯데칠성', 'HSD엔진'])

In [27]:
# Companies selected for 2022-01-05 by the 3-month / period_rate=0.1 models.
text_invest_result(3,0.1,'2022-01-05')

('한화에어로스페이스\n두산\n', ['한화에어로스페이스', '두산'])

In [88]:
# Closing prices of LG이노텍 for the first 63 rows returned for the range
# 2022-03-15 to 2022-06-30, kept as a one-column DataFrame.
code = corp_code('LG이노텍')
df_p = fdr.DataReader(code, '2022-03-15', '2022-06-30')['Close'].iloc[:63].to_frame()
df_p


Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-03-15,341000
2022-03-16,362500
2022-03-17,370000
2022-03-18,382000
2022-03-21,405000
...,...
2022-06-08,377500
2022-06-09,376500
2022-06-10,378000
2022-06-13,372500


## check_with_BM()

In [7]:
def check_with_BM(month,period_rate,bgn_date):
    """Compare the model-selected stocks' return with the KS11 index.

    Stocks are chosen by ``text_invest_result(month, period_rate, bgn_date)``.
    For each of them, and for the 'KS11' index, the return is measured between
    the first and the last close among the first ``month * 21`` rows of daily
    data starting at ``bgn_date`` (63 rows for 3 months, 126 for 6). If fewer
    rows are available, the last available row is used.

    Args:
        month: Holding period in months.
        period_rate: Rate identifying which set of saved models to use.
        bgn_date: Buy date; also the date the models predict for.

    Returns:
        pandas.DataFrame: One row per stock with columns '종목명' (name),
        '수익률' (stock return), '평균수익률' (mean return of all listed
        stocks), ``f'BM{month}'`` (index return) and 'BM대비 수익률'
        (mean return minus index return). The frame is empty when no stock
        was selected.
    """
    _, corp_lst = text_invest_result(month, period_rate, bgn_date)

    horizon = month * 21
    bm_col = f'BM{month}'

    if not corp_lst:
        # Previously this case fell through to an unbound `df_BM`.
        print('선출된 종목이 없습니다.')
        return pd.DataFrame(
            columns=['종목명', '수익률', '평균수익률', bm_col, 'BM대비 수익률']
        )

    # The benchmark does not depend on the stock, so fetch it once.
    index_close = fdr.DataReader('KS11', bgn_date)['Close'].loc[bgn_date:].iloc[:horizon]
    benchmark = (index_close.iloc[-1] - index_close.iloc[0]) / index_close.iloc[0]

    names = []
    change_lst = []
    for corp in corp_lst:
        df_p = fdr.DataReader(corp_code(corp), bgn_date)
        if df_p.empty:
            # Keep names and returns aligned by skipping the stock entirely.
            print(f'{corp}: 주가 데이터가 없어 제외합니다.')
            continue
        close = df_p['Close'].iloc[:horizon]
        names.append(corp)
        change_lst.append((close.iloc[-1] - close.iloc[0]) / close.iloc[0])

    df_BM = pd.DataFrame(
        {'종목명': names, '수익률': pd.Series(change_lst, dtype='float64')}
    )
    df_BM['평균수익률'] = df_BM['수익률'].mean()
    df_BM[bm_col] = benchmark
    df_BM['BM대비 수익률'] = df_BM['평균수익률'] - df_BM[bm_col]

    return df_BM


In [62]:
# Companies selected for 2022-01-05 by the 3-month / period_rate=0.05 models.
text_invest_result(3,0.05,'2022-01-05')

('한화에어로스페이스\n두산\nTCC스틸\nDB하이텍\n신원\n일진머티리얼즈\n롯데칠성\nHSD엔진\n',
 ['한화에어로스페이스', '두산', 'TCC스틸', 'DB하이텍', '신원', '일진머티리얼즈', '롯데칠성', 'HSD엔진'])

In [8]:
# Return of the 3-month / 0.05 picks bought on 2022-01-05 versus the KS11 index.
check_with_BM(3,0.05,'2022-01-05')

Unnamed: 0,종목명,수익률,평균수익률,BM3,BM대비 수익률
0,한화에어로스페이스,-0.003929,-0.038281,-0.085844,0.047563
1,두산,-0.198374,-0.038281,-0.085844,0.047563
2,TCC스틸,0.009174,-0.038281,-0.085844,0.047563
3,DB하이텍,-0.063172,-0.038281,-0.085844,0.047563
4,신원,-0.255556,-0.038281,-0.085844,0.047563
5,일진머티리얼즈,-0.165992,-0.038281,-0.085844,0.047563
6,롯데칠성,0.362595,-0.038281,-0.085844,0.047563
7,HSD엔진,0.009009,-0.038281,-0.085844,0.047563


In [13]:
# Return of the 6-month / 0.05 picks bought on 2022-01-05 versus the KS11 index.
check_with_BM(6,0.05,'2022-01-05')

Unnamed: 0,종목명,수익률,평균수익률,BM6,BM대비 수익률
0,일진머티리얼즈,-0.365182,-0.092445,-0.198841,0.106396
1,TCC스틸,0.114679,-0.092445,-0.198841,0.106396
2,대한방직,-0.003099,-0.092445,-0.198841,0.106396
3,영원무역,-0.116178,-0.092445,-0.198841,0.106396


In [14]:
# Return of the 3-month / 0.1 picks bought on 2022-01-05 versus the KS11 index.
check_with_BM(3,0.1,'2022-01-05')

Unnamed: 0,종목명,수익률,평균수익률,BM3,BM대비 수익률
0,한화에어로스페이스,-0.003929,-0.101152,-0.085844,-0.015308
1,두산,-0.198374,-0.101152,-0.085844,-0.015308


In [15]:
# Return of the 6-month / 0.1 picks bought on 2022-01-05 versus the KS11 index.
check_with_BM(6,0.1,'2022-01-05')

Unnamed: 0,종목명,수익률,평균수익률,BM6,BM대비 수익률
0,TCC스틸,0.114679,-0.125252,-0.198841,0.073589
1,일진머티리얼즈,-0.365182,-0.125252,-0.198841,0.073589


In [12]:
# Companies selected for 2022-01-03 by the 3-month / period_rate=0.15 models.
text_invest_result(3,0.15,'2022-01-03')

('TCC스틸\n', ['TCC스틸'])

In [13]:
# Return of the 3-month / 0.15 picks bought on 2022-01-03 versus the KS11 index.
check_with_BM(3,0.15,'2022-01-03')

Unnamed: 0,종목명,수익률,평균수익률,BM3,BM대비 수익률
0,TCC스틸,0.014423,0.014423,-0.084898,0.099321


In [14]:
# Return of the 6-month / 0.15 picks bought on 2022-01-03 versus the KS11 index.
check_with_BM(6,0.15,'2022-01-03')

Unnamed: 0,종목명,수익률,평균수익률,BM6,BM대비 수익률
0,TCC스틸,0.100962,0.100962,-0.225661,0.326623


In [8]:
# Return of the 3-month / 0.15 picks bought on 2021-12-30 versus the KS11 index.
check_with_BM(3,0.15,'2021-12-30')

Unnamed: 0,종목명,수익률,평균수익률,BM3,BM대비 수익률
0,아세아시멘트,0.413934,0.413934,-0.073363,0.487298


In [25]:
# Return of the 6-month / 0.15 picks bought on 2021-12-30 versus the KS11 index.
check_with_BM(6,0.15,'2021-12-30')

매도날짜:  2022-06-23 00:00:00
매도날짜:  2022-06-23 00:00:00


Unnamed: 0,종목명,수익률,평균수익률,BM6,BM대비 수익률
0,TCC스틸,0.182851,-0.149315,-0.22277,0.073455
1,일진머티리얼즈,-0.481481,-0.149315,-0.22277,0.073455


## 챗봇 추천 성능

In [9]:
# Return of the 3-month / 0.15 picks bought on 2021-12-29 versus the KS11 index.
check_with_BM(3,0.15,'2021-12-29')

Unnamed: 0,종목명,수익률,평균수익률,BM3,BM대비 수익률
0,코리아써키트,-0.003236,0.159515,-0.078639,0.238154
1,LIG넥스원,0.067847,0.159515,-0.078639,0.238154
2,아세아시멘트,0.413934,0.159515,-0.078639,0.238154


In [10]:
# Return of the 6-month / 0.15 picks bought on 2021-12-29 versus the KS11 index.
check_with_BM(6,0.15,'2021-12-29')

Unnamed: 0,종목명,수익률,평균수익률,BM6,BM대비 수익률
0,TCC스틸,0.366704,0.366704,-0.209365,0.576069


In [28]:
# 현대모비스,sk하이닉스
# 유니온,한진
# 한국전자홀딩스,자화전자

In [55]:
# Scratch check of the KS11 benchmark return from 2022-01-05 over the first
# 63 and 126 rows. BM3 is reduced to a scalar; BM6 is left as a one-element
# Series (no trailing .iloc[0]), which is what the cell displays.
bgn_date = '2022-01-05'
df_index = fdr.DataReader('KS11', bgn_date)['Close'].to_frame()
df_index3 = df_index.loc[bgn_date:].iloc[:63]
df_index6 = df_index.loc[bgn_date:].iloc[:126]
BM3 = ((df_index3.iloc[-1] - df_index3.iloc[0]) / df_index3.iloc[0]).iloc[0]
BM6 = (df_index6.iloc[-1] - df_index6.iloc[0]) / df_index6.iloc[0]
print(BM3)
BM6

-0.08584379665331739


Close   -0.216539
dtype: float64

In [56]:
# Broadcast the scalar BM3 into a new 'BM3' column of df_index3.
df_index3['BM3'] = BM3

In [60]:
# Mean KS11 closing value over the rows of df_index3.
np.mean(df_index3['Close'])

2752.8026984126973

# 퀀트 모델 성능

In [None]:
# NOTE(review): unfinished draft of the benchmark comparison; the BM() function
# defined in a later cell implements the same steps.
# As written this cell does not parse: the two assignments below have no
# right-hand side.
bgn_date =
end_date =

# Index closes over the chosen range, used as the benchmark.
df_index = fdr.DataReader('KS11',bgn_date,end_date)['Close'].to_frame()
# Empty placeholder list, so the loop body below never runs.
corp_lst = []

for corp in corp_lst:
    code = corp_code(corp)
    df_p = fdr.DataReader(code,bgn_date,end_date)
    # NOTE(review): .iloc is positional; if bgn_date is a date string (as in
    # the BM() calls), this slice would be rejected — .loc was probably meant.
    df_p = df_p.iloc[bgn_date:,:]['Close'].to_frame()
    # sell_date = df_p.index[-1]
    # Return between the first and last close in the range.
    change = (df_p.iloc[-1,0]-df_p.iloc[0,0])/df_p.iloc[0,0]
    # NOTE(review): change_lst is never initialised in this cell.
    change_lst.append(change)
    BM = ((df_index.iloc[-1]-df_index.iloc[0])/df_index.iloc[0]).iloc[0]

    df_BM = pd.DataFrame({'종목명':corp_lst,'수익률':change_lst})
    df_BM['평균수익률'] = np.mean(df_BM['수익률'])
    df_BM['BM'] = BM 
    df_BM['BM대비 수익률'] = df_BM['평균수익률'] - df_BM['BM']

In [37]:
def BM(bgn_date,end_date,corp_lst):
    """Compare the return of a list of stocks with the KS11 index.

    Returns are measured between the first and the last close that
    FinanceDataReader provides for ``bgn_date``..``end_date``.

    Args:
        bgn_date: Start of the evaluation range.
        end_date: End of the evaluation range.
        corp_lst: Company names; each is resolved to a ticker with
            ``corp_code``.

    Returns:
        pandas.DataFrame: One row per stock with columns '종목명' (name),
        '수익률' (stock return), '평균수익률' (mean return of all listed
        stocks), 'BM' (index return) and 'BM대비 수익률' (mean return minus
        index return). Stocks with no price data in the range are reported
        and left out; an empty ``corp_lst`` yields an empty frame.
    """
    index_close = fdr.DataReader('KS11', bgn_date, end_date)['Close']
    # Computed once, outside the loop, so it is defined even when corp_lst is
    # empty; a separate name avoids shadowing this function inside its body.
    benchmark = (index_close.iloc[-1] - index_close.iloc[0]) / index_close.iloc[0]

    names = []
    change_lst = []
    for corp in corp_lst:
        df_p = fdr.DataReader(corp_code(corp), bgn_date, end_date)
        if df_p.empty:
            # Keep names and returns aligned by skipping the stock entirely.
            print(f'{corp}: 주가 데이터가 없어 제외합니다.')
            continue
        close = df_p.loc[bgn_date:end_date, 'Close']
        names.append(corp)
        change_lst.append((close.iloc[-1] - close.iloc[0]) / close.iloc[0])

    df_BM = pd.DataFrame(
        {'종목명': names, '수익률': pd.Series(change_lst, dtype='float64')}
    )
    df_BM['평균수익률'] = df_BM['수익률'].mean()
    df_BM['BM'] = benchmark
    df_BM['BM대비 수익률'] = df_BM['평균수익률'] - df_BM['BM']

    return df_BM


In [39]:
# 볼린져 밴드
corp_lst = ['조선내화','KISCO홀딩스','삼영무역','쌍용C&E','롯데정밀화학','효성','풍산홀딩스']
df= BM('2022-01-03','2022-04-03',corp_lst)
df

# 볼린져 밴드

# 평균수익률: 5.51% 
# 코스피수익률 : -8.32%
# BM 대비 수익률: 13.83%

Unnamed: 0,종목명,수익률,평균수익률,BM,BM대비 수익률
0,조선내화,-0.004944,0.055102,-0.083285,0.138387
1,KISCO홀딩스,0.217993,0.055102,-0.083285,0.138387
2,삼영무역,-0.006944,0.055102,-0.083285,0.138387
3,쌍용C&E,0.059818,0.055102,-0.083285,0.138387
4,롯데정밀화학,0.134247,0.055102,-0.083285,0.138387
5,효성,-0.080645,0.055102,-0.083285,0.138387
6,풍산홀딩스,0.06619,0.055102,-0.083285,0.138387


In [40]:
# 마법의 공식
corp_lst =['LX인터내셔널','한솔홀딩스','깨끗한나라','한솔테크닉스','신송홀딩스','미래아이앤지']
df= BM('2022-01-03','2022-04-03',corp_lst)
df

# 마법의 공식

# 평균수익률: 9.51% 
# 코스피수익률 : -8.32%
# BM 대비 수익률: 17.83%

Unnamed: 0,종목명,수익률,평균수익률,BM,BM대비 수익률
0,LX인터내셔널,0.318182,0.095113,-0.083285,0.178398
1,한솔홀딩스,0.016878,0.095113,-0.083285,0.178398
2,깨끗한나라,0.08701,0.095113,-0.083285,0.178398
3,한솔테크닉스,-0.118557,0.095113,-0.083285,0.178398
4,신송홀딩스,0.299424,0.095113,-0.083285,0.178398
5,미래아이앤지,-0.032258,0.095113,-0.083285,0.178398


In [41]:
# famma lsv
corp_lst = ['SUN&L','한국프랜지','케이비아이동국실업','세이브존I&C','무림페이퍼']
df= BM('2022-01-03','2022-04-03',corp_lst)
df

# Fama LSV

# 평균수익률: 5.22% 
# 코스피수익률 : -8.32%
# BM 대비 수익률: 13.54%

Unnamed: 0,종목명,수익률,평균수익률,BM,BM대비 수익률
0,SUN&L,0.076621,0.052189,-0.083285,0.135474
1,한국프랜지,0.095865,0.052189,-0.083285,0.135474
2,케이비아이동국실업,-0.043157,0.052189,-0.083285,0.135474
3,세이브존I&C,0.058923,0.052189,-0.083285,0.135474
4,무림페이퍼,0.072692,0.052189,-0.083285,0.135474


In [42]:
# 절대 모멘텀
corp_lst = ['롯데쇼핑','KPX케미칼','신세계푸드','KT&G','TKG휴켐스','애경케미칼']
df= BM('2022-01-03','2022-04-03',corp_lst)
df

# 절대 모멘텀

# 평균수익률: 1.31% 
# 코스피수익률 : -8.32%
# BM 대비 수익률: 9.63%

Unnamed: 0,종목명,수익률,평균수익률,BM,BM대비 수익률
0,롯데쇼핑,0.128472,0.013099,-0.083285,0.096384
1,KPX케미칼,0.018762,0.013099,-0.083285,0.096384
2,신세계푸드,-0.044118,0.013099,-0.083285,0.096384
3,KT&G,0.025316,0.013099,-0.083285,0.096384
4,TKG휴켐스,-0.02193,0.013099,-0.083285,0.096384
5,애경케미칼,-0.027907,0.013099,-0.083285,0.096384
