In [386]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm
import sklearn
import sklearn.preprocessing

In [387]:
# Read the three input CSVs; each one is cp949-encoded and uses its first column as the index.
stock, train, answer = (
    pd.read_csv(csv_name, index_col=0, encoding='cp949')
    for csv_name in ('stocks.csv', 'trade_train.csv', 'answer_sheet.csv')
)

In [388]:
def datainfo(df):
    """Summarise every column of ``df`` in a small overview table.

    Returns a DataFrame with one row per column of ``df`` and the columns
    'name', 'nunique' (distinct non-null values), 'missing' (NaN count),
    'dtype' and 'value :5' (the first five distinct values).
    """
    overview = []
    for col in df.columns:
        series = df[col]
        overview.append((
            col,
            series.nunique(),
            series.isna().sum(),
            series.dtype,
            series.unique()[:5],
        ))
    return pd.DataFrame(overview, columns=['name', 'nunique', 'missing', 'dtype', 'value :5'])

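# datainfo(stock)
# -> the '기준일자' (reference date) column is stored as an int, e.g. 20190701
# datainfo(train)
# -> the '기준년월' (reference year-month) column is stored as an int, e.g. 201907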

In [389]:
# Keep only the rows whose '20년7월TOP3대상여부' flag equals 'Y'.
stock=stock.loc[stock['20년7월TOP3대상여부'].eq('Y')]

In [390]:
# Distinct stock codes, in order of first appearance in `stock`.
ylist=stock['종목번호'].drop_duplicates().tolist()
# Stock codes recorded in `train` for group MAD01 up to and including 201907.
_list=train.loc[(train['그룹번호']=='MAD01')&(train['기준년월']<=201907),'종목번호'].tolist()

# Correlation matrix of the '거래금액_만원단위' series of two individual stocks.
np.corrcoef(
    stock.loc[stock['종목번호']=='A004020','거래금액_만원단위'],
    stock.loc[stock['종목번호']=='A000100','거래금액_만원단위'],
)

array([[1.        , 0.19000077],
       [0.19000077, 1.        ]])

In [391]:
# Group identifiers 'MAD01' through 'MAD48' (zero-padded to two digits).
grp=[f'MAD{number:02d}' for number in range(1,49)]

In [392]:
def getGroup(start,end,date,g):
    """Build one month of per-stock features plus the 0/1 label for one group.

    Reads the module-level ``stock`` and ``train`` frames.

    Parameters
    ----------
    start, end : int
        Exclusive bounds applied to ``stock['기준일자']``. For July 2019 pass
        ``start=20190700`` and ``end=20190800``.
    date : int
        Year-month (e.g. ``201907``) matched against ``train['기준년월']`` and
        written to the '기준년월' column of the output.
    g : str
        Group id matched against ``train['그룹번호']``.

    Returns
    -------
    pandas.DataFrame
        One row per combination of stock code, market and the three industry
        code columns, holding the aggregated price/volume columns, a 'binary'
        column (1 when the stock code appears in ``train`` for group ``g`` in
        month ``date``, else 0) and a constant '기준년월' column.
    """
    in_window=(stock['기준일자']>start)&(stock['기준일자']<end)

    # String aggregation names select the same pandas reductions as the
    # NumPy callables did, without the deprecation warning newer pandas
    # raises for np.mean / np.sum / np.max / np.min.
    aggregation={
        '종목시가':'mean',
        '종목고가':'max',
        '종목저가':'min',
        '종목종가':'mean',
        '거래량':'sum',
        '거래금액_만원단위':'sum'
    }
    keys=['종목번호','시장구분','표준산업구분코드_대분류','표준산업구분코드_중분류','표준산업구분코드_소분류']
    group=stock.loc[in_window].groupby(keys).agg(aggregation).reset_index()

    # Stock codes recorded for this group in this month.
    bought=train.loc[(train['그룹번호']==g)&(train['기준년월']==date),'종목번호']
    # Vectorised membership test instead of a per-row Python `in` on a list.
    group['binary']=group['종목번호'].isin(bought).astype('int64')
    group['기준년월']=date
    return group

In [393]:
def getCat(df):
    """Cast the identifier/code columns of ``df`` to pandas 'category' dtype.

    The frame is modified in place and the same object is returned.
    """
    categorical_columns=(
        '종목번호',
        '시장구분',
        '기준년월',
        '표준산업구분코드_대분류',
        '표준산업구분코드_중분류',
        '표준산업구분코드_소분류',
    )
    for column in categorical_columns:
        df[column]=df[column].astype('category')
    return df

In [394]:
def lgb_tr(X_train,y_train,X_test,idx):
    """Fit a LightGBM binary classifier and return the three best-scored test items.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, handed to ``lightgbm.Dataset``.
    X_test
        Features to score with the fitted model.
    idx : pandas.Series
        Must be named 'mainNum'. It is joined to the predictions by index
        label, so it is expected to carry the default 0..n-1 index in the
        same row order as ``X_test``.

    Returns
    -------
    tuple
        The 'mainNum' values of the three highest predictions, best first.
    """
    booster=lightgbm.train(
        {
            'objective':'binary',
            'metric':'binary_logloss',
            'verbosity':0,
            'learning_rate':0.5,
            'seed':2020
        },
        lightgbm.Dataset(X_train,y_train),
        num_boost_round=1000,
    )

    # No validation set or early stopping is configured in the call above;
    # best_iteration is passed through exactly as the booster reports it.
    scores=booster.predict(X_test,num_iteration=booster.best_iteration)

    ranked=pd.concat([idx,pd.DataFrame({'yPred':scores})],axis=1)
    top3=ranked.sort_values(by='yPred',ascending=False).head(3)

    return tuple(top3.iloc[rank]['mainNum'] for rank in range(3))

In [395]:
def _month_windows(first_month,last_month):
    """Yield ``(start, end, month)`` for each month from first_month to last_month inclusive.

    Months are YYYYMM ints; ``start``/``end`` are the exclusive day-level
    bounds getGroup expects (e.g. 20190700 and 20190800 for 201907).
    """
    month=first_month
    while month<=last_month:
        year,month_of_year=divmod(month,100)
        following=(year+1)*100+1 if month_of_year==12 else month+1
        yield month*100,following*100,month
        month=following


def result(g,first_month=201907,last_month=202007):
    """Predict the top-3 stock codes of ``last_month`` for group ``g``.

    One frame per month is built with getGroup for every month from
    ``first_month`` to ``last_month``. All months but the last form the
    training set; the rows of ``last_month`` are scored by lgb_tr.

    Parameters
    ----------
    g : str
        Group id (e.g. 'MAD01').
    first_month, last_month : int, optional
        First training month and prediction month as YYYYMM ints. The
        defaults reproduce the original fixed range 201907..202007.

    Returns
    -------
    tuple
        ``(g, code1, code2, code3)`` with the three predicted stock codes
        sorted in ascending order.
    """
    monthly=[getGroup(start,end,month,g)
             for start,end,month in _month_windows(first_month,last_month)]

    group=getCat(pd.concat(monthly,ignore_index=True)).rename(columns={
        '종목번호':'mainNum',
        '시장구분':'gubun',
        '표준산업구분코드_대분류':'dae',
        '표준산업구분코드_중분류':'jung',
        '표준산업구분코드_소분류':'so',
        '종목시가':'first',
        '종목고가':'high',
        '종목저가':'low',
        '종목종가':'end',
        '거래량':'amount',
        '거래금액_만원단위':'value',
        '기준년월':'date'
    })

    # Explicit split position: `iloc[:-n]` / `iloc[-n:]` would silently swap
    # train and test if the prediction month contributed zero rows (n == 0).
    split=len(group)-len(monthly[-1])

    # Keep the original stock codes of the prediction rows before the column
    # is replaced by integer labels below.
    idx=group['mainNum'].iloc[split:].reset_index(drop=True)

    # fit_transform already fits, so a separate fit() call per column is not needed.
    for column in ('mainNum','gubun','dae','jung','so','date'):
        group[column]=sklearn.preprocessing.LabelEncoder().fit_transform(group[column])

    features=group.drop(columns=['binary'])
    X_train=features.iloc[:split]
    y_train=group['binary'].iloc[:split]
    X_test=features.iloc[split:]

    num1,num2,num3=lgb_tr(X_train,y_train,X_test,idx)
    # Plain tuple instead of np.insert: np.insert casts the inserted group id
    # to the string width of the stock-code array and could truncate it.
    return (g,*sorted([num1,num2,num3]))

In [396]:
# Run the full train/predict pipeline once per group and collect the
# (group, code1, code2, code3) tuples in group order.
_list=[result(g) for g in grp]

In [397]:
# One row per group: the group id followed by its three predicted stock codes.
submission=pd.DataFrame(
    data=_list,
    columns=['그룹명','종목번호1','종목번호2','종목번호3'],
)
submission

Unnamed: 0,그룹명,종목번호1,종목번호2,종목번호3
0,MAD01,A005930,A035720,A096530
1,MAD02,A090430,A139050,A235980
2,MAD03,A005930,A019170,A096530
3,MAD04,A000660,A003000,A005930
4,MAD05,A000660,A007570,A096530
5,MAD06,A004020,A005380,A068270
6,MAD07,A051910,A096530,A285130
7,MAD08,A003000,A019170,A096530
8,MAD09,A000660,A003000,A005380
9,MAD10,A003000,A005930,A035420


In [398]:
# submission.to_csv('submission7.csv',index=False)