In [4]:
import pandas as pd
import datetime
from datetime import date
import os
from args import rawdata_path, workdata_path
import warnings
# NOTE(review): called without a category or module filter, this silences every
# warning for the rest of the session, deprecation notices included.
warnings.filterwarnings('ignore')

# 1. Extract the target stock from the dataset

In [None]:
from args import stock_code
from etl_func.etl_data import extract_stock_df

# Work from rawdata_path while extracting (presumably extract_stock_df resolves
# its input files relative to the working directory; confirm in etl_func.etl_data).
os.chdir(rawdata_path)
stock_df = extract_stock_df(stock_code)

# Switch to workdata_path and store the extracted frame there as parquet;
# index=False leaves the DataFrame index out of the file.
os.chdir(workdata_path)
stock_parquet_name = stock_code + '_stock_df.parquet'
stock_df.to_parquet(stock_parquet_name, index=False)

# 2. Cut (segment) article text into words

In [None]:
from etl_func.cut_text import Cut_Machine
from args import (
    company, data_time, datafile_name,
    kw_title_num, kw_content_num
    )

# Run one Cut_Machine per configured article source and gather the word tables.
# The working directory is rawdata_path for the whole loop (presumably
# Cut_Machine resolves articles_source relative to it; confirm in etl_func.cut_text).
os.chdir(rawdata_path)
word_frames = []
for source in datafile_name:
    cut_machine = Cut_Machine(
        articles_source=datafile_name[source],
        data_time=data_time
        )
    # Restrict to articles matching the company keyword with the configured
    # title/content thresholds (exact rule is defined by Cut_Machine.filter_article).
    cut_machine.filter_article(
        keywords=[company],
        title_times=kw_title_num,
        content_times=kw_content_num
        )
    word_df = cut_machine.Pool_sep_all_articles()
    word_frames.append(word_df)
    print('Finish:' + source)

# Concatenate once at the end instead of growing final_df inside the loop:
# repeated pd.concat re-copies all earlier rows on every iteration, and seeding
# with an empty DataFrame relies on deprecated empty-entry dtype handling.
# With no sources configured, fall back to an empty frame as before.
final_df = (
    pd.concat(word_frames, ignore_index=True)
    if word_frames
    else pd.DataFrame()
)

# Store the combined word table under workdata_path, without the index.
os.chdir(workdata_path)
final_df.to_parquet(company + '_word_df.parquet', index=False)

# 3. Get X, Y for classification

In [None]:
from args import day_arg, cutoff_arg, features_num
from df_func.make_XY import Words_Matrix
from etl_func.etl_data import transform_stock_df, read_stop_words

# Load the artifacts written by the earlier steps from workdata_path.
os.chdir(workdata_path)
stop_words = read_stop_words(workdata_path)
word_df = pd.read_parquet(company + '_word_df.parquet')
stock_df = pd.read_parquet(stock_code + '_stock_df.parquet')

# Keep stock_df as loaded and give the transformed frame its own name.
# Rebinding stock_df to the transformed result would make every later
# transform_stock_df(stock_df, ...) call run on already-transformed data.
stock_transformed_df = transform_stock_df(stock_df, D=day_arg, cutoff=cutoff_arg)

# Build the word matrix, then take the chi-square-selected features as X
# (k=features_num) and the matrix's Y_matrix attribute as Y.
words_matrix = Words_Matrix(word_df, stock_transformed_df, data_time, stop_words)
X = words_matrix.feature_X_byChi2(k=features_num)
Y = words_matrix.Y_matrix

# 4. Try training model

In [None]:
from sklearn.model_selection import cross_val_score
from args import classifier_dict

# Baseline comparison: 5-fold cross-validation on (X, Y) for every entry of
# classifier_dict, each instantiated with no arguments.
for classifier, make_clf in classifier_dict.items():
    clf = make_clf()
    scores = cross_val_score(clf, X, Y, cv=5)
    # First the mean score rounded to 3 decimals, then the per-fold scores.
    print(classifier, ':', round(scores.mean(), 3))
    print(scores)

# 5. Find the best parameters

In [None]:
from df_func.train import create_train_function

In [None]:
# Sweep the candidate values for 'cutoff_arg' and save whatever the training
# function returns as a CSV (relative path, so it lands in the current
# working directory).
cut_list = [0, 0.01, 0.02, 0.03, 0.04]
train_by_cutoff = create_train_function('cutoff_arg')
cutoff_results = train_by_cutoff(cut_list)
pd.DataFrame(cutoff_results).to_csv('cutoff_arg.csv', index=False)

In [None]:
# Sweep the candidate values for 'day_arg' and save whatever the training
# function returns as a CSV (relative path, so it lands in the current
# working directory).
lag_list = [1, 2, 3, 4, 5]
train_by_day = create_train_function('day_arg')
day_results = train_by_day(lag_list)
pd.DataFrame(day_results).to_csv('day_arg.csv', index=False)

In [None]:
# Sweep the candidate values for 'features_num' (500 to 2500 in steps of 500)
# and save whatever the training function returns as a CSV in the current
# working directory.
k_list = list(range(500, 2501, 500))
train_k_feature = create_train_function('features_num')
results = train_k_feature(k_list)
df = pd.DataFrame(results)
df.to_csv('features_num.csv', index=False)

# 6. Train & Test 1: 過去測試未來 (train on past data, test on future data)

In [None]:
from etl_func.etl_data import transform_stock_df
from df_func.make_XY import Words_Matrix
from df_func.predict import Predict_Machine
from sklearn.linear_model import RidgeClassifier

# Date windows for the split: 2019-2020 goes to the training matrix and 2021
# to the test matrix.
train_period = (datetime.date(2019, 1, 1), datetime.date(2020, 12, 31))
test_period = (datetime.date(2021, 1, 1), datetime.date(2021, 12, 31))

# Both matrices use the same words, stop words and transform arguments;
# only the data_time window differs.
train_words_matrix = Words_Matrix(
    word_df=word_df,
    stock_df=transform_stock_df(stock_df, D=day_arg, cutoff=cutoff_arg),
    data_time=train_period,
    stop_words=stop_words,
)
test_words_matrix = Words_Matrix(
    word_df=word_df,
    stock_df=transform_stock_df(stock_df, D=day_arg, cutoff=cutoff_arg),
    data_time=test_period,
    stop_words=stop_words,
)

# Ridge classifier with a fixed regularisation strength of 0.05, handed to
# Predict_Machine together with both matrices and the feature count.
RDclf = RidgeClassifier(alpha=0.05)
ridge_machine = Predict_Machine(
    train_words_matrix=train_words_matrix,
    test_words_matrix=test_words_matrix,
    features_num=features_num,
    classifier=RDclf,
)

In [None]:
# Show the accuracy and confusion reports of ridge_machine. Both are
# Predict_Machine methods; what they print or return is defined in
# df_func.predict (not visible here).
ridge_machine.show_accuracy()
ridge_machine.show_confusion()

# 7. Train & Test 2: 移動回測 (rolling backtest)