# 文字預測股票漲跌專案

***根據網路上各類新聞文章預測某一公司股價的漲跌***


台大經研所 羅偉駿

![LinkedIn](https://img.shields.io/badge/linkedin-%230077B5.svg?style=for-the-badge&logo=linkedin&logoColor=white)
![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)
![Medium](https://img.shields.io/badge/Medium-12100E?style=for-the-badge&logo=medium&logoColor=white)

In [6]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import datetime
import os
from args import rawdata_path, workdata_path
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Extract target stock from Dataset

In [9]:
from args import stock_code
from etl_func.etl_data import extract_stock_df

os.chdir(rawdata_path)
stock_df = extract_stock_df(stock_code)

os.chdir(workdata_path)
stock_df.to_parquet(stock_code + '_stock_df.parquet', index=False)
stock_df.head()

Unnamed: 0,Code,Date,Price
0,2454 聯發科,2019-01-02,210.470795
1,2454 聯發科,2019-01-03,213.334305
2,2454 聯發科,2019-01-04,207.130005
3,2454 聯發科,2019-01-07,209.039001
4,2454 聯發科,2019-01-08,211.902603


# 2. Cut Article Words

In [10]:
from etl_func.cut_text import Cut_Machine
from args import (
    company, kw_list, data_time, datafile_name,
    kw_title_num, kw_content_num
    )

os.chdir(rawdata_path)
final_df = pd.DataFrame()
for source in datafile_name:
    cut_machine = Cut_Machine(
        articles_source=datafile_name[source],
        data_time=data_time
        )
    cut_machine.filter_article(
        keywords=kw_list,
        title_times=kw_title_num,
        content_times=kw_content_num
        )
    word_df = cut_machine.Pool_sep_all_articles()
    final_df = pd.concat([final_df, word_df], ignore_index=True)
    print('Finish:' + source)

os.chdir(workdata_path)
final_df.to_parquet(company + '_word_df.parquet', index=False)

Finish:bbs
Finish:news2019
Finish:news2020
Finish:news2021
Finish:forum2019
Finish:forum2020
Finish:forum2021


# 3. Get X, Y for classification

In [18]:
from args import day_arg, cutoff_arg, features_num
from df_func.make_XY import Words_Matrix, feature_X_byChi2
from etl_func.etl_data import transform_stock_df, read_stop_words

os.chdir(rawdata_path)
stop_words = read_stop_words()

os.chdir(workdata_path)
word_df = pd.read_parquet(company + '_word_df.parquet')
stock_df = pd.read_parquet(stock_code + '_stock_df.parquet')
stock_df = transform_stock_df(stock_df, D=day_arg, cutoff=cutoff_arg)

words_matrix = Words_Matrix(word_df, stock_df, data_time, stop_words)
X, Y = words_matrix.X_matrix, words_matrix.Y_matrix
X = X[ feature_X_byChi2(X, Y, k=features_num) ]

# 4. Try training model

In [19]:
from sklearn.model_selection import cross_val_score
from args import classifier_dict

for classifier in classifier_dict:
    clf = classifier_dict[classifier]()
    scores = cross_val_score(clf, X , Y, cv = 5)
    print(classifier, ':', round(scores.mean(), 3))
    print(scores)

kNN : nan
[nan nan nan nan nan]
Ridge : 0.893
[0.89230769 0.89128205 0.89538462 0.89527721 0.89219713]
Desision Tree : 0.867
[0.88512821 0.85333333 0.8625641  0.87782341 0.85626283]
Random Forest : 0.894
[0.89948718 0.88820513 0.89333333 0.89219713 0.8963039 ]
Gradient Boosting : 0.892
[0.89538462 0.88512821 0.89230769 0.89219713 0.89322382]
MLP : 0.895
[0.90153846 0.89025641 0.89333333 0.90349076 0.88501027]


# 5. Hyperparameters Choosing

In [None]:
from df_func.train import create_train_function
os.chdir(workdata_path)

In [None]:
cut_list = [0, 0.01, 0.02, 0.03, 0.04]
train_k_feature = create_train_function('cutoff_arg')
results = train_k_feature(cut_list)
df = pd.DataFrame(results)
df.to_csv('cutoff_arg.csv', index=False)

In [None]:
lag_list = [1, 2, 3, 4, 5]
train_k_feature = create_train_function('day_arg')
results = train_k_feature(lag_list)
df = pd.DataFrame(results)
df.to_csv('day_arg.csv', index=False)

In [None]:
k_list = [500, 1000, 1500, 2000, 2500]
train_k_feature = create_train_function('features_num')
results = train_k_feature(k_list)
df = pd.DataFrame(results)
df.to_csv('features_num.csv', index=False)

# 6. Train & Test 1: 過去測試未來

In [7]:
from sklearn.linear_model import RidgeClassifier
from etl_func.etl_data import transform_stock_df
from df_func.make_XY import Words_Matrix
from df_func.predict import Predict_Machine
from args import (
    word_df, stock_df, stop_words,
    day_arg, cutoff_arg, features_num
    )

train_words_matrix = Words_Matrix(
    word_df=word_df,
    stock_df=transform_stock_df(stock_df, D=day_arg, cutoff=cutoff_arg),
    data_time=(datetime.date(2019,1,1), datetime.date(2021,5,31)),
    stop_words=stop_words
    )

test_words_matrix = Words_Matrix(
    word_df=word_df,
    stock_df=transform_stock_df(stock_df, D=day_arg, cutoff=cutoff_arg),
    data_time=(datetime.date(2021,6,1), datetime.date(2021,12,31)),
    stop_words=stop_words
    )

RDclf = RidgeClassifier(alpha = 0.05)

ridge_machine = Predict_Machine(
    train_words_matrix=train_words_matrix,
    test_words_matrix=test_words_matrix,
    features_num=features_num,
    classifier=RDclf
    )

In [19]:
print('Confusion Matrix:\n', ridge_machine.show_confusion())
print()
print('Accuracy Score:', round(ridge_machine.show_accuracy(), 3))

Confusion Matrix:
           True  False
Positive   103      1
Negative    14      0

Accuracy Score: 0.873


# 7. Train & Test 2: 移動回測

In [47]:
from sklearn.linear_model import RidgeClassifier
from etl_func.etl_data import transform_stock_df
from df_func.make_XY import Words_Matrix
from df_func.predict import Predict_Machine, Date_Machine
from args import (
    word_df, stock_df, stop_words,
    day_arg, cutoff_arg, features_num,
    data_time
    )

result_matrix = pd.DataFrame(
            [[0, 0], [0, 0]],
            index=['Positive', 'Negative'],
            columns=['True', 'False'],
            )
date_machine = Date_Machine(train_duration=5, test_duration=2, data_time=data_time)

for i in range(30):
    date_machine.index = i

    train_words_matrix = Words_Matrix(
        word_df=word_df,
        stock_df=transform_stock_df(stock_df, D=day_arg, cutoff=cutoff_arg),
        data_time=date_machine.train_date,
        stop_words=stop_words
        )

    test_words_matrix = Words_Matrix(
        word_df=word_df,
        stock_df=transform_stock_df(stock_df, D=day_arg, cutoff=cutoff_arg),
        data_time=date_machine.test_date,
        stop_words=stop_words
        )

    RDclf = RidgeClassifier(alpha = 0.05)

    ridge_machine = Predict_Machine(
        train_words_matrix=train_words_matrix,
        test_words_matrix=test_words_matrix,
        features_num=features_num,
        classifier=RDclf
        )
    print(f'{i}th Accuracy Score:', round(ridge_machine.show_accuracy(), 3))

    try:
        result_matrix += ridge_machine.show_confusion()
    except Exception as e:
        print(e)

print('Confusion Matrix in total:\n', result_matrix)

  0%|          | 0/30 [00:00<?, ?it/s]

0th Accuracy Score: 0.941


  3%|▎         | 1/30 [00:11<05:38, 11.68s/it]

1th Accuracy Score: 0.971


  7%|▋         | 2/30 [00:20<04:47, 10.27s/it]

2th Accuracy Score: 0.968


 10%|█         | 3/30 [00:30<04:23,  9.77s/it]

3th Accuracy Score: 0.938


 13%|█▎        | 4/30 [00:39<04:06,  9.50s/it]

4th Accuracy Score: 0.97


 17%|█▋        | 5/30 [00:48<03:54,  9.40s/it]

5th Accuracy Score: 0.939


 20%|██        | 6/30 [00:58<03:48,  9.54s/it]

6th Accuracy Score: 0.926


 23%|██▎       | 7/30 [01:10<03:57, 10.34s/it]

7th Accuracy Score: 1.0


 27%|██▋       | 8/30 [01:19<03:38,  9.92s/it]

Shape of passed values is (1, 1), indices imply (2, 2)
8th Accuracy Score: 1.0


 30%|███       | 9/30 [01:28<03:22,  9.63s/it]

Shape of passed values is (1, 1), indices imply (2, 2)
9th Accuracy Score: 0.958


 33%|███▎      | 10/30 [01:37<03:09,  9.46s/it]

10th Accuracy Score: 0.923


 37%|███▋      | 11/30 [01:45<02:54,  9.21s/it]

11th Accuracy Score: 0.929


 40%|████      | 12/30 [01:54<02:42,  9.04s/it]

12th Accuracy Score: 0.818


 43%|████▎     | 13/30 [02:02<02:29,  8.81s/it]

13th Accuracy Score: 0.71


 47%|████▋     | 14/30 [02:11<02:18,  8.66s/it]

14th Accuracy Score: 0.645


 50%|█████     | 15/30 [02:23<02:27,  9.84s/it]

15th Accuracy Score: 0.719


 53%|█████▎    | 16/30 [02:37<02:32, 10.88s/it]

16th Accuracy Score: 0.781


 57%|█████▋    | 17/30 [02:50<02:33, 11.77s/it]

17th Accuracy Score: 0.743


 60%|██████    | 18/30 [03:03<02:22, 11.90s/it]

18th Accuracy Score: 0.765


 63%|██████▎   | 19/30 [03:18<02:22, 13.00s/it]

19th Accuracy Score: 0.769


 67%|██████▋   | 20/30 [03:34<02:18, 13.88s/it]

20th Accuracy Score: 0.821


 70%|███████   | 21/30 [03:52<02:15, 15.11s/it]

21th Accuracy Score: 0.844


 73%|███████▎  | 22/30 [04:10<02:06, 15.86s/it]

22th Accuracy Score: 0.806


 77%|███████▋  | 23/30 [04:26<01:51, 15.95s/it]

23th Accuracy Score: 0.794


 80%|████████  | 24/30 [04:44<01:40, 16.67s/it]

24th Accuracy Score: 0.824


 83%|████████▎ | 25/30 [05:01<01:23, 16.78s/it]

25th Accuracy Score: 0.8


 87%|████████▋ | 26/30 [05:17<01:05, 16.44s/it]

26th Accuracy Score: 0.882


 90%|█████████ | 27/30 [05:34<00:49, 16.53s/it]

27th Accuracy Score: 0.935


 93%|█████████▎| 28/30 [05:52<00:34, 17.01s/it]

28th Accuracy Score: 0.909


 97%|█████████▋| 29/30 [06:10<00:17, 17.24s/it]

29th Accuracy Score: 0.943


100%|██████████| 30/30 [06:26<00:00, 12.87s/it]

Confusion Matrix in total:
           True  False
Positive   756      0
Negative   128      0



