In [1]:
import jieba as jb
import numpy as np
from gensim import corpora, models, similarities
import pandas as pd
from pprint import pprint
from sklearn.cluster import AffinityPropagation
import io
import time
import requests
import scipy.spatial
import random
from matplotlib import pyplot as plt

import argparse

import networkx as nx
import node2vec
from gensim.models import Word2Vec

import os



In [2]:
# Load the mutual-fund holdings table and keep the records whose date lies
# in [20100000, 20170000].
fundhold = pd.read_csv('data/mutualfundholding.csv')

# Convert the frame once instead of rebuilding the array for every column.
holdings = np.array(fundhold)
fund = holdings[:, 0]
date = holdings[:, 1]
stock = holdings[:, 3]
value = holdings[:, 4]

# Row positions inside the date window (dates are compared as integers).
index = np.where((date >= 20100000) & (date <= 20170000))[0]

raw_funds = fund[index]
raw_dates = date[index]
raw_stocks = stock[index]
raw_values = value[index]

# set() de-duplicates directly. The previous explicit loops reused the names
# fund/date/stock as loop variables, which overwrote the column arrays above.
set_funds = set(raw_funds)
set_dates = set(raw_dates)
set_stocks = set(raw_stocks)

# Positional orderings used to index the fund and stock axes later on.
list_funds = list(set_funds)
list_stocks = list(set_stocks)

In [3]:
# Period start markers: for every year step 20100000..20160000, the offsets
# 0, 400, 700 and 1000 are added, giving four markers per year.
select_date = np.array([
    year + month_day
    for year in range(20100000, 20170000, 10000)
    for month_day in (0, 400, 700, 1000)
])

In [4]:
# Dense holdings tensor indexed as [period, stock, fund].
weight_matrix = np.zeros((len(select_date), len(set_stocks), len(set_funds)))

# Build the id -> position lookups once. Calling list.index() for every
# record rescans the whole list each time.
fund_position = {fund_id: pos for pos, fund_id in enumerate(list_funds)}
stock_position = {stock_id: pos for pos, stock_id in enumerate(list_stocks)}

for ind in range(len(raw_funds)):
    fund_index = fund_position[raw_funds[ind]]
    stock_index = stock_position[raw_stocks[ind]]
    # Last period whose start marker is strictly smaller than the record date.
    time_index = np.where(select_date < raw_dates[ind])[0][-1]
    # A later record for the same (period, stock, fund) overwrites an earlier one.
    weight_matrix[time_index, stock_index, fund_index] = raw_values[ind]

# Prediction

## Storing Stock Data

In [4]:
def _concat_csv_dir(directory, ignore_index=False):
    """Read every file in `directory` as a GBK-encoded CSV and stack them.

    Files are read in sorted path order and concatenated in a single
    pd.concat call; growing a DataFrame with concat inside a loop copies all
    previously read rows on every iteration.

    Returns an empty DataFrame when the directory contains no files.
    """
    csv_paths = sorted(os.path.join(directory, name) for name in os.listdir(directory))
    frames = [pd.read_csv(csv_path, encoding='gbk') for csv_path in csv_paths]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=ignore_index)


# Weekly prices: one table with a fresh 0..n-1 index.
rootDir = '/data/AMC/06_stock_price_weekly/'
price_file = _concat_csv_dir(rootDir, ignore_index=True)

# Daily features: one table per sub-directory, in sorted directory order.
# The per-file row indices are kept, as before.
rootDir = '/data/AMC/03_stock_features_daily/'
fac_file = []
dir_name_list = sorted(os.listdir(rootDir))
for dir_name in dir_name_list:
    print(dir_name)
    fac_file.append(_concat_csv_dir(os.path.join(rootDir, dir_name)))

01_K_line
02_bias
03_ROC
04_amplitude
05_MACD
06_K_line_combination
07_stock_short_high_point
08_short_low_point
09_middle_high_point
10_middle_low_point


In [5]:
# Recover the weekly price column labels from the header of a single file.
rootDir = '/data/AMC/06_stock_price_weekly/'
# os.listdir() returns entries in arbitrary order; sort so that the same file
# is picked on every run.
file_name = sorted(os.listdir(rootDir))[0]
path = os.path.join(rootDir, file_name)
# nrows=0 parses only the header row, which is all that is needed here.
csv_price = pd.read_csv(path, encoding='gbk', nrows=0)
price_columns = csv_price.columns

In [6]:
# Display the column labels taken from the weekly price file.
price_columns

Index([u'FirstIndustryCode', u'TradingDay', u'FirstIndustryName', u'SecuCode',
       u'SecuAbbr', u'PrevClosePrice', u'OpenPrice', u'HighPrice', u'LowPrice',
       u'ClosePrice', u'TurnoverVolume', u'TurnoverValue', u'TurnoverDeals',
       u'TotalMV', u'IfWeekEnd'],
      dtype='object')

In [7]:
# Split the concatenated price table into one array per distinct value of
# column 3 (shown as 'SecuCode' in the printed price_columns).
a_price_file = np.array(price_file)

# Collect the row positions of every code in a single pass. The previous
# cursor-based scan jumped past the last row of the current code, so a code
# whose rows all sat before that position was never visited unless the rows
# of each code were contiguous.
rows_by_code = {}
if a_price_file.shape[0] > 0:
    for row, code in enumerate(a_price_file[:, 3]):
        rows_by_code.setdefault(code, []).append(row)

# code -> 2-D array holding all rows of that code, in original row order.
dict_all = {code: a_price_file[rows, :] for code, rows in rows_by_code.items()}

In [None]:
# dict_price is a second name for the same dict object as dict_all (no copy).
dict_price = dict_all

In [None]:
# code -> list of arrays, one array per factor table in fac_file order.
dict_feature = {}
for fac in fac_file:
    a_fac = np.array(fac)
    if a_fac.shape[0] == 0:
        continue

    # Gather the row positions of every code (column 3) in one pass. The
    # previous cursor-based scan skipped codes whose rows all preceded the
    # last row of an earlier code when rows were not contiguous per code.
    rows_by_code = {}
    for row, code in enumerate(a_fac[:, 3]):
        rows_by_code.setdefault(code, []).append(row)

    for code, rows in rows_by_code.items():
        dict_feature.setdefault(code, []).append(a_fac[rows, :])

In [None]:
# Build one wide feature array per code: a leading date column followed by
# the value columns of every factor table.
dict_feature_concat = {}
for code, factor_arrays in dict_feature.items():
    # Keep only codes that have exactly 10 factor arrays and whose first
    # array has exactly 2874 rows.
    if len(factor_arrays) != 10 or factor_arrays[0].shape[0] != 2874:
        continue

    # Column 1 of the first factor array, as an (n, 1) string column.
    date_column = factor_arrays[0][:, 1][:, np.newaxis].astype(str)

    # From each factor array, drop the first five and last two columns.
    pieces = [date_column] + [factor[:, 5:-2] for factor in factor_arrays]
    merged = np.concatenate(pieces, axis=1)

    # NOTE(review): column 20 of the merged array is removed; the reason is
    # not visible in this notebook - confirm.
    dict_feature_concat[code] = np.delete(merged, 20, 1)

In [191]:
# Write one feature CSV and one price CSV per code that has both tables.
# NOTE(review): feature_columns is not assigned in any cell visible here; it
# must already exist in the kernel for this cell to run - confirm its source.
for stock_code in dict_feature_concat:
    if stock_code in dict_price:
        prefix = 'stock_data/' + str(stock_code)

        feature_frame = pd.DataFrame(dict_feature_concat[stock_code], columns=feature_columns)
        feature_frame.to_csv(prefix + '_feature.csv', encoding='GBK', index=False)

        price_frame = pd.DataFrame(dict_price[stock_code], columns=price_columns)
        price_frame.to_csv(prefix + '_price.csv', encoding='GBK', index=False)

In [161]:
def _dates_as_int(date_column):
    """Convert values such as '2006-01-04 00:00:00' to integers like 20060104.

    Only the part before the first whitespace is used, and '-' is removed
    before the integer conversion.
    """
    return np.array(
        [int(str(raw).split()[0].replace('-', '')) for raw in date_column],
        dtype=np.int64,
    )


def get_data(start_date, end_date, stock_code):
    """Return the close prices and feature rows of one stock in a date window.

    Parameters
    ----------
    start_date, end_date : int
        Window bounds as yyyymmdd-style integers; rows with
        start_date <= date < end_date are selected.
    stock_code
        Used (via str()) to build the file names
        'stock_data/<code>_price.csv' and 'stock_data/<code>_feature.csv'.

    Returns
    -------
    (return_price, return_feature)
        return_price: 1-D array of the 'ClosePrice' values in the window.
        return_feature: 2-D array of the feature rows in the window, without
        the first column.
    """
    # The files are written with encoding='GBK', so read them the same way;
    # the default encoding fails on non-ASCII text under Python 3.
    price_csv = pd.read_csv('stock_data/' + str(stock_code) + '_price.csv', encoding='GBK')
    price_dates = _dates_as_int(price_csv['TradingDay'])
    price_rows = np.where((price_dates >= start_date) & (price_dates < end_date))[0]
    # Index positionally on the underlying array rather than by Series label.
    return_price = np.array(price_csv['ClosePrice'])[price_rows]

    feature_csv = pd.read_csv('stock_data/' + str(stock_code) + '_feature.csv', encoding='GBK')
    feature_dates = _dates_as_int(feature_csv['trading_day'])
    feature_rows = np.where((feature_dates >= start_date) & (feature_dates < end_date))[0]
    # Column 0 is skipped so only the remaining columns are returned.
    return_feature = np.array(feature_csv)[feature_rows, 1:]

    return return_price, return_feature

In [178]:
# Ad-hoc inspection: load the saved feature table for stock code 158.
feature_csv = pd.read_csv('stock_data/'+str(158)+'_feature.csv')

# Season Data

In [176]:
# Period boundaries: four markers (offsets 0, 400, 700, 1000) for each year
# 2006..2016, closed by a final 20170000 boundary.
season = [
    year * 10000 + offset
    for year in range(2006, 2017)
    for offset in (0, 400, 700, 1000)
]
season.append(20170000)

In [192]:
# For every code that has both tables, summarise each period into one row:
# column 0 is the mean close price, columns 1.. are the smoothed features.
for it in dict_feature_concat:
    if it not in dict_price:
        continue
    # 45 columns = 1 price column + 44 feature columns. Rows start as NaN so
    # a period without data is distinguishable from a genuine zero.
    season_data = np.full((len(season) - 1, 45), np.nan)
    for season_id in range(len(season) - 1):
        a_price, a_feature = get_data(season[season_id], season[season_id + 1], it)

        if len(a_price) > 0:
            season_data[season_id, 0] = a_price.mean()

        # A period can contain no rows at all; indexing a_feature[0] would
        # then raise IndexError, so such periods are left as NaN.
        if len(a_feature) > 0:
            # Running blend over the rows in order: each step keeps 40% of
            # the running value and takes 60% of the current row.
            average_feature = a_feature[0]
            for idx in range(len(a_feature)):
                average_feature = 0.4 * average_feature + 0.6 * a_feature[idx]
            season_data[season_id, 1:] = average_feature
    pd.DataFrame(season_data).to_csv('stock_data/'+str(it)+'_SeasonAll.csv', encoding='GBK', index=False)

In [187]:
# Ad-hoc check: call get_data with whatever values season_id and it currently
# hold in the kernel (they are loop variables of the season aggregation cell).
get_data(season[season_id], season[season_id+1], it)

(array([], dtype=float64), array([], shape=(0, 44), dtype=object))

In [182]:
# Display the list of period boundaries.
season

[20060000,
 20060400,
 20060700,
 20061000,
 20070000,
 20070400,
 20070700,
 20071000,
 20080000,
 20080400,
 20080700,
 20081000,
 20090000,
 20090400,
 20090700,
 20091000,
 20100000,
 20100400,
 20100700,
 20101000,
 20110000,
 20110400,
 20110700,
 20111000,
 20120000,
 20120400,
 20120700,
 20121000,
 20130000,
 20130400,
 20130700,
 20131000,
 20140000,
 20140400,
 20140700,
 20141000,
 20150000,
 20150400,
 20150700,
 20151000,
 20160000,
 20160400,
 20160700,
 20161000,
 20170000]

In [189]:
# Row count of the first factor array stored under key 1 in dict_feature.
dict_feature[1][0].shape[0]

2874

In [1]:
# os is also imported in the first cell; it is imported again here so this
# cell can run on a fresh kernel.
import os
# Number of entries in data/stock_price.
# NOTE(review): no visible cell writes to this directory (the cells above
# write to 'stock_data/') - confirm where it comes from.
len(os.listdir('data/stock_price'))

2972