In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the analyzed comments; `created_time` is parsed as datetime so the
# `.dt` accessor can be used on it below.
df = pd.read_csv('stock_comments_analyzed.csv', parse_dates=['created_time'])

# Bucket the polarity labels by the calendar date of each comment.
comment_date = df['created_time'].dt.date
grouped = df['polarity'].groupby(comment_date)

In [3]:
# Display the last five rows of the loaded comments as a quick sanity check.
df.tail()

Unnamed: 0,created_time,title,polarity
464226,2020-10-17 06:10:00,外围 昨天 向上 美股 欧股 股市 涨跌 暴涨,1
464227,2020-10-17 06:10:00,满仓 龙头 干 券商 怂 国联 证券,1
464228,2020-10-17 06:06:00,突破 前高 下周 逼空 行情,0
464229,2020-10-17 05:33:00,十年 真的 观天象 夜 满仓 干 怂 牛市 轮,1
464230,2020-10-17 05:29:00,洞悉 资 先于 股 必读 汇总 数据 市场 操盘 主力 看股,0


In [4]:
def BI_Simple_func(row):
    """Return the simple sentiment index ``(pos - neg) / (pos + neg)``.

    ``pos`` is the number of entries equal to 1 and ``neg`` the number of
    entries equal to 0; any other value (including NaN) is ignored.

    Parameters
    ----------
    row : pandas.Series
        Polarity labels to summarize. Despite the name, this is called via
        ``groupby(...).apply`` and therefore receives a whole group.

    Returns
    -------
    float
        A value in ``[-1, 1]``, or NaN when the input contains no 0/1
        labels (the ratio is undefined in that case).
    """
    pos = (row == 1).sum()
    neg = (row == 0).sum()
    total = pos + neg

    # Without this guard an empty / all-missing group divides 0 by 0, which
    # emits a RuntimeWarning (numpy ints) or raises ZeroDivisionError
    # (Python ints). Return NaN explicitly instead.
    if total == 0:
        return float('nan')

    return (pos - neg) / total

In [5]:
# Evaluate BI_Simple_func once per group of `grouped`, producing one value
# per group key.
BI_Simple_index = grouped.apply(BI_Simple_func)

In [6]:
# Preview the first five values of the simple index.
BI_Simple_index.head()

created_time
2020-10-17    0.159091
2020-10-18    0.163498
2020-10-19   -0.150339
2020-10-20   -0.087477
2020-10-21   -0.133661
Name: polarity, dtype: float64

In [7]:
def BI_func(row):
    """Return the log-ratio sentiment index ``ln((1 + pos) / (1 + neg))``.

    ``pos`` counts the entries equal to 1 and ``neg`` the entries equal to 0.
    Adding 1 to both counts keeps the ratio finite when either count is zero.

    Parameters
    ----------
    row : pandas.Series
        Polarity labels to summarize.

    Returns
    -------
    float
        Natural log of the smoothed count ratio; 0.0 when both counts match.
    """
    n_pos = row.eq(1).sum()
    n_neg = row.eq(0).sum()

    return np.log((1.0 + n_pos) / (1 + n_neg))

In [8]:
# Evaluate BI_func once per group of `grouped`, producing one value per
# group key.
BI_index = grouped.apply(BI_func)

In [9]:
# Preview the first five values of the log-ratio index.
BI_index.head()

created_time
2020-10-17    0.319064
2020-10-18    0.328685
2020-10-19   -0.302835
2020-10-20   -0.175275
2020-10-21   -0.268797
Name: polarity, dtype: float64

In [10]:
# Put both indices side by side, aligned on their shared index; `keys`
# supplies the resulting column names.
sentiment_idx = pd.concat(
    [BI_index, BI_Simple_index],
    axis=1,
    keys=['BI', 'BI_Simple'],
)

In [11]:
# Preview the first five rows of the combined index table.
sentiment_idx.head()

Unnamed: 0_level_0,BI,BI_Simple
created_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-10-17,0.319064,0.159091
2020-10-18,0.328685,0.163498
2020-10-19,-0.302835,-0.150339
2020-10-20,-0.175275,-0.087477
2020-10-21,-0.268797,-0.133661


In [12]:
# Load the quote file and index it by its parsed `date` column.
quotes = (
    pd.read_csv('./data/sh000001.csv', parse_dates=['date'])
    .set_index('date')
)

In [13]:
# Preview the first five rows of the quote data.
quotes.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-10-19,3351.1,3371.09,3307.84,3312.67,21230354000
2020-10-20,3307.15,3328.1,3293.77,3328.1,16508476200
2020-10-21,3332.15,3332.15,3304.17,3325.02,17763211500
2020-10-22,3315.82,3320.88,3281.37,3312.5,16594998700
2020-10-23,3308.16,3326.05,3276.62,3278.0,17420898200


In [14]:
# Convert the sentiment index labels to datetimes so they can be matched
# against the parsed `date` index of `quotes`.
sentiment_idx.index = pd.to_datetime(sentiment_idx.index)

# Left join on the index: every sentiment row is kept, and rows without a
# matching quote get NaN in the quote columns.
merged = sentiment_idx.merge(
    quotes,
    how='left',
    left_index=True,
    right_index=True,
)

In [15]:
# Preview the first five rows of the merged sentiment/quote table.
merged.head()

Unnamed: 0_level_0,BI,BI_Simple,open,high,low,close,volume
created_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-17,0.319064,0.159091,,,,,
2020-10-18,0.328685,0.163498,,,,,
2020-10-19,-0.302835,-0.150339,3351.1,3371.09,3307.84,3312.67,21230350000.0
2020-10-20,-0.175275,-0.087477,3307.15,3328.1,3293.77,3328.1,16508480000.0
2020-10-21,-0.268797,-0.133661,3332.15,3332.15,3304.17,3325.02,17763210000.0


In [16]:
# Forward-fill: each NaN takes the most recent earlier non-NaN value in its
# column. Leading rows with no earlier value stay NaN.
# `DataFrame.ffill()` replaces `fillna(method='ffill')`, which is deprecated
# in recent pandas; rebinding instead of `inplace=True` avoids mutating a
# frame that an earlier cell already displayed.
merged = merged.ffill()

# Window length, in rows, of the trailing moving averages below. The first
# MA_WINDOW - 1 rows of each average are NaN because the window is not full.
MA_WINDOW = 10

merged['BI_MA'] = merged['BI'].rolling(window=MA_WINDOW, center=False).mean()
merged['BI_Simple_MA'] = merged['BI_Simple'].rolling(window=MA_WINDOW, center=False).mean()

In [17]:
# Preview the first five rows after forward-filling and adding the
# moving-average columns.
merged.head()

Unnamed: 0_level_0,BI,BI_Simple,open,high,low,close,volume,BI_MA,BI_Simple_MA
created_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-10-17,0.319064,0.159091,,,,,,,
2020-10-18,0.328685,0.163498,,,,,,,
2020-10-19,-0.302835,-0.150339,3351.1,3371.09,3307.84,3312.67,21230350000.0,,
2020-10-20,-0.175275,-0.087477,3307.15,3328.1,3293.77,3328.1,16508480000.0,,
2020-10-21,-0.268797,-0.133661,3332.15,3332.15,3304.17,3325.02,17763210000.0,,


In [18]:
# Persist the merged table to CSV in the current working directory.
merged.to_csv('merged_sentiment_idx.csv')