In [1]:
import tushare as ts
import pandas as pd
import time
import datetime
from tqdm import tqdm
import re
import pprint
import json

# Start here

In [3]:
# Load the shareholder records and discard the end_date and hold_amount columns.
relation_df = pd.read_csv('relation.csv').drop(columns=['end_date', 'hold_amount'])

In [4]:
# Load the company table (stock code, short name, full name, label).
company_df = pd.read_csv('new_import/company.csv')

In [5]:
# Row count of the shareholder records as loaded.
len(relation_df)

6700

In [6]:
def filter_func(s: str) -> str:
    """Normalize a company or holder name into a short matching key.

    Steps, in order:
      1. Keep only the text before the first dash (ASCII or full-width/em/en
         dash), which drops suffixes such as a fund or plan name.
      2. Keep only the text before the first '一'.
      3. Remove every parenthesised group, ASCII or full-width.
      4. Remove the generic words 股份, 责任, 有限, 公司, 集团, 省 and 市.

    Args:
        s: The raw name.

    Returns:
        The normalized name; may be an empty string.
    """
    # Strip everything from the first dash onwards.
    s = re.split('[-－—–]', s, maxsplit=1)[0]
    # NOTE(review): '一' is the CJK numeral "one" (U+4E00), not a dash. The split
    # is kept for backward compatibility, but it truncates any name that
    # legitimately contains the character -- confirm the data really uses it as
    # a separator.
    s = s.split('一')[0]
    # Remove parentheses together with their content. The match is non-greedy so
    # that text between two separate parenthesised groups is preserved.
    s = re.sub(r'[(（].*?[)）]', '', s)
    # Drop generic legal-form and administrative-region words.
    s = re.sub('股份|责任|有限|公司|集团|省|市', '', s)
    return s

In [7]:
# Preview the company table.
company_df.head()

Unnamed: 0,companyId:ID(Company),name:string,fullname:string,:LABEL
0,000001.SZ,平安银行,平安银行股份有限公司,Company
1,000002.SZ,万科A,万科企业股份有限公司,Company
2,000005.SZ,世纪星源,深圳世纪星源股份有限公司,Company
3,000006.SZ,深振业A,深圳市振业(集团)股份有限公司,Company
4,000007.SZ,全新好,深圳市全新好股份有限公司,Company


In [8]:
# Build the lookup from normalized full company name to stock code.
# Rows without a full name are skipped, and so are names that normalize to an
# empty string: an empty key would be hit by every holder name that also
# normalizes to '' and would produce spurious matches.
# When two companies normalize to the same key, the later row wins.
named_companies = company_df.dropna(subset=['fullname:string'])
comp_name2code = {
    key: code
    for key, code in zip(
        named_companies['fullname:string'].map(filter_func),
        named_companies['companyId:ID(Company)'],
    )
    if key
}

In [9]:
# Spot-check the lookup with one normalized name.
comp_name2code['平安银行']

'000001.SZ'

In [10]:
# Preview the shareholder records.
relation_df.head()

Unnamed: 0,ts_code,ann_date,holder_name,hold_ratio
0,600075.SH,20201016,李慧琼,1.35
1,600075.SH,20201016,新疆天业(集团)有限公司,53.57
2,600075.SH,20201016,石河子市锦富国有资本投资运营有限公司,4.98
3,600075.SH,20201016,上海银叶投资有限公司-银叶攻玉5号私募证券投资基金,2.24
4,600075.SH,20201016,万家共赢-中信银行-万家共赢安泰1号专项资产管理计划,1.98


In [11]:
# Show the records for one specific holder name.
relation_df[relation_df['holder_name'] == '天津中冀万泰投资管理有限公司']

Unnamed: 0,ts_code,ann_date,holder_name,hold_ratio
4316,000657.SZ,20201029,天津中冀万泰投资管理有限公司,1.12


In [12]:
# Keep the first record for each (stock, announcement date, holder) triple.
relation_df = relation_df.drop_duplicates(
    subset=['ts_code', 'ann_date', 'holder_name'],
    keep='first',
)

In [13]:
# Row count after removing duplicate records.
len(relation_df)

6685

In [14]:
def get_code(s: str) -> str:
    """Return the stock code whose normalized full name equals the normalized `s`.

    Args:
        s: A raw holder name.

    Returns:
        The matching code from `comp_name2code`, or '' when there is no match.
    """
    return comp_name2code.get(filter_func(s), '')

In [15]:
# Resolve every holder name to a stock code ('' when nothing matches).
relation_df['holder_code'] = relation_df['holder_name'].map(get_code)

In [16]:
# Preview the records with the new holder_code column.
relation_df.head()

Unnamed: 0,ts_code,ann_date,holder_name,hold_ratio,holder_code
0,600075.SH,20201016,李慧琼,1.35,
1,600075.SH,20201016,新疆天业(集团)有限公司,53.57,600075.SH
2,600075.SH,20201016,石河子市锦富国有资本投资运营有限公司,4.98,
3,600075.SH,20201016,上海银叶投资有限公司-银叶攻玉5号私募证券投资基金,2.24,
4,600075.SH,20201016,万家共赢-中信银行-万家共赢安泰1号专项资产管理计划,1.98,


In [17]:
# Keep only the records whose holder resolved to a stock code.
# .copy() makes this an independent frame, so the later column assignments do
# not write into a slice of relation_df (pandas' SettingWithCopyWarning).
# NOTE(review): some matches are self-references (holder_code equals ts_code);
# confirm whether such self-loops are wanted in the exported edges.
listed_relation_df = relation_df[relation_df['holder_code'] != ''].copy()

In [18]:
# Number of records whose holder resolved to a stock code.
len(listed_relation_df)

452

In [19]:
# Preview the matched records.
listed_relation_df.head()

Unnamed: 0,ts_code,ann_date,holder_name,hold_ratio,holder_code
1,600075.SH,20201016,新疆天业(集团)有限公司,53.57,600075.SH
50,600155.SH,20201030,华创阳安股份有限公司-第二期员工持股计划,2.84,600155.SH
142,600301.SH,20201029,南宁化工集团有限公司,32.0,600301.SH
204,600352.SH,20201027,浙江龙盛集团股份有限公司-2020年员工持股计划,2.15,600352.SH
212,600367.SH,20201017,中国建设银行股份有限公司-信达澳银新能源产业股票型证券投资基金,0.44,601939.SH


In [20]:
# Inspect the company row for code 600155.SH.
company_df[company_df['companyId:ID(Company)'] == '600155.SH']

Unnamed: 0,companyId:ID(Company),name:string,fullname:string,:LABEL
462,600155.SH,华创阳安,华创阳安股份有限公司,Company


In [26]:
# Parse the YYYYMMDD announcement dates into datetimes.
# assign() builds a new frame instead of writing a column into a filtered
# slice of relation_df, which avoids pandas' SettingWithCopyWarning.
listed_relation_df = listed_relation_df.assign(
    ann_date=pd.to_datetime(listed_relation_df['ann_date'], format='%Y%m%d')
)

In [27]:
# Preview the records after the date conversion.
listed_relation_df.head()

Unnamed: 0,ts_code,ann_date,holder_name,hold_ratio,holder_code
1,600075.SH,2020-10-16,新疆天业(集团)有限公司,53.57,600075.SH
50,600155.SH,2020-10-30,华创阳安股份有限公司-第二期员工持股计划,2.84,600155.SH
142,600301.SH,2020-10-29,南宁化工集团有限公司,32.0,600301.SH
204,600352.SH,2020-10-27,浙江龙盛集团股份有限公司-2020年员工持股计划,2.15,600352.SH
212,600367.SH,2020-10-17,中国建设银行股份有限公司-信达澳银新能源产业股票型证券投资基金,0.44,601939.SH


In [30]:
# Rename the columns to the typed header names used in the exported CSV.
# An explicit old-name -> new-name mapping is used instead of a positional list
# so that a change in column order cannot silently mislabel the data.
listed_relation_df = listed_relation_df.rename(columns={
    'ts_code': ':END_ID(Company)',
    'ann_date': 'annDate:date',
    'holder_name': 'holderFullname:string',
    'hold_ratio': 'holdRatio:double',
    'holder_code': ':START_ID(Company)',
})

In [31]:
# Preview the records after renaming the columns.
listed_relation_df.head()

Unnamed: 0,:END_ID(Company),annDate: date,holderFullname:string,holdRatio:double,:START_ID(Company)
1,600075.SH,2020-10-16,新疆天业(集团)有限公司,53.57,600075.SH
50,600155.SH,2020-10-30,华创阳安股份有限公司-第二期员工持股计划,2.84,600155.SH
142,600301.SH,2020-10-29,南宁化工集团有限公司,32.0,600301.SH
204,600352.SH,2020-10-27,浙江龙盛集团股份有限公司-2020年员工持股计划,2.15,600352.SH
212,600367.SH,2020-10-17,中国建设银行股份有限公司-信达澳银新能源产业股票型证券投资基金,0.44,601939.SH


In [32]:
# Tag every row with the relationship type.
# assign() returns a new frame rather than adding a column to a filtered slice
# of relation_df, which avoids pandas' SettingWithCopyWarning. The column name
# is not a valid Python identifier, hence the dict unpacking.
listed_relation_df = listed_relation_df.assign(**{':TYPE': 'hold_stock'})

In [33]:
# Preview the records with the :TYPE column added.
listed_relation_df.head()

Unnamed: 0,:END_ID(Company),annDate: date,holderFullname:string,holdRatio:double,:START_ID(Company),:TYPE
1,600075.SH,2020-10-16,新疆天业(集团)有限公司,53.57,600075.SH,hold_stock
50,600155.SH,2020-10-30,华创阳安股份有限公司-第二期员工持股计划,2.84,600155.SH,hold_stock
142,600301.SH,2020-10-29,南宁化工集团有限公司,32.0,600301.SH,hold_stock
204,600352.SH,2020-10-27,浙江龙盛集团股份有限公司-2020年员工持股计划,2.15,600352.SH,hold_stock
212,600367.SH,2020-10-17,中国建设银行股份有限公司-信达澳银新能源产业股票型证券投资基金,0.44,601939.SH,hold_stock


In [34]:
# Write the company-to-company holding records to CSV, without the index column.
listed_relation_df.to_csv('new_import/company_company.csv', index=False)