In [1]:
import sys
# Extend the module search path with a local project directory (presumably
# where the PdfReaper and UtilFunc modules imported below live - verify).
# NOTE(review): absolute, machine-specific Windows path; adjust on other machines.
sys.path.append('D:\\workspace\\Python\\FinDataFetcher\\py')

In [2]:
from PdfReaper import PdfReaper
import UtilFunc as util
import re

In [3]:
# Fund code used to build the input and output file names below.
sFundCode = '161116'
# Source PDF and destination text file handed to reaper.toText().
# NOTE(review): hard-coded local Windows paths.
sSrc = r'D:\Downloads\Documents\{}.pdf'.format(sFundCode)
sDes = r'D:\Downloads\Documents\{}.txt'.format(sFundCode)

In [4]:
# Instance whose toText() method is called in the next cell.
reaper = PdfReaper()

In [5]:
# Presumably extracts the text of the PDF at sSrc into the file sDes, which is
# read back in the next cell (PdfReaper.toText is not visible here - confirm).
reaper.toText(sSrc, sDes)

In [53]:
# Load the whole text file as one UTF-8 string; every get*() helper below
# works on this single string.
with open(sDes, encoding='utf-8') as fl:
    sTxt = fl.read()

In [10]:
def getFundCode(sTxt):
    """Extract the six-digit fund code from the report text.

    The label '基金主代码' is tried first and '交易代码' second; the code is the
    run of six digits that follows the label after at least one whitespace
    character.

    Args:
        sTxt: Text to search.

    Returns:
        The six-digit code as a string, or '' when neither label is followed
        by such a code.
    """
    # Patterns are listed in priority order; the first hit wins.
    for sRex in (r'基金主代码\s+?(\d{6})', r'交易代码\s+?(\d{6})'):
        mCode = re.search(sRex, sTxt)
        if mCode is not None:
            return mCode.group(1)
    return ''
#end def

In [11]:
def getRangeToInnerSecEnd(sTxt, sKeywd):
    """Find the span from a keyword to the end of the first innermost subsection.

    The part of the keyword's line that precedes the keyword is checked for a
    dotted section number such as ``5.1``; when none is found a one-dot number
    is assumed. Scanning forward from the keyword, every heading that is
    nested deeper than the current one is descended into, and the first
    heading that is not deeper terminates the span.

    A heading is a dotted number at a true line start, followed by a character
    that is neither a digit nor a dot. The text is searched in place with a
    start offset (not sliced), so a number that merely follows a heading on
    the same line is never mistaken for the next heading.

    Args:
        sTxt: Full text to search.
        sKeywd: Keyword that identifies the section of interest.

    Returns:
        Tuple ``(iKey, iEnd)``. ``iKey`` is the index of the first occurrence
        of the keyword. ``iEnd`` is the index at which the terminating heading
        starts, or -1 when no terminating heading follows. ``(-1, -1)`` is
        returned when the keyword does not occur.
    """
    iKey = sTxt.find(sKeywd)
    if iKey < 0:
        return (-1, -1)

    rexSecNo = re.compile(r'^(\d{1,2}(?:\.\d{1,2})+)[^\d\.]', flags=re.MULTILINE)

    # Section number of the keyword's own heading, if the line starts with one.
    iLineBgn = sTxt.rfind('\n', 0, iKey) + 1
    mHead = rexSecNo.match(sTxt[iLineBgn:iKey])
    nCurDepth = (mHead.group(1) if mHead else '1.0').count('.')

    # Find (innermost) section end
    iCur = iKey
    while True:
        # search() with a start offset keeps '^' anchored to real line starts.
        mNxt = rexSecNo.search(sTxt, iCur)
        if mNxt is None:
            return (iKey, -1)
        nNxtDepth = mNxt.group(1).count('.')
        if nNxtDepth <= nCurDepth:
            return (iKey, mNxt.start())
        # Deeper heading: descend and keep looking after it.
        nCurDepth = nNxtDepth
        iCur = mNxt.end()
#end def

In [14]:
def getSectionRange(sTxt, sKeywd):
    """Find the span from a keyword to the next heading of the same depth.

    The part of the keyword's line that precedes the keyword must start with a
    dotted section number (e.g. ``5.3``). The span ends where the next line
    begins with a section number that has exactly the same number of dots.

    Args:
        sTxt: Full text to search.
        sKeywd: Keyword that identifies the section of interest.

    Returns:
        Tuple ``(iKey, iEnd)``. ``iKey`` is the result of ``sTxt.find(sKeywd)``.
        ``iEnd`` is the index at which the next same-depth heading starts, or
        -1 when the keyword's line carries no section number or no such
        heading follows.
    """
    iKey = sTxt.find(sKeywd)
    iLineBgn = sTxt.rfind('\n', 0, iKey) + 1
    mHead = re.search(r'^(\d{1,2}(?:\.\d{1,2})+)[^\d\.]', sTxt[iLineBgn:iKey])
    if mHead is None:
        return (iKey, -1)

    # A sibling heading repeats the ".N" group exactly as often as this one.
    nDots = mHead.group(1).count('.')
    sNxtSibRex = r'^(\d{1,2}(?:\.\d{1,2}){' + str(nDots) + r'})[^\d\.]'
    mSib = re.search(sNxtSibRex, sTxt[iKey:], flags=re.MULTILINE)
    iEnd = iKey + mSib.start() if mSib else -1
    return (iKey, iEnd)
#end def

In [13]:
def getAssetAlloc(sTxt):
    """Parse the table that follows the keyword '资产组合情况'.

    Each matching line contributes one entry: after any leading digits and
    whitespace, the first run of characters that are neither digits nor
    whitespace is the key, and the last token of the line (digits, dots and
    hyphens) is the value. Parsing stops after the '合计' row.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping the row label to the float in the row's last column
        (presumably the share of total assets - confirm against the report).
        A value that is not a valid number (e.g. a lone '-') is stored as
        0.0. Empty dict when the keyword is not found.
    """
    iBgn, iEnd = getRangeToInnerSecEnd(sTxt, '资产组合情况')
    if iBgn < 0:
        return {}
    # -1 is the helper's "no end found" marker; used directly as a slice end
    # it would silently drop the last character of the text.
    if iEnd < 0:
        iEnd = len(sTxt)

    lsRows = re.findall(r'^[\d\s]*([^\d\s]+).+\s([\d\.-]+)$', sTxt[iBgn:iEnd], flags=re.MULTILINE)
    dAstAlloc = {}
    for sType, sRatio in lsRows:
        try:
            fRatio = float(sRatio)
        except ValueError:
            fRatio = 0.0
        #end try
        dAstAlloc[sType] = fRatio
        if sType == '合计':
            break
    #end for
    return dAstAlloc
#end def

In [14]:
def getIndusAlloc(sTxt):
    """Parse the table in the section found via '按行业分类的股票'.

    A row is a line that starts with a single capital letter followed by
    whitespace and ends with a token of digits, dots and hyphens. Values of
    rows sharing the same letter are summed.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping the one-letter code (presumably the industry category
        code - confirm) to the summed float of the rows' last column. A value
        that is not a valid number (e.g. a lone '-') counts as 0.0. Empty
        dict when the keyword is not found.
    """
    iBgn, iEnd = getSectionRange(sTxt, '按行业分类的股票')
    if iBgn < 0:
        return {}
    # -1 is the helper's "no end found" marker; used directly as a slice end
    # it would silently drop the last character of the text.
    if iEnd < 0:
        iEnd = len(sTxt)

    lsRows = re.findall(r'^([A-Z])\s.+\s([\d\.-]+)$', sTxt[iBgn:iEnd], flags=re.MULTILINE)
    dIndusAlloc = {}
    for sType, sRatio in lsRows:
        try:
            fRatio = float(sRatio)
        except ValueError:
            fRatio = 0.0
        #end try
        if sType in dIndusAlloc:
            dIndusAlloc[sType] += fRatio
        else:
            dIndusAlloc[sType] = fRatio
    #end for
    return dIndusAlloc
#end def

In [15]:
def getStkHldngs(sTxt):
    """Parse the table in the section found via '股票投资明细'.

    A row is a line made of a 1-3 digit serial number, a six-digit code, more
    columns, and a final token of digits and dots. Values of rows sharing the
    same code are summed. A message is printed whenever the serial number
    neither continues the previous one nor restarts at 1.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping the six-digit code to the float in the row's last column
        (presumably the share of net asset value - confirm). Empty dict when
        the keyword is not found.

    Raises:
        ValueError: If a row's last token is not a valid float (e.g. '1.2.3').
    """
    iBgn, iEnd = getSectionRange(sTxt, '股票投资明细')
    if iBgn < 0:
        return {}
    # -1 is the helper's "no end found" marker; used directly as a slice end
    # it would silently drop the last character of the text.
    if iEnd < 0:
        iEnd = len(sTxt)

    lsRows = re.findall(r'^(\d{1,3})\s+(\d{6})\s.+\s([\d\.]+)$', sTxt[iBgn:iEnd], flags=re.MULTILINE)
    dStkHld = {}
    iPrvNo = 0
    for sNo, sCode, sRatio in lsRows:
        iNo = int(sNo)
        # Serial numbers are not consecutive (a restart at 1 is accepted).
        if iNo != iPrvNo + 1 and iNo != 1:
            print('Some stock missing between {} and {}'.format(iPrvNo, iNo))
        dStkHld[sCode] = dStkHld.get(sCode, 0.0) + float(sRatio)
        iPrvNo = iNo
    #end for
    return dStkHld
#end def

In [49]:
def getForgnStks(sTxt):
    """Parse HK/US-listed rows from the section found via '前十名股票'.

    A row is a line made of a 1-3 digit serial number, some columns, a ticker
    that is either 1-5 digits followed by 'HK' or a non-space token followed
    by 'US', more columns, and a final token of digits and dots.

    Tickers are normalised before being used as keys: 'NNN HK' becomes 'hk'
    plus the number zero-padded to five digits, and 'XXX US' becomes 'gb_'
    plus the lower-cased symbol. Values of rows sharing the same key are
    summed. A message is printed whenever the serial number neither continues
    the previous one nor restarts at 1.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping the normalised ticker to the float in the row's last
        column. Empty dict when the keyword is not found.

    Raises:
        ValueError: If a row's last token is not a valid float (e.g. '1.2.3').
    """
    iBgn, iEnd = getSectionRange(sTxt, '前十名股票')
    if iBgn < 0:
        return {}
    # -1 is the helper's "no end found" marker; used directly as a slice end
    # it would silently drop the last character of the text.
    if iEnd < 0:
        iEnd = len(sTxt)

    lsRows = re.findall(r'^(\d{1,3})\s.+\s(\d{1,5}\s*HK|\S+\s*US)\s.+\s([\d\.]+)$', sTxt[iBgn:iEnd], flags=re.MULTILINE)
    dFrgnStk = {}
    iPrvNo = 0
    for sNo, sCode, sRatio in lsRows:
        iNo = int(sNo)
        # Serial numbers are not consecutive (a restart at 1 is accepted).
        if iNo != iPrvNo + 1 and iNo != 1:
            print('Some stock missing between {} and {}'.format(iPrvNo, iNo))
        fRatio = float(sRatio)
        # The last two characters carry the market suffix matched by the regex.
        sSfx = sCode[-2:]
        if sSfx == 'HK':
            sCode = 'hk' + sCode[:-2].strip().zfill(5)
        elif sSfx == 'US':
            sCode = 'gb_' + sCode[:-2].strip().lower()
        dFrgnStk[sCode] = dFrgnStk.get(sCode, 0.0) + fRatio
        iPrvNo = iNo
    #end for
    return dFrgnStk
#end def

In [16]:
def getBondAlloc(sTxt):
    """Parse the table that follows the keyword '债券品种分类'.

    Each matching line contributes one entry: after any leading digits and
    whitespace, the first run of characters that are neither digits nor
    whitespace (which must be followed by whitespace) is the key, and the
    last token of the line (digits, dots and hyphens) is the value. Parsing
    stops after the '合计' row.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping the row label to the float in the row's last column.
        A value that is not a valid number (e.g. a lone '-') is stored as
        0.0. Empty dict when the keyword is not found.
    """
    iBgn, iEnd = getRangeToInnerSecEnd(sTxt, '债券品种分类')
    if iBgn < 0:
        return {}
    # -1 is the helper's "no end found" marker; used directly as a slice end
    # it would silently drop the last character of the text.
    if iEnd < 0:
        iEnd = len(sTxt)

    lsRows = re.findall(r'^[\d\s]*([^\d\s]+)\s.+\s([\d\.-]+)$', sTxt[iBgn:iEnd], flags=re.MULTILINE)
    dAstAlloc = {}
    for sType, sRatio in lsRows:
        try:
            fRatio = float(sRatio)
        except ValueError:
            fRatio = 0.0
        #end try
        dAstAlloc[sType] = fRatio
        if sType == '合计':
            break
    #end for
    return dAstAlloc
#end def

In [17]:
def getBondHldngs(sTxt):
    """Parse the table that follows the keyword '前五名债券'.

    A row is a line made of a 1-3 digit serial number, an all-digit code,
    more columns, and a final token of digits and dots. When a code occurs
    more than once the last row wins. A message is printed whenever the
    serial number does not continue the previous one (counting starts at 1).

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping the code to the float in the row's last column. Empty
        dict when the keyword is not found.

    Raises:
        ValueError: If a row's last token is not a valid float (e.g. '1.2.3').
    """
    iBgn, iEnd = getRangeToInnerSecEnd(sTxt, '前五名债券')
    if iBgn < 0:
        return {}
    # -1 is the helper's "no end found" marker; used directly as a slice end
    # it would silently drop the last character of the text.
    if iEnd < 0:
        iEnd = len(sTxt)

    lsRows = re.findall(r'^(\d{1,3})\s+(\d+)\s.+\s([\d\.]+)$', sTxt[iBgn:iEnd], flags=re.MULTILINE)
    dBondHld = {}
    iPrvNo = 0
    for sNo, sCode, sRatio in lsRows:
        iNo = int(sNo)
        if iNo != iPrvNo + 1:
            print('Some bond missing between {} and {}'.format(iPrvNo, iNo))
        dBondHld[sCode] = float(sRatio)
        iPrvNo = iNo
    #end for
    return dBondHld
#end def

In [None]:
import importlib
import sys
# Pick up edits made to the PdfReaper module without restarting the kernel.
# The name `PdfReaper` in this notebook is bound to the class (imported with
# `from PdfReaper import PdfReaper`), and reload() only accepts module
# objects, so the module is fetched from sys.modules and the class is
# re-imported afterwards. `importlib` replaces the removed `imp` module.
#importlib.reload(util)
importlib.reload(sys.modules['PdfReaper'])
from PdfReaper import PdfReaper

In [18]:
def genFundDict(sTxt, sPathFmt=r'D:\Downloads\Documents\{}_dic.txt'):
    """Collect everything parsed from a report and write it to a file.

    Runs all get*() parsers of this notebook on the text, stores their results
    under fixed keys in one dict, wraps that dict as ``{fund_code: info}`` and
    hands it to ``util.writeTupleDict`` together with the output path.

    Args:
        sTxt: Full report text.
        sPathFmt: Output path template; '{}' is replaced by the fund code
            found in the text. Defaults to the path originally hard-coded here.

    Returns:
        None. The result is only written via ``util.writeTupleDict`` (its
        file format is not visible here).
    """
    sFundCode = getFundCode(sTxt)
    dInfo = {
        'fund_code': sFundCode,
        'asset_alloc': getAssetAlloc(sTxt),
        'indus_alloc': getIndusAlloc(sTxt),
        'stk_hldngs': getStkHldngs(sTxt),
        'forgn_stks': getForgnStks(sTxt),
        'bond_alloc': getBondAlloc(sTxt),
        'bond_hldngs': getBondHldngs(sTxt),
    }
    dFund = {sFundCode: dInfo}
    util.writeTupleDict(dFund, sPathFmt.format(sFundCode))

In [19]:
# Parse the loaded report text and write the resulting dict file.
genFundDict(sTxt)

In [41]:
# Scratch check: (start, end) range found for the '股票投资明细' section.
getSectionRange(sTxt, '股票投资明细')

(6602, 6610)

In [130]:
# Scratch check: (start, end) range found for the '按行业分类的股票' section.
getSectionRange(sTxt, '按行业分类的股票')

(4408, 4422)

In [42]:
# Scratch check: show the text covered by the (6602, 6610) range echoed above.
sTxt[6602: 6610]

'股票投资明细 \n'

In [149]:
# Scratch check of the section-number pattern. Raw string so that \d, \. and
# \s reach the regex engine instead of being invalid string escapes.
m = re.search(r'(\d{1,2}(?:\.\d{1,2})+)\s', '3.14  ')

In [154]:
# Scratch check: the section helpers compare heading depth by counting dots.
'3.14.15.9.26'.count('.')

4

In [74]:
# 160808 page 7; 160716 page 7, 8; 165510 page 8

'gb_baba,8.18;hk00700,6.56;hk01099,6.04;hk00598,4.74;hk00152,4.72;hk01093,4.52;hk00939,3.73;hk02318,3.18;gb_bidu,3.12;hk06198,3.07'