In [5]:
from py.PdfReaper import PdfReaper

In [6]:
# Create the project's PdfReaper helper; later cells call reaper.toText(...) on it.
reaper = PdfReaper()

In [7]:
# Source PDF and destination text file for the conversion.
# NOTE(review): absolute, machine-specific Windows paths — consider a
# configurable data directory so the notebook runs elsewhere.
sSrc = r'D:\Downloads\Documents\161222.pdf'
sDes = r'D:\Downloads\Documents\161222.txt'
# Presumably extracts the PDF's text into sDes (sDes is read back as UTF-8
# text in a later cell) — confirm against PdfReaper.toText.
reaper.toText(sSrc, sDes)

In [10]:
# Load the whole converted report into memory as one string; every parser
# function below takes this text as its input.
with open(sDes, encoding='utf-8') as fl:
    sTxt = fl.read()

In [9]:
import re

In [16]:
def getFundCode(sTxt):
    """Extract the six-digit fund code from the report text.

    The label '基金主代码' is tried first; if it is not found, the label
    '交易代码' is tried. In both cases the label must be followed by at
    least one whitespace character and then six digits.

    Args:
        sTxt: Report text to search.

    Returns:
        The six digits as a string, or '' when neither pattern matches.
    """
    # Ordered by priority: the first pattern that matches wins.
    tPatterns = (
        r'基金主代码\s+?(\d{6})',
        r'交易代码\s+?(\d{6})',
    )
    for sRex in tPatterns:
        mHit = re.search(sRex, sTxt)
        if mHit is not None:
            return mHit.group(1)
    #end for
    return ''
#end def

In [155]:
def getSectionRange(sTxt, sKeywd):
    """Locate the (innermost) numbered section that starts at a keyword.

    The first occurrence of ``sKeywd`` marks the section start. Text on the
    same line before the keyword is checked for a dotted section number
    (e.g. '5.2'); if none is present, '1.0' is assumed. The text after the
    keyword is then scanned for lines that begin with a section number:
    a number with more dots than the current one is treated as a nested
    sub-heading and scanning continues inside it; the first number with the
    same or fewer dots ends the section.

    Args:
        sTxt: Full text to search.
        sKeywd: Keyword that identifies the section heading.

    Returns:
        A tuple ``(iBgn, iEnd)`` suitable for ``sTxt[iBgn:iEnd]``:
        * ``iBgn`` is the index of the keyword;
        * ``iEnd`` is the index of the line that ends the section, or
          ``len(sTxt)`` when no such line exists (so slicing keeps the final
          character of the text).
        ``(-1, -1)`` is returned when the keyword is absent, which slices to
        an empty string.
    """
    iKey = sTxt.find(sKeywd)
    if iKey < 0:
        # Keyword absent: do not derive a heading from unrelated text.
        return (-1, -1)

    sSecNoRex = r'^(\d{1,2}(?:\.\d{1,2})+)\s'

    # Section number of the heading itself: the part of the keyword's line
    # that precedes the keyword.
    iLineBgn = sTxt.rfind('\n', 0, iKey) + 1
    sCurSecNo = '1.0'
    m = re.search(sSecNoRex, sTxt[iLineBgn:iKey])
    if m:
        sCurSecNo = m.group(1)

    # Find (innermost) section end. Searching the full string with a start
    # position (instead of slicing) keeps '^' anchored to real line starts,
    # so a number in the middle of a line is never taken for a heading, and
    # avoids copying the remaining text on every iteration.
    rexSecNo = re.compile(sSecNoRex, flags=re.MULTILINE)
    iCur = iKey
    while True:
        m = rexSecNo.search(sTxt, iCur)
        if m is None:
            # Section runs to the end of the text.
            return (iKey, len(sTxt))
        sNxtSecNo = m.group(1)
        if sNxtSecNo.count('.') <= sCurSecNo.count('.'):
            # Same or shallower nesting level: the section ends here.
            return (iKey, m.start())
        # Deeper nesting level: descend into the sub-section and continue.
        sCurSecNo = sNxtSecNo
        iCur = m.end()
    #end while
#end def

In [106]:
def getAssetAlloc(sTxt):
    """Parse the rows of the '资产组合情况' section into a dict.

    Each matching line contributes one entry: the key is the first run of
    characters that are neither digits nor whitespace, the value is the
    line's trailing token of digits, dots and hyphens converted to float.
    Tokens that cannot be converted (for example a lone '-') are stored as
    0.0. Parsing stops after the row whose key is '合计'.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping row label to float value.
    """
    iFrom, iTo = getSectionRange(sTxt, '资产组合情况')
    rexRow = re.compile(r'^[\d\s]*([^\d\s]+).+\s([\d\.-]+)$', flags=re.MULTILINE)
    dAstAlloc = {}
    for sLabel, sValue in rexRow.findall(sTxt[iFrom:iTo]):
        try:
            dAstAlloc[sLabel] = float(sValue)
        except Exception:
            dAstAlloc[sLabel] = 0.0
        #end try
        if sLabel == '合计':
            break
    #end for
    return dAstAlloc
#end def

In [119]:
def getIndusAlloc(sTxt):
    """Parse the rows of the '按行业分类的股票' section into a dict.

    A row is a line that starts with a single capital letter followed by
    whitespace and ends with a token of digits, dots and hyphens. The letter
    is the key and the trailing token, converted to float, is the value;
    tokens that cannot be converted are stored as 0.0. Parsing stops after
    the row keyed 'S'.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping the one-letter row code to float value.
    """
    iFrom, iTo = getSectionRange(sTxt, '按行业分类的股票')
    dIndusAlloc = {}
    for mRow in re.finditer(r'^([A-Z])\s.+\s([\d\.-]+)$', sTxt[iFrom:iTo], flags=re.MULTILINE):
        sCode = mRow.group(1)
        try:
            fValue = float(mRow.group(2))
        except Exception:
            fValue = 0.0
        #end try
        dIndusAlloc[sCode] = fValue
        if sCode == 'S':
            break
    #end for
    return dIndusAlloc
#end def

In [125]:
def getStkHldngs(sTxt):
    """Parse the rows of the '排序的前十名股票' section into a dict.

    A row is a line made of a 1-3 digit sequence number, a six-digit code,
    further content, and a trailing token of digits and dots. The code is
    the key and the trailing token converted to float is the value; a
    repeated code overwrites the earlier entry.

    A message is printed whenever a row's sequence number is not exactly one
    more than the previous row's (starting from 0); parsing continues
    regardless.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping six-digit code to float value.

    Raises:
        ValueError: If a trailing token is not a valid float (e.g. '1.2.3').
    """
    iFrom, iTo = getSectionRange(sTxt, '排序的前十名股票')
    sSection = sTxt[iFrom:iTo]
    lsRows = re.findall(r'^(\d{1,3})\s+(\d{6})\s.+\s([\d\.]+)$', sSection, flags=re.MULTILINE)
    dStkHld = {}
    iExpectNo = 1
    for sNo, sCode, sRatio in lsRows:
        iNo = int(sNo)
        # Sequence numbers should be consecutive; report any gap or repeat.
        if iNo != iExpectNo:
            print('Some stock missing between {} and {}'.format(iExpectNo - 1, iNo))
        dStkHld[sCode] = float(sRatio)
        iExpectNo = iNo + 1
    #end for
    return dStkHld
#end def

In [121]:
def getBondAlloc(sTxt):
    """Parse the rows of the '债券品种分类' section into a dict.

    Each matching line contributes one entry: the key is the first run of
    characters that are neither digits nor whitespace (it must be followed
    by whitespace), the value is the line's trailing token of digits, dots
    and hyphens converted to float. Tokens that cannot be converted are
    stored as 0.0. Parsing stops after the row whose key is '合计'.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping row label to float value.
    """
    def _toFloat(sVal):
        # Fall back to 0.0 for tokens such as '-' that are not numbers.
        try:
            return float(sVal)
        except Exception:
            return 0.0
        #end try
    #end def

    iFrom, iTo = getSectionRange(sTxt, '债券品种分类')
    lsRows = re.findall(r'^[\d\s]*([^\d\s]+)\s.+\s([\d\.-]+)$', sTxt[iFrom:iTo], flags=re.MULTILINE)
    dBondAlloc = {}
    for sLabel, sValue in lsRows:
        dBondAlloc[sLabel] = _toFloat(sValue)
        if sLabel == '合计':
            break
    #end for
    return dBondAlloc
#end def

In [124]:
def getBondHldngs(sTxt):
    """Parse the rows of the '前五名债券' section into a dict.

    A row is a line made of a 1-3 digit sequence number, an all-digit code,
    further content, and a trailing token of digits and dots. The code is
    the key and the trailing token converted to float is the value; a
    repeated code overwrites the earlier entry.

    A message is printed whenever a row's sequence number is not exactly one
    more than the previous row's (starting from 0); parsing continues
    regardless.

    Args:
        sTxt: Full report text.

    Returns:
        Dict mapping bond code to float value.

    Raises:
        ValueError: If a trailing token is not a valid float (e.g. '1.2.3').
    """
    tRange = getSectionRange(sTxt, '前五名债券')
    sSection = sTxt[tRange[0]:tRange[1]]
    rexRow = re.compile(r'^(\d{1,3})\s+(\d+)\s.+\s([\d\.]+)$', flags=re.MULTILINE)
    dBondHld = {}
    iLastNo = 0
    for mRow in rexRow.finditer(sSection):
        sNo, sCode, sRatio = mRow.groups()
        iNo = int(sNo)
        # Sequence numbers should be consecutive; report any gap or repeat.
        if iNo - iLastNo != 1:
            print('Some bond missing between {} and {}'.format(iLastNo, iNo))
        dBondHld[sCode] = float(sRatio)
        iLastNo = iNo
    #end for
    return dBondHld
#end def

In [81]:
import py.UtilFunc as util

In [86]:
# Re-import py.UtilFunc so edits made to the module on disk take effect
# without restarting the kernel. `imp` is deprecated and no longer exists in
# Python 3.12+; importlib.reload is the supported replacement.
import importlib
importlib.reload(util)

<module 'py.UtilFunc' from 'D:\\workspace\\Python\\FinDataFetcher\\py\\UtilFunc.py'>

In [156]:
def genFundDict(sTxt, sDesPath=r'D:\Downloads\Documents\161222_dic.txt'):
    """Parse a fund report and write the collected data to a file.

    Runs every parser on the report text and gathers the results in a dict
    with the keys 'fund_code', 'asset_alloc', 'indus_alloc', 'stk_hldngs',
    'bond_alloc' and 'bond_hldngs'. That dict is stored under the fund code
    in an outer dict, which is handed to ``util.writeTupleDict``.

    Args:
        sTxt: Full report text.
        sDesPath: Path passed to ``util.writeTupleDict`` as the output
            location. Defaults to the path that was previously hard-coded,
            so existing calls behave as before.
    """
    sFundCode = getFundCode(sTxt)
    dInfo = {
        'fund_code': sFundCode,
        'asset_alloc': getAssetAlloc(sTxt),
        'indus_alloc': getIndusAlloc(sTxt),
        'stk_hldngs': getStkHldngs(sTxt),
        'bond_alloc': getBondAlloc(sTxt),
        'bond_hldngs': getBondHldngs(sTxt),
    }
    # Outer dict is keyed by fund code; '' is used when no code was found.
    dFund = {sFundCode: dInfo}
    util.writeTupleDict(dFund, sDesPath)

In [157]:
# Parse the loaded report text and write the resulting dictionary to disk.
genFundDict(sTxt)

In [129]:
# Debugging: inspect what the industry-allocation parser extracts.
getIndusAlloc(sTxt)

{}

In [130]:
# Debugging: check which (start, end) range is detected for the industry section.
getSectionRange(sTxt, '按行业分类的股票')

(4408, 4422)

In [131]:
# Debugging: show the text covered by the hard-coded range 4408-4422.
sTxt[4408: 4422]

'按行业分类的股票投资组合 \n'

In [149]:
# Scratch check of the section-number pattern. The pattern is a raw string so
# that \d, \. and \s reach the regex engine without invalid-escape warnings.
m = re.search(r'(\d{1,2}(?:\.\d{1,2})+)\s', '3.14  ')

In [154]:
# Scratch check: str.count('.') is how getSectionRange compares the nesting
# depth of two section numbers.
'3.14.15.9.26'.count('.')

4