In [1]:
import requests, datetime, re, pymysql, time

# 爬取一天的数据

In [2]:
def getDataFromWeb(day, exchange):
    def czce(day):
        # 郑州中间变更过数据的url模板，故需要尝试两种url
        urls = ["http://www.czce.com.cn/portal/DFSStaticFiles/Future/{0}/{1}/FutureDataDaily.txt", 
                "http://www.czce.com.cn/portal/exchange/{0}/datadaily/{1}.txt"]
        for url in urls:
            url = url.format(day.year, day.strftime('%Y%m%d'))
            r = requests.get(url)
            if r.status_code==200:
                return r.text
        return None

    def dce(day):
        url = "http://www.dce.com.cn/publicweb/quotesdata/exportDayQuotesChData.html"
        postdata = {
            'dayQuotes.variety':'all', 
            'dayQuotes.trade_type':'0', 
            'year': day.year, 
            'month':day.month-1, 
            'day':day.day,
            'exportFlag':'txt'
        }
        T = 5
        while T>0:
            r = requests.post(url, data=postdata)
            if r.status_code==200:
                rawdata = r.text
                break
            T -= 1
            time.sleep(2)
        if T==0 or len(rawdata)<200:
            # 可能是非交易日，也可能是网络异常
            return None
        else:
            return rawdata
    
    def shfe(day):
        url = "http://www.shfe.com.cn/data/dailydata/kx/kx{}.dat".format(day.strftime('%Y%m%d'))
        r = requests.get(url)
        if r.status_code!=200:
            # 对于非交易日，上期所返回一个非200的状态码
            return None
        return r.json()['o_curinstrument']
        
    if exchange=='shfe':
        return shfe(day)
    elif exchange=='dce':
        return dce(day)
    elif exchange=='czce':
        return czce(day)
    else:
        assert False

# 处理原始数据

In [3]:
# 处理一天的原始数据
def process(rawdata, exchange):
    def shfe(rawdata):
        data = []
        for row in rawdata:
            deli = row['DELIVERYMONTH']
            if not re.match(r'^\d{4}$', deli):    # 去掉“小计”项
                continue
            code = row['PRODUCTID'].strip()
            if not re.match(r'^[a-z]{1,2}_f', code):
                continue
            code = code[:-2]
            settle = row['SETTLEMENTPRICE']
            _open = row['OPENPRICE']
            close = row['CLOSEPRICE']
            high = row['HIGHESTPRICE']
            low = row['LOWESTPRICE']
            volume = row['VOLUME']
            oi = row['OPENINTEREST']
            turnover = None    # 上海交易所的数据中没有每日合约的成交额
            if volume==0:
                _open = None
                high = None
                low = None
            data.append([code, deli, _open, high, low, close, settle, volume, oi, turnover])
        return data
    
    def dce(rawdata):
        data = []
        name2code = {
            '豆粕'     : 'm',
            '豆油'     : 'y',
            '豆一'     : 'a',
            '豆二'     : 'b',
            '棕榈油'   : 'p',
            '玉米'     : 'c',
            '玉米淀粉' : 'cs',
            '鸡蛋'     : 'jd',
            '胶合板'   : 'bb',
            '纤维板'   : 'fb',
            '聚乙烯'   : 'l',
            '聚氯乙烯' : 'v',
            '聚丙烯'   : 'pp',
            '焦炭'     : 'j',
            '焦煤'     : 'jm',
            '铁矿石'   : 'i',
        }
        for row in rawdata.split('\n')[1:]:
            row = row.strip()
            row = re.sub(',', '', row)    # 去掉数字中的逗号
            row = re.split('\s+', row)
            if row[0] not in name2code.keys():
                continue
            code = name2code[row[0]]
            deli = row[1]
            settle = float(row[7])
            volume = int(row[10])
            oi = int(row[11])
            turnover = float(row[13])
            high = float(row[3])
            low = float(row[4])
            _open = float(row[2])
            close = float(row[5])
            if volume==0:
                _open = None
                high = None
                low = None
            data.append([code, deli, _open, high, low, close, settle, volume, oi, turnover])
        return data
    
    def czce(rawdata):
        data = []
        rawdata = rawdata.split('\r\n')
        reg = re.compile(r'[A-Z]{2}\d{3}')
        for line in rawdata[1:]:
            # 将数据处理成列表
            line = line.split('|')
            if len(line)==1:
                line = line[0]
                line = line.split(',')
            # 去掉空白符
            line = [x.strip() for x in line]
            # 去掉数字中间的逗号
            line = [re.sub(',','',x) for x in line]
            # 筛掉非合约行
            if not reg.match(line[0]):
                continue
            code = line[0][:2].lower()
            deli = '1' + line[0][2:]
            _open = float(line[2])
            high = float(line[3])
            low = float(line[4])
            close = float(line[5])
            settle = float(line[6])
            volume = int(float(line[9]))
            oi = int(float(line[10]))
            turnover = float(line[12])
            if _open==high==low==close==0.:
                _open = None
                high = None
                low = None
                close = None
            data.append([code, deli, _open, high, low, close, settle, volume, oi, turnover])
        return data
    
    if exchange=='shfe':
        return shfe(rawdata)
    elif exchange=='dce':
        return dce(rawdata)
    elif exchange=='czce':
        return czce(rawdata)
    else:
        assert False

# 导入数据库

In [4]:
# 将一天的数据插入到数据库
def load(day, data):
    conn = pymysql.connect(host='127.0.0.1', user='root', password='19910501', db='market', charset='utf8')
    cursor = conn.cursor()
    clean_sql = "delete from contract_daily where date=%s and vari=%s and deli=%s"
    insert_sql = 'insert into contract_daily (date,vari,deli,open,high,low,close,settle,volume,oi,turnover) values (' + '%s,'*10 + '%s' + ')'
    for row in data:
        # 清理旧数据
        cursor.execute(clean_sql, tuple([day]+row[:2]))
        conn.commit()
        # 插入新数据
        cursor.execute(insert_sql, tuple([day]+row))
        conn.commit()
    cursor.close()
    conn.close()

# main函数

In [5]:
day = datetime.date(2013,1,1)
end = datetime.date.today()
step = datetime.timedelta(1)
i = 0
total = 5*365 + 1
while day<=end:
    if day.isoweekday not in [6,7]:
        rawdata = getDataFromWeb(day, 'czce')
        if rawdata:
            data = process(rawdata, 'czce')
            load(day,data)
    i += 1
    if i%50==0:
        print (1. * i / total)
    day += step
print ('finish')

0.027382256297918947
0.054764512595837894
0.08214676889375684
0.10952902519167579
0.13691128148959475
0.16429353778751368
0.19167579408543264
0.21905805038335158
0.24644030668127054
0.2738225629791895
0.30120481927710846
0.32858707557502737
0.3559693318729463
0.3833515881708653
0.41073384446878425
0.43811610076670315
0.4654983570646221
0.4928806133625411
0.52026286966046
0.547645125958379
0.5750273822562979
0.6024096385542169
0.6297918948521358
0.6571741511500547
0.6845564074479737
0.7119386637458927
0.7393209200438116
0.7667031763417306
0.7940854326396495
0.8214676889375685
0.8488499452354874
0.8762322015334063
0.9036144578313253
0.9309967141292442
0.9583789704271632
0.9857612267250822
1.013143483023001
finish
