### 排列三爬取

注： 本程序仅用于爬虫课程学习练手用，原始需求来自同公司老彩民。
- 需求：据资深老彩民转述，排列三玩法具有参与容易，数字简单的优势，比较有可能借助于数据分析的方式预测结果。
- 后期在机器学习课程内容的基础上，对号码的和值，大小，单双分别统计，并计算每年内最早出现次数，最晚出现次数，平均出现次数等。

#### 程序简介：
- 首先爬取排列三所有期的日期，期号和开奖号码
  - 信息来源： www.lottery.gov.cn/historykj/history.jspx?_ltype=pls
- 基于老彩民的想法，爬取北京日出日落时间作为辅助数据。
  - 信息来源： https://richurimo.51240.com/beijing__time__2018_12__richurimo/

#### 主要思路和体会：
##### 排列三抓取
- requests: sends the HTTP requests and receives the responses
- csv: writes the output file
- BeautifulSoup: parses the HTML
- time: `time.sleep` is used to imitate human browsing behavior
- random: generates the random delays passed to `time.sleep`
---
- 在排列三号码抓取中，使用了传统的requests库来获取html文本。由于requests无法执行js脚本，所以在解析文本时，是通过抓取js函数中的参数实现的。
- 在实际抓取排列三过程中，由于全部数据为252页。开始时在headers里面只填写了Accept和User_Agent，但发现爬取到20页左右就会被服务器拦截而获取response失败。所以在headers里面加入了更多的内容，使request更像人为完成的。
- 在解析网页时使用了beautifulsoup，并且尝试使用了yield关键词
---
##### 北京日出日落时间抓取
- selenium 库用于模拟浏览器，并且Driver.get会自动加载全部信息，js也在其中被执行。
- 解析的过程同样使用selenium的find_element_by_xpath
- output使用了pandas库的dataframe.to_csv(),额外查找了mode属性，可以变覆盖文档为增加内容。

#### 排列三爬取代码如下：

In [None]:
import requests
import csv
from bs4 import BeautifulSoup
import time
import random

def fetchUrl(url):
    '''
    Func: fetchUrl downloads one HTML page and returns its text.
    Parameter:
        url : the url of the page to download
    Return : the decoded page content (req.text), or None when the request
             fails (the error is printed instead of being raised).
    '''
    # Imitate a browser. The server tries to identify spiders and refuses
    # their requests: with only Accept and User-Agent the crawl was blocked
    # after about 20 pages, with the extra fields below after about 80 pages.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # The header name uses a hyphen. With the key 'User_Agent' requests
        # still sends its own default "python-requests" User-Agent.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0',
        'Cookie': 'Hm_lvt_8929ffae85e1c07a7ded061329fbf441=1542295767; Hm_lpvt_8929ffae85e1c07a7ded061329fbf441=1542295773; JSESSIONID=CB0636740D359AE52357E4B8266B76E7',
        'Connection': 'keep-alive',
        # The HTTP header is spelled "Referer" (single r).
        'Referer': 'www.google.com',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Cache-Control': 'max-age=0',
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        req = requests.get(url, headers=headers, timeout=30)
        # Turn HTTP error statuses (4xx/5xx) into exceptions.
        req.raise_for_status()
        # Decode the body with the encoding detected from its content.
        req.encoding = req.apparent_encoding
        return req.text
    except requests.RequestException as e:
        # Best effort: report the failure and let the caller skip this page.
        print(e)
        return None


def output(dList):
    '''
    Func : append one row to the csv file; the file name carries today's
           date (yymmdd) as a timestamp.
    Parameters:
        dList: the values of one row, stored in a list
    '''
    import datetime
    today = datetime.date.today().strftime('%y%m%d')
    filename = '排列三数据_' + today + '.csv'
    # newline='' lets the csv module control the line endings itself (without
    # it an extra blank line appears between rows on Windows); the explicit
    # encoding makes the file independent of the platform default.
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerow(dList)
    
def parsePlsHtml(html):
    '''
    Func: parsePlsHtml parses one result page and yields one dict per table row.
    Parameters:
        html: the page content as text; None or an empty string yields nothing
    Yield: dict with the keys 'date', 'number', 'digit1', 'digit2', 'digit3'
    '''
    if not html:
        # Nothing to parse, e.g. the download of this page failed.
        return
    try:
        soup = BeautifulSoup(html, 'lxml')  # BeautifulSoup as the parsing tool
        # Skip the first two rows (table headers) and the last row.
        rows = soup.select('tr')[2:-1]
    except Exception as e:
        print(e)
        return

    for row in rows:
        cells = row.select('td')
        try:
            # The three digits share one cell, separated by whitespace.
            digits = cells[1].text.split()
            record = {
                'date': cells[10].text,
                'number': cells[0].text,
                'digit1': digits[0],
                'digit2': digits[1],
                'digit3': digits[2],
            }
        except IndexError as e:
            # A row with an unexpected layout is reported and skipped so the
            # remaining rows of the page are still returned.
            print(e)
            continue
        yield record

def main():
    '''
    Func: script entry point; starts the lottery crawl by calling Spider().
    '''
    Spider()
    
def Spider(startPage=1, endPage=252):
    '''
    Func: crawl the history pages one by one and append every draw to the csv file.
    Parameters:
        startPage: number of the first page to fetch (default 1)
        endPage: number of the last page to fetch, inclusive (default 252,
                 the total number of pages noted in the write-up above)
    '''
    # Each row is written as: Date, No, Digit1, Digit2, Digit3.
    # No header row is written to the file.
    for page in range(startPage, endPage + 1):
        # Random pause of up to one second between requests to look less like a spider.
        time.sleep(random.random())
        print('Page %d'%page)
        url = 'http://www.lottery.gov.cn/historykj/history_%d.jspx?_ltype=pls'%page
        htmlPls = fetchUrl(url)
        if not htmlPls:
            # The download failed (fetchUrl already printed the error); go on
            # with the next page.
            continue
        for item in parsePlsHtml(htmlPls):
            output([
                item['date'],
                item['number'],
                item['digit1'],
                item['digit2'],
                item['digit3'],
            ])
            
        
# Start the crawl only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

#### 爬取北京市日出日落时间代码：


In [None]:
import time
from selenium import webdriver

def openUrl(driver,url,year,month):
    '''
    Func : open the url with the selenium driver and return the text lines
           of the first <tr> element found on the page.
    Parameters:
        driver: webdriver
        url: the page to open
        year: year, only used in the progress message
        month: month, only used in the progress message
    Return: the element text split into lines, without the first and the last line
    '''
    # Imported here because the find_element_by_* helpers were removed in
    # Selenium 4.3; find_element(By.XPATH, ...) works on old and new versions.
    from selenium.webdriver.common.by import By

    driver.get(url)
    # The pause only imitates a human visitor; driver.get() is relied on to
    # have loaded the page already.
    time.sleep(1)
    element = driver.find_element(By.XPATH, "//tr")
    # One list entry per line of the element's text.
    lines = element.text.split('\n')
    print('Gathering data for year : %d-%d'%(year,month))
    # The first line is the header; the last line is dropped as well
    # (NOTE(review): presumably a footer row - confirm against the page).
    return lines[1:-1]
            
def output(dList,year,month):
    '''
    Func : append the collected rows to a csv file whose name carries
           today's date (yymmdd) as a timestamp.
    Parameters:
        dList: list whose items are each written as one block of rows
        year: year, only used in the progress message
        month: month, only used in the progress message
    '''
    import datetime
    import pandas as pd

    stamp = datetime.date.today().strftime('%y%m%d')
    target = 'BeijingSunRiseSunDown_' + stamp + '.csv'
    # Every item is turned into its own DataFrame so that each of its
    # entries ends up on a separate line of the file.
    for monthRows in dList:
        # mode='a' appends to the file instead of overwriting it.
        pd.DataFrame(monthRows).to_csv(
            target,
            mode='a',
            sep=',',
            header=False,
            index=False,
        )
    print('data has been added of %d-%d'%(year,month))
        
def spider():
    '''
    Func : main function. Asks for a start and an end year, then fetches the
           Beijing sunrise/sunset page of every month in that range
           (end year included) and appends each month to the csv file.
    '''
    # A little interaction
    startYear = int(input('please input start year'))
    print("\n")
    endYear = int(input('please input end year'))
    # Run Chrome headless, otherwise a browser window keeps popping up and refreshing.
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("headless")
    # "options" replaces the deprecated "chrome_options" keyword.
    driver = webdriver.Chrome(options=chromeOptions)
    place = "beijing"

    try:
        for year in range(startYear,endYear+1):
            for month in range(1,13):
                url = "https://richurimo.51240.com/%s__time__%d_%d__richurimo/"%(place,year,month)
                monthData = openUrl(driver,url,year,month) # get data from html
                # Hand over only the month just fetched. Passing the list of
                # all months collected so far would append every earlier
                # month to the file again on each call.
                output([monthData],year,month) # output data to csv file.
    finally:
        # Always shut the browser down, also when a page fails to load.
        driver.quit()
        
def main():
    '''
    Func: script entry point; starts the sunrise/sunset crawl by calling spider().
    '''
    spider()
    
# Start the crawl only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()