# 1. 爬虫

In [1]:
import requests
import numpy as np
import pandas as pd
import cufflinks as cf
from bs4 import BeautifulSoup

In [2]:
# Page that is downloaded and parsed in the following cells.
# NOTE(review): the "5000" in the path presumably selects the latest 5000 draws
# (the chart titles below say so) — confirm against the site.
url = 'https://zst.cjcp.com.cn/cjwk3/view/kuai3_zonghe-js-3-5000.html'

In [3]:
# Download the page. The timeout stops the cell from hanging forever when the
# server does not answer, and raise_for_status() fails here on a 4xx/5xx reply
# instead of letting an error page be parsed as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [4]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(markup=response.text, features='lxml')

In [5]:
# Every tr element under the tbody whose id is "pagedata"; the next cell turns
# each of these rows into one record.
trs = soup.select('tbody#pagedata tr')

In [6]:
def _row_to_record(row):
    """Turn one table row into [issue, number, sum, size, parity].

    Issue, number and sum are kept as the raw cell text. Size is '小' when the
    sum is at most 10 and '大' otherwise; parity is '偶数' for an even sum and
    '奇数' for an odd one.
    """
    # The issue is read from the second cell with class z_bg_05, not the first.
    issue_text = row.select('td.z_bg_05')[1].get_text()
    number_text = row.select('td.z_bg_13')[0].get_text()
    sum_text = row.select('td.bg3')[0].get_text()

    total = int(sum_text)
    size_label = '大' if total > 10 else '小'
    parity_label = '奇数' if total % 2 else '偶数'
    return [issue_text, number_text, sum_text, size_label, parity_label]


records = [_row_to_record(row) for row in trs]

In [7]:
# Wrap the scraped rows in a DataFrame, one named column per field.
records = pd.DataFrame(
    data=records,
    columns=['期号', '号码', '和值', '大小', '奇偶'],
)

In [8]:
# The sum was collected as text; cast it to float so it is treated as numeric.
records = records.astype({'和值': float})

In [9]:
# Preview the first five rows.
records.head(n=5)

Unnamed: 0,期号,号码,和值,大小,奇偶
0,20200419001,144,9.0,小,奇数
1,20200419002,123,6.0,小,偶数
2,20200419003,111,3.0,小,奇数
3,20200419004,256,13.0,大,奇数
4,20200419005,115,7.0,小,奇数


In [10]:
# Preview the last five rows.
records.tail(n=5)

Unnamed: 0,期号,号码,和值,大小,奇偶
4995,20200818037,125,8.0,小,偶数
4996,20200818038,136,10.0,小,偶数
4997,20200818039,146,11.0,大,奇数
4998,20200818040,146,11.0,大,奇数
4999,20200818041,144,9.0,小,奇数


In [11]:
# Show the dtype of every column.
records.dtypes

期号     object
号码     object
和值    float64
大小     object
奇偶     object
dtype: object

In [12]:
# Optional export of the scraped table to Excel, left disabled.
# NOTE(review): the target is a machine-specific absolute Windows path; point it
# at a location that exists on your machine before enabling.
# records.to_excel(r'D:\浏览器下载\最近5000期数据.xlsx')

# 2. 数据分析报告

数值列摘要

In [13]:
# Summary statistics; with default arguments describe() reports on the numeric column(s).
records.describe()

Unnamed: 0,和值
count,5000.0
mean,10.486
std,2.950041
min,3.0
25%,8.0
50%,11.0
75%,13.0
max,18.0


非数值列摘要

In [14]:
# Count / unique / top / freq summary for the two label columns.
label_columns = ['大小', '奇偶']
records.loc[:, label_columns].describe()

Unnamed: 0,大小,奇偶
count,5000,5000
unique,2,2
top,大,偶数
freq,2517,2546


## 2.1 和值

频数分布及占比

In [15]:
# Frequency and share of each sum value, ordered by the sum itself.
sum_counts = records['和值'].value_counts()
sum_shares = records['和值'].value_counts(normalize=True)

# keys= names the two columns directly as the Series are placed side by side.
result = pd.concat([sum_counts, sum_shares], axis=1, keys=['频数', '占比']).sort_index()

# Show the share as a percentage string with one decimal place.
result['占比'] = result['占比'].map('{:.1%}'.format)

In [16]:
# Display the frequency/share table for the sum values built in the previous cell.
result

Unnamed: 0,频数,占比
3.0,25,0.5%
4.0,75,1.5%
5.0,145,2.9%
6.0,222,4.4%
7.0,331,6.6%
8.0,516,10.3%
9.0,545,10.9%
10.0,624,12.5%
11.0,641,12.8%
12.0,606,12.1%


In [17]:
# Interactive bar chart of how often each sum value occurred.
sum_frequency = records['和值'].value_counts()
sum_frequency.iplot(
    kind='bar',
    xTitle='和值',
    yTitle='频数',
    title='近5000期中各和值出现的频数',
    theme='white',
    colorscale='set1',
)

## 2.2 大小

In [18]:
# Frequency and share of the two size labels, ordered by label.
size_counts = records['大小'].value_counts()
size_shares = records['大小'].value_counts(normalize=True)

# keys= names the two columns directly as the Series are placed side by side.
result = pd.concat([size_counts, size_shares], axis=1, keys=['频数', '占比']).sort_index()

# Show the share as a percentage string with one decimal place.
result['占比'] = result['占比'].map('{:.1%}'.format)

In [19]:
# Display the frequency/share table for the size labels built in the previous cell.
result

Unnamed: 0,频数,占比
大,2517,50.3%
小,2483,49.7%


In [20]:
# Interactive bar chart of how often each size label occurred.
size_frequency = records['大小'].value_counts()
size_frequency.iplot(
    kind='bar',
    xTitle='大小',
    yTitle='频数',
    title='近5000期中大小值出现的频数',
    theme='white',
    colorscale='-set1',
)

## 2.3 奇偶

In [21]:
# Raw count of each parity label.
records['奇偶'].value_counts()

偶数    2546
奇数    2454
Name: 奇偶, dtype: int64

In [22]:
# Frequency and share of the two parity labels, ordered by label.
parity_counts = records['奇偶'].value_counts()
parity_shares = records['奇偶'].value_counts(normalize=True)

# keys= names the two columns directly as the Series are placed side by side.
result = pd.concat([parity_counts, parity_shares], axis=1, keys=['频数', '占比']).sort_index()

# Show the share as a percentage string with one decimal place.
result['占比'] = result['占比'].map('{:.1%}'.format)

In [23]:
# Display the frequency/share table for the parity labels built in the previous cell.
result

Unnamed: 0,频数,占比
偶数,2546,50.9%
奇数,2454,49.1%


In [24]:
# Interactive bar chart of how often each parity label occurred.
parity_frequency = records['奇偶'].value_counts()
parity_frequency.iplot(
    kind='bar',
    xTitle='奇偶',
    yTitle='频数',
    title='近5000期中奇偶值出现的频数',
    theme='white',
)