In [1]:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import re
from pyecharts import Pie,Bar

### 先用正则匹配出 法院公告的每个星期的请求URL

In [2]:
# Collect the absolute URL of every weekly notice linked from the six index
# pages of the court's announcement list.
SITE_ROOT = 'http://xnqfy.chinacourt.gov.cn'
INDEX_PAGE = SITE_ROOT + '/article/index/id/M0g3NzAwNjAwMiACAAA/page/{}.shtml'
# The dot before "shtml" is escaped so it matches a literal "." only.
DETAIL_PATH = re.compile(r'/article/detail/[0-9]{4}/[0-9]{2}/id/[0-9]{7}\.shtml')

URLs = []
for i in range(1, 7):
    URL = INDEX_PAGE.format(i)
    # A timeout keeps one unresponsive page from hanging the cell forever.
    html = requests.get(URL, timeout=30)
    # Links are kept in page order; repeated links are not removed here.
    URLs.extend(SITE_ROOT + path for path in DETAIL_PATH.findall(html.text))
    

### 解析请求URL，两种解析架构，分别对应新旧不同类型的网站架构

In [3]:
def parse_url_1(url):
    """Parse a notice page whose hearings are written as ``<p>`` paragraphs.

    The text of every ``<p>`` element is read in document order. A paragraph
    that consists of exactly one non-breaking space (U+00A0) closes the
    current record; every other paragraph is treated as a ``label：value``
    field of that record. Paragraphs after the last separator are ignored.

    Parameters
    ----------
    url : str
        Address of one notice detail page.

    Returns
    -------
    pandas.DataFrame
        One row per record, restricted to the columns 案号, 案由 and 开庭时间.
        A record that lacks one of these fields gets NaN in that column.

    Raises
    ------
    ValueError
        If the page contains no separator-delimited record.
    KeyError
        If one of the three expected labels appears in no record at all.
    requests.RequestException
        If the page cannot be fetched (including a timeout).
    """
    html = requests.get(url, timeout=30)
    # Name the parser explicitly: without it bs4 warns and silently picks
    # whichever parser is installed, so results can differ between machines.
    # "lxml" is the parser bs4 auto-selected when this notebook was last run.
    soup = bs(html.text, 'lxml')

    # Split the paragraph texts into records at every U+00A0-only paragraph.
    groups = []
    current = []
    for paragraph in soup.select('p'):
        text = paragraph.text
        if text == '\xa0':
            groups.append(current)
            current = []
        else:
            current.append(text)

    # Drop empty records at the end (consecutive trailing separators).
    while groups and not groups[-1]:
        groups.pop()
    if not groups:
        raise ValueError('no paragraph records found in {}'.format(url))

    frames = []
    for group in groups:
        columns = []
        values = []
        for element in group:
            # Split at the FIRST colon (half- or full-width) only, so a value
            # that itself contains a colon, e.g. a time "9:30", stays intact.
            # Without any colon both key and value are the whole paragraph.
            parts = re.split('[:：]', element, maxsplit=1)
            columns.append(parts[0])
            values.append(parts[-1])
        row = np.array(values).reshape(1, len(columns))
        frames.append(pd.DataFrame(data=row, columns=columns))

    df = pd.concat(frames)
    return df[['案号', '案由', '开庭时间']]


def parse_url_2(url):
    """Parse a notice page by regex-matching ``label：value`` pairs in its HTML.

    Ideographic spaces (U+3000) are removed from the raw page text, then every
    ``label：value`` match is sorted into case numbers, causes and hearing
    times according to which keyword the matched text contains.

    Parameters
    ----------
    url : str
        Address of one notice detail page.

    Returns
    -------
    pandas.DataFrame
        Columns 案号, 案由 and 开庭时间; each cell still holds the full matched
        text, label included.

    Raises
    ------
    ValueError
        Raised by pandas when the three lists differ in length
        (NOTE(review): relies on the DataFrame constructor's own check).
    requests.RequestException
        If the page cannot be fetched (including a timeout).
    """
    html = requests.get(url, timeout=30)
    page_text = html.text.replace('\u3000', '')
    # Raw string: "\w" in a normal literal is an invalid escape sequence that
    # newer Python versions warn about.
    items = re.findall(r"[\w]+：（?[\w]+）?[\w]*", page_text)

    # The three filters are independent, so one match may land in several lists.
    caseIDs = [item for item in items if '案号' in item]
    causes = [item for item in items if "案由" in item]
    dates = [item for item in items if '开庭时间' in item]

    return pd.DataFrame({'案号': caseIDs, "案由": causes, '开庭时间': dates})
    


### 对每个星期的公告进行两种不同的解析

In [4]:
# Run both parsers on every notice page and keep whatever each one returns.
# A parser that does not fit a page's layout raises; that is expected, so the
# failure is recorded and the loop moves on to the next parser or page.
DFs = []
parse_failures = []  # (url, parser name, error text) for later inspection
for url in URLs:
    for parser in (parse_url_1, parse_url_2):
        try:
            DFs.append(parser(url))
        except Exception as exc:
            # `Exception` rather than a bare `except`, so Ctrl-C
            # (KeyboardInterrupt) still stops the loop.
            parse_failures.append((url, parser.__name__, repr(exc)))
        



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


### 把数据结果汇总， 并去重。因为部分页面解析了两次，得到的数据会重复

In [5]:
# Stack all per-page frames, drop incomplete records and exact duplicates.
# `drop=True` matters: a plain reset_index() keeps the old per-frame row
# labels as an extra "index" column, which then takes part in the duplicate
# comparison (so identical records at different positions survive) and also
# ends up in the exported CSV.
df = pd.concat(DFs, ignore_index=True)
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)

### 数据清洗

In [6]:
def clear_data(data):
    """Return the text after the last full-width colon in ``data``.

    A string without a full-width colon is returned unchanged.
    """
    _head, _sep, tail = data.rpartition('：')
    return tail

# Strip the "label：" prefix from the three case fields.
CASE_FIELDS = ['开庭时间', '案号', '案由']
for field in CASE_FIELDS:
    df[field] = df[field].apply(clear_data)

# De-duplicate again now that the labels are gone: before this step a record
# from the regex parser ("案号：X") and the same record from the paragraph
# parser ("X") could never compare equal.
df = df.drop_duplicates(subset=CASE_FIELDS)

In [7]:
def marry_yyyymm(date):
    """Extract the hearing month from a date string as ``YYYY年MM月``.

    The first ``<4 digits>年<digits>月`` occurrence in ``date`` is used. The
    month is normalised numerically to at least two digits, so "4" and
    over-padded input such as "004" both become "04".

    Parameters
    ----------
    date : str
        Free-text hearing time, e.g. "2014年4月1日".

    Returns
    -------
    str
        The normalised ``YYYY年MM月`` text, or the literal string 'null' when
        ``date`` is not a string or contains no year/month pattern.
    """
    if not isinstance(date, str):
        return 'null'
    found = re.search('([0-9]{4})年([0-9]+)月', date)
    if found is None:
        return 'null'
    year, month = found.groups()
    return '{}年{:02d}月'.format(year, int(month))
    
# Derive the hearing month (YYYY年MM月) from the free-text hearing time.
df['开庭年月'] = df['开庭时间'].map(marry_yyyymm)

# Patch two malformed months known to occur in the source notices.
month_typos = (
    ('2014年004月', '2014年04月'),
    ('2013年00月', '2013年10月'),
)
for wrong, right in month_typos:
    df['开庭年月'] = df['开庭年月'].map(lambda yyyymm: yyyymm.replace(wrong, right))

# Cause type = cause of action with the generic word "纠纷" removed.
df['案由类型'] = df['案由'].map(lambda cause: cause.replace('纠纷', ""))

In [None]:
def _strip_nbsp(cell):
    """Return ``cell`` without non-breaking spaces; non-strings pass through."""
    return cell.replace('\xa0', '') if isinstance(cell, str) else cell


# Remove U+00A0 from every text cell before export; the GBK codec used below
# cannot encode that character. Each column is mapped as a whole instead of
# assigning cell by cell, and the explicit type check replaces the former
# bare `except`.
for column in df.columns:
    df[column] = df[column].map(_strip_nbsp)

df.to_csv('data.csv', index=False, encoding='gbk')

### 数据分析并可视化

In [8]:
# Cause types: keep the 15 most frequent ones, pool everything else.
keys = df['案由类型'].value_counts().index[:15]
def label_cause(cause):
    """Return ``cause`` if it is one of the top ``keys``, otherwise "其他"."""
    return cause if cause in keys else "其他"

df['label_案由类型'] = df['案由类型'].map(label_cause)

# Frequency of each labelled cause type: labels go to `attr`, counts to `value`.
label_counts = df['label_案由类型'].value_counts()
attr = label_counts.index
value = label_counts

# Ring-shaped pie chart with a vertical legend on the right-hand side.
pie = Pie('纠纷类型', title_pos='center', width=900, height=600)
pie_options = dict(
    center=[50, 50],
    is_label_show=True,
    is_legend_show=True,
    legend_orient="vertical",
    legend_pos="right",
    radius=[40, 75],
)
pie.add('', attr, value, **pie_options)
pie.render()