def search_gzh_info(self, name, page=1):
    """Search official accounts by keyword and parse one page of results.

    Args:
        name: search keyword.
        page: number of the result page to fetch.

    Returns:
        A list with one dict per ``<li>`` found under
        ``<ul class="news-list2">``.  Each dict has the keys:

        url: first ``href`` of the item's header link ('' if absent).
        img: ``src`` of the image inside that link, i.e. the account
            avatar ('' if absent).
        name: account name, with the 'red_beg'/'red_end' markers removed.
        wechatid: text that follows the "微信号:" label ('' if absent).
        post_perm: integer that follows the "月发文" label; 0 when the
            label is missing or its value is not numeric.
        read_count: integer that follows the "平均阅读" label; 0 when the
            label is missing or its value is not numeric.
        qrcode: ``src`` of the QR-code image ('' if absent).
        introduction: account introduction, with the 'red_beg'/'red_end'
            markers removed.
        authentication: verification text; '' means not verified.
        pubarticle: '是' when the item lists a recently published
            article, otherwise '否'.
    """
    def first(values):
        # XPath returns a (possibly empty) list; fall back to ''.
        return values[0] if values else ''

    def elem_text(nodes):
        # Text of the first matched element, or '' when nothing matched,
        # so a single malformed item cannot abort the whole page.
        return self._get_elem_text(nodes[0]) if nodes else ''

    def to_int(value):
        # int() raises ValueError on '' or non-numeric text and TypeError
        # on None; both simply mean "no usable number here".
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    id_label = u'微信号:'
    posts_label = u'月发文'
    reads_label = u'平均阅读'
    # The capturing group makes re.split() keep the labels, so every value
    # can be looked up by the label that precedes it rather than by a
    # position that shifts when a label is missing or two labels touch.
    label_re = re.compile(u'(微信号:|月发文|篇|平均阅读)')

    text = self._search_gzh_text(name, page)
    tree = etree.HTML(text)

    results = []
    for li in tree.xpath('//ul[@class="news-list2"]/li'):
        url = li.xpath('div/div[1]/a/@href')
        img = li.xpath('div/div[1]/a/img/@src')
        qrcode = li.xpath('div/div[3]/span/img[1]/@src')
        renzhen = li.xpath('dl[2]/dd/text()')
        # Present only when the account has a recently published article.
        pubarticle = li.xpath('dl[3]/dd/text()')

        gzh_name = elem_text(li.xpath('div/div[2]/p[1]'))
        info = elem_text(li.xpath('div/div[2]/p[2]'))
        jieshao = elem_text(li.xpath('dl[1]/dd'))

        # Pattern and subject must both be text: mixing a text pattern with
        # a bytes subject raises TypeError on Python 3.
        if isinstance(info, bytes):
            info = info.decode('utf8', 'ignore')
        tokens = label_re.split(info)
        # tokens == [leading, label, value, label, value, ...]
        fields = dict(zip(tokens[1::2], tokens[2::2]))

        results.append({
            'url': first(url),
            'img': first(img),
            'name': gzh_name.replace('red_beg', '').replace('red_end', ''),
            'wechatid': fields.get(id_label, ''),
            'post_perm': to_int(fields.get(posts_label)),
            'read_count': to_int(fields.get(reads_label)),
            'qrcode': first(qrcode),
            'introduction': jieshao.replace('red_beg', '').replace('red_end', ''),
            'authentication': first(renzhen),
            'pubarticle': '是' if pubarticle else '否'
        })
    return results