In [1]:
import re
import time
import datetime
import pymongo
from tqdm import tqdm
import copy
from loguru import logger

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from selenium.common.exceptions import NoSuchElementException,StaleElementReferenceException,TimeoutException

In [2]:
class Tweet:

    def __init__(self, query, uid, ptime, pcontent, padditional, nb_reply, nb_retweet, nb_favorite):
        self.query = query
        self.uid = uid
        self.ptime = ptime
        self.pcontent = pcontent
        self.padditional = padditional  # 转发推文，文章链接，图片，视频
        self.nb_retweet = nb_retweet  # nbr of retweet
        self.nb_favorite = nb_favorite  # nbr of favorite
        self.nb_reply = nb_reply    # nbr of reply

    def __repr__(self):
        return "Tweet={}\nQuery={}".format(self.pcontent, self.query)


class User:

    def __init__(self, profile_url):
        self.profile_url = profile_url
        self.ID = profile_url.split('/')[-1]
        self.name = ''
        self.avatar = ''
        self.query = ''# query相关的大V用户
        self.intro = ''
        

    def __repr__(self):
        return "User {}".format(self.ID)


def compare_time(time1,time2):
    s_time = time.mktime(time.strptime(time1,'%Y年%m月%d日'))
    e_time = time.mktime(time.strptime(time2,'%Y年%m月%d日'))
    return int(s_time) - int(e_time)
    
def convert_time(x):
    '''
    for x in ['20分钟','1小时','1天', '10月10日','2018年10月1日']:
        print(convert_time(x))
    '''
    now = datetime.datetime.now()
    pattern = r'\d{4}年\d+月\d+日'
    if re.match(pattern, x):
        return x
    pattern = r'\d+月\d+日'
    if re.match(pattern, x):
        return "{}年".format(now.year)+x
    return "{}年{}月{}日".format(now.year, now.month, now.day)

def is_non_result(browser):
    '''
    判断结果是否为空
    '''
#     result_div_xpath = "//div[@id='react-root']"
#     wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
#     try:
#         result_div = browser.find_element_by_xpath(result_div_xpath)
#         return "未找到结果" in result_div.text
#     except NoSuchElementException as e:
    return "未找到结果" in browser.find_element_by_tag_name('body').text

def get_search_input_v1(browser):
    # 定位搜索框
    search_input_xpath = "//input[@placeholder='搜索 Twitter']"
    wait.until(EC.presence_of_element_located((By.XPATH, search_input_xpath)))
    search_input = browser.find_element_by_xpath(search_input_xpath)
    return search_input

# def get_search_input_v2(browser):
#     # 请求主站
#     browser.get('https://twitter.com/search-home')
#     # 定位搜索框
#     search_input_id = 'search-home-input'
#     wait.until(EC.presence_of_element_located((By.ID, search_input_id)))
#     search_input = browser.find_element_by_id(search_input_id)
#     return search_input


def extract_reply_retweet_favorite(element):
    t = []
    for x in element.find_elements_by_xpath('./div')[:3]:
        if x.text.strip() == '':
            t.append(0)
        else:
            t.append(int(x.text.strip()))
    return tuple(t)


#### query -> 推文爬取

In [23]:
def parse_tweet_result_div(result_div,query):
    count = 0
    for div in result_div:
        user, tweet = div.find_elements_by_xpath('./div')
        profile_url = user.find_element_by_tag_name(
            'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
        a, *b_c, d = tweet.find_elements_by_xpath('./div')  # 按照div分为>=3层
        ptime = a.find_elements_by_tag_name('a')[-1].text
        ptime = convert_time(ptime)
        nb_reply, nb_retweet, nb_favorite = 0,0,0
        try:
            nb_reply, nb_retweet, nb_favorite = extract_reply_retweet_favorite(
                d)
        except:
            nb_reply, nb_retweet, nb_favorite = 0, 0, 0
        pcontent = b_c[0].text
        padditional = []
        if len(b_c) > 1:
            for x in b_c[1:]:
                try:
                    a = x.find_element_by_tag_name('a').get_attribute('href')
                    padditional.append(a)
                except NoSuchElementException as e:
                    padditional.append(x.text.strip())
        user = User(profile_url)
        tweet = Tweet(query, uid, ptime, pcontent, padditional,
                      nb_reply, nb_retweet, nb_favorite)
        # save to databse
        user_dict = user.__dict__
        user_dict['_id']=user_dict['ID']
        if user_table.update_one({'_id': user_dict['_id']}, {'$set': user_dict}, upsert=True) and tweet_table.insert_one(tweet.__dict__):
            count += 1
    return count

def crawl_tweet(browser, query):
    count = 0
    result_div_xpath = '//div[@data-testid="tweet"]'
    wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
    result_div = browser.find_elements_by_xpath(result_div_xpath)
    last_div = result_div[-1]
    # 解析结果
    count += parse_tweet_result_div(result_div,query)
    while count < MAX_TWEET_SIZE:
#         logger.info("{}/{}".format(count,MAX_TWEET_SIZE))
        result_div_xpath = '//div[@data-testid="tweet"]'
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        result_div = browser.find_elements_by_xpath(result_div_xpath)
        last_div = result_div[-1]
        try:
            count += parse_tweet_result_div(result_div,query)
        except StaleElementReferenceException as e:
            time.sleep(2)
            continue
        
        # 翻页
        try_times = 0
        old_height = browser.execute_script("return document.body.scrollHeight;")
        while True:
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            wait.until(EC.presence_of_element_located(
                (By.XPATH, result_div_xpath)))
            result_div = browser.find_elements_by_xpath(result_div_xpath)
            if result_div[-1] == last_div:
                try_times += 1
            if result_div[-1] != last_div:
                last_div = result_div[-1]
                break
            time.sleep(3)
            new_height = browser.execute_script("return document.body.scrollHeight;")
            if old_height == new_height:
                try_times += 1
                last_div = result_div[-1]
            if try_times >= 3:
                count = MAX_TWEET_SIZE # 到头了停止翻页采集该query
                print('到头了')
                break

def search_tweet_from_query(browser,query_list,finish_query_list):
    '''
    更加query采集推文
    '''
    for query in tqdm(query_list):
        logger.info('query = {}'.format(query))
        browser.get('https://twitter.com/explore')

        # 定位搜索框
        if browser.current_url == 'https://twitter.com/explore':
            search_input = get_search_input_v1(browser)
        else:
            print('error')
            return
        # 搜索query
        search_input.clear()
        search_input.send_keys(query)
        search_input.send_keys(Keys.ENTER)

        # 获取结果
        if is_non_result(browser):
            bad_query_list.append(query)
            continue
        time.sleep(1)
        try:
            crawl_tweet(browser, query)
        except TimeoutException as e:
            print('TimeoutException')
            continue
        finish_query_list.append(query)
    print(bad_query_list)


#### query -> 爬取相关用户

In [20]:
def search_user_from_query(browser,query_list,finish_query_list):
    '''
    根据query采集maxsize用户列表
    '''
    for query in tqdm(query_list):
        logger.info('query = {}'.format(query))
        browser.get('https://twitter.com/explore')

        # 定位搜索框
        if browser.current_url == 'https://twitter.com/explore':
            search_input = get_search_input_v1(browser)
        else:
            print('error')
            return
        # 搜索query
        search_input.clear()
        search_input.send_keys(query)
        search_input.send_keys(Keys.ENTER)

        # 获取结果
        if is_non_result(browser):
            bad_query_list.append(query)
            continue
        time.sleep(2)
        # 请求用户结果页面
        browser.get(browser.current_url + '&f=user')
        
        try:
            crawl_user(browser,query)
        except TimeoutException as e:
            print('TimeoutException')
            continue
        finish_query_list.append(query)
    
    print(bad_query_list)

def parse_user_result_div(result_div,query):
    count = 0
    for t in result_div:
        left, right = t.find_elements_by_xpath('./div/div')
        profile_url = left.find_element_by_tag_name(
            'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
        print('parsing uid = {}'.format(uid))
        avatar = left.find_element_by_tag_name('img').get_attribute('src')
        a, *b = right.find_elements_by_xpath('./div')  # 按照div分为>=3层
        uname= a.text.split('\n')[0]
        intro = ''
        if len(b)!=0:
            intro = b[-1].text.strip()
        
        user = User(profile_url)
        user.ID = uid
        user.avatar = avatar
        user.name = uname
        user.intro = intro
        user.query = query
        
        user_dict = user.__dict__
        user_dict['_id']=user_dict['ID']

        # save to databse
        if user_table.update_one({'_id': user_dict['_id']}, {'$set': user_dict}, upsert=True):
            count += 1
    return count

def crawl_user(browser, query):
    count = 0
    result_div_xpath = '//div[@data-testid="UserCell"]'
    wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
    result_div = browser.find_elements_by_xpath(result_div_xpath)
    last_div = result_div[-1]
    count += parse_user_result_div(result_div,query)
    
    while count < MAX_USER_SIZE:
#         logger.info("{}/{}".format(count,MAX_USER_SIZE))
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        result_div = browser.find_elements_by_xpath(result_div_xpath)
        last_div = result_div[-1]
        
        try:
            count += parse_user_result_div(result_div,query)
            time.sleep(2)
        except StaleElementReferenceException as e:
            time.sleep(2)
            continue
        
        # 翻页
        try_times = 0
        old_height = browser.execute_script("return document.body.scrollHeight;")
        while True:
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            wait.until(EC.presence_of_element_located(
                (By.XPATH, result_div_xpath)))
            result_div = browser.find_elements_by_xpath(result_div_xpath)
            if result_div[-1] == last_div:
                try_times += 1
            if result_div[-1] != last_div:
                last_div = result_div[-1]
                break
            time.sleep(3)
            new_height = browser.execute_script("return document.body.scrollHeight;")
            if old_height == new_height:
                try_times += 1
                last_div = result_div[-1]
            if try_times >= 3:
                count = MAX_TWEET_SIZE # 到头了停止翻页采集该query
                print('到头了')
                break

#### 爬取特定用户的推文（时间间隔内）

In [36]:
def search_tweet_from_profile_v2(browser,query_user_list,finish_user_list):
    for u in tqdm(query_user_list):
        logger.info('user = {}'.format(u['_id']))
        user_profile = 'https://twitter.com/'+ u['_id']
        browser.get(user_profile)
        query = u['query']

        # 获取结果
        if is_non_result(browser):
            bad_query_list.append(query)
            continue
        time.sleep(1)
        try:
            crawl_tweet2(browser,query)
        except TimeoutException as e:
            print('TimeoutException')
            continue
        
        finish_user_list.append(user)
    print(bad_query_list)

def crawl_tweet2(browser, query):
    count = 0
    result_div_xpath = '//div[@data-testid="tweet"]'
    wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
    result_div = browser.find_elements_by_xpath(result_div_xpath)
    last_div = result_div[-1]
    # 解析结果
    count += parse_tweet_from_profile(result_div,query)
    while count < MAX_TWEET_SIZE:
#         logger.info("{}/{}".format(count,MAX_TWEET_SIZE))
        result_div_xpath = '//div[@data-testid="tweet"]'
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        result_div = browser.find_elements_by_xpath(result_div_xpath)
        last_div = result_div[-1]
        try:
            count += parse_tweet_from_profile(result_div,query)
        except StaleElementReferenceException as e:
            time.sleep(2)
            continue
        
        # 翻页
        try_times = 0
        old_height = browser.execute_script("return document.body.scrollHeight;")
        while True:
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            wait.until(EC.presence_of_element_located(
                (By.XPATH, result_div_xpath)))
            result_div = browser.find_elements_by_xpath(result_div_xpath)
            if result_div[-1] == last_div:
                try_times += 1
            if result_div[-1] != last_div:
                last_div = result_div[-1]
                break
            time.sleep(3)
            new_height = browser.execute_script("return document.body.scrollHeight;")
            if old_height == new_height:
                try_times += 1
                last_div = result_div[-1]
            if try_times >= 3:
                count = MAX_TWEET_SIZE # 到头了停止翻页采集该query
                print('到头了')
                break

                
def parse_tweet_from_profile(result_div,query):
    count = 0
    top = 0 # 置顶是个数目
    # 如果存在置顶推文则不考虑时间
    try:
        t = browser.find_elements_by_xpath('//div[@class="css-1dbjc4n r-1habvwh r-1iusvr4 r-16y2uox r-5f2r5o"]')
        top = len(t)
    except NoSuchElementException as e:
        pass
    
    for div in result_div:
        user, tweet = div.find_elements_by_xpath('./div')
        profile_url = user.find_element_by_tag_name(
            'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
        a, *b_c, d = tweet.find_elements_by_xpath('./div')  # 按照div分为>=3层
        ptime = a.find_elements_by_tag_name('a')[-1].text
        ptime = convert_time(ptime)
        
        # 无置顶推文则按照时间过滤
        top -= 1
        if top<0 and compare_time(ptime,time_interval) < 0:
            print('触发时间截止')
            return MAX_TWEET_SIZE # 使其直接达到数目规模，从而停止外层循环
        
        nb_reply, nb_retweet, nb_favorite = 0,0,0
        try:
            nb_reply, nb_retweet, nb_favorite = extract_reply_retweet_favorite(
                d)
        except:
            nb_reply, nb_retweet, nb_favorite = 0, 0, 0
        pcontent = b_c[0].text
        padditional = []
        if len(b_c) > 1:
            for x in b_c[1:]:
                try:
                    a = x.find_element_by_tag_name('a').get_attribute('href')
                    padditional.append(a)
                except NoSuchElementException as e:
                    padditional.append(x.text.strip())
        tweet = Tweet(query, uid, ptime, pcontent, padditional,
                      nb_reply, nb_retweet, nb_favorite)

        if tweet2_table.insert_one(tweet.__dict__):
            count += 1
    return count

#### 启动浏览器并登陆

In [7]:
client = pymongo.MongoClient("mongodb://10.108.17.25:27017/")
twitter_db = client["twitter_v2"]
user_table = twitter_db['user']
tweet_table = twitter_db['tweet_by_query']
tweet2_table = twitter_db['tweet_by_user']

In [21]:
# 打开浏览器
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 100)

# 人工登录
browser.get('https://twitter.com/')

In [9]:
browser.refresh()
time.sleep(2)

##### 采集推文

In [11]:
import pandas as pd

df = pd.read_csv('./projects.csv',encoding='utf-8')
df.columns = ['Project','Country','Type']
query_list = [x.strip() for x in df['Project'].tolist() if len(x.split()) <= 5]
query_list = query_list

# df = pd.read_csv('./policies.csv',encoding='utf-8')
# df.columns = ['P','_','__']
# query_list = [x.strip() for x in df['P'].tolist() if len(x.split()) <= 10]
# query_list = query_list
len(query_list)

108

In [14]:
# 第1步
finish_query_list = []
bad_query_list = []
MAX_TWEET_SIZE = 1000
search_tweet_from_query(browser,query_list,finish_query_list)

# 第2步
# MAX_TWEET_SIZE = 1000
# special_list = [
#     "the belt and road",
#     'One Belt one road',
#     "the Silk Road",
#     'the Silk Road Economic Belt',
#     'Belt and Road Initiative',
#     '21st Century Maritime Silk Road',
#     'Spirit of the Silk Road',
#     'Silk Road Fund',
#     'Silk Road of Green Development'
# ]
# search_tweet_from_query(browser,special_list,finish_query_list)

  0%|          | 0/108 [00:00<?, ?it/s]2019-12-30 23:25:20.596 | INFO     | __main__:search_tweet_from_query:82 - query = Padma Rail Link
  1%|          | 1/108 [00:24<43:19, 24.30s/it]2019-12-30 23:25:44.892 | INFO     | __main__:search_tweet_from_query:82 - query = Lower Sesan Two Hydropower Dam


到头了


  2%|▏         | 2/108 [02:07<1:24:42, 47.95s/it]2019-12-30 23:27:28.043 | INFO     | __main__:search_tweet_from_query:82 - query = Central Asia–China gas pipeline


TimeoutException


  3%|▎         | 3/108 [02:32<1:11:50, 41.05s/it]2019-12-30 23:27:52.989 | INFO     | __main__:search_tweet_from_query:82 - query = Doraleh Multi-Purpose Port


到头了


  4%|▎         | 4/108 [03:01<1:04:58, 37.48s/it]2019-12-30 23:28:22.154 | INFO     | __main__:search_tweet_from_query:82 - query = Khorgos Gateway Dry Port


到头了


  5%|▍         | 5/108 [03:23<56:09, 32.72s/it]  2019-12-30 23:28:43.747 | INFO     | __main__:search_tweet_from_query:82 - query = Forest City


到头了


  6%|▌         | 6/108 [12:54<5:30:22, 194.34s/it]2019-12-30 23:38:15.201 | INFO     | __main__:search_tweet_from_query:82 - query = Melaka Gateway
  6%|▋         | 7/108 [13:44<4:14:00, 150.89s/it]2019-12-30 23:39:04.721 | INFO     | __main__:search_tweet_from_query:82 - query = Pakistan-China – Fiber Optic Project


到头了


  7%|▋         | 8/108 [14:08<3:08:10, 112.90s/it]2019-12-30 23:39:28.976 | INFO     | __main__:search_tweet_from_query:82 - query = Diamer-Bhasha Dam


到头了


  8%|▊         | 9/108 [15:36<2:54:05, 105.51s/it]2019-12-30 23:40:57.239 | INFO     | __main__:search_tweet_from_query:82 - query = Gwadar Port


到头了


  9%|▉         | 10/108 [20:31<4:25:17, 162.42s/it]2019-12-30 23:45:52.464 | INFO     | __main__:search_tweet_from_query:82 - query = Belgrade-Montenegro Bar Port Motorway


到头了


 10%|█         | 11/108 [22:15<3:54:11, 144.86s/it]2019-12-30 23:47:36.341 | INFO     | __main__:search_tweet_from_query:82 - query = Sino-Thai – High-Speed Railway


TimeoutException


 11%|█         | 12/108 [22:37<2:52:39, 107.92s/it]2019-12-30 23:47:58.056 | INFO     | __main__:search_tweet_from_query:82 - query = Colombo South Harbour


到头了


 12%|█▏        | 13/108 [23:01<2:10:58, 82.72s/it] 2019-12-30 23:48:21.990 | INFO     | __main__:search_tweet_from_query:82 - query = Port City Colombo


到头了


 13%|█▎        | 14/108 [24:03<1:59:53, 76.52s/it]2019-12-30 23:49:24.044 | INFO     | __main__:search_tweet_from_query:82 - query = Hambantota Port


到头了


 14%|█▍        | 15/108 [25:50<2:12:53, 85.74s/it]2019-12-30 23:51:11.281 | INFO     | __main__:search_tweet_from_query:82 - query = Single Gauge Trans-Asian Railway


到头了


 15%|█▍        | 16/108 [27:34<2:19:43, 91.12s/it]2019-12-30 23:52:54.972 | INFO     | __main__:search_tweet_from_query:82 - query = Karuma Hydropower Project


TimeoutException


 16%|█▌        | 17/108 [28:09<1:52:37, 74.26s/it]2019-12-30 23:53:29.873 | INFO     | __main__:search_tweet_from_query:82 - query = Pap Angren Railway


到头了


 17%|█▋        | 18/108 [28:29<1:27:13, 58.15s/it]2019-12-30 23:53:50.447 | INFO     | __main__:search_tweet_from_query:82 - query = Budapest–Belgrade Railway


到头了


 18%|█▊        | 19/108 [28:58<1:13:05, 49.27s/it]2019-12-30 23:54:18.998 | INFO     | __main__:search_tweet_from_query:82 - query = Yamal LNG Project


到头了


 19%|█▊        | 20/108 [29:42<1:10:10, 47.85s/it]2019-12-30 23:55:03.523 | INFO     | __main__:search_tweet_from_query:82 - query = Tehran-Mashhad Railway


到头了


 19%|█▉        | 21/108 [30:02<57:15, 39.49s/it]  2019-12-30 23:55:23.519 | INFO     | __main__:search_tweet_from_query:82 - query = Lagos-Calabar Railway


到头了


 20%|██        | 22/108 [30:29<51:09, 35.70s/it]2019-12-30 23:55:50.361 | INFO     | __main__:search_tweet_from_query:82 - query = Lagos-Kano Railway


到头了


 21%|██▏       | 23/108 [30:56<46:40, 32.95s/it]2019-12-30 23:56:16.896 | INFO     | __main__:search_tweet_from_query:82 - query = Chad-Cameroon & Chad-Sudan Railway


到头了


 22%|██▏       | 24/108 [32:39<1:15:45, 54.12s/it]2019-12-30 23:58:00.413 | INFO     | __main__:search_tweet_from_query:82 - query = Addis Ababa Light Rail


TimeoutException


 23%|██▎       | 25/108 [33:13<1:06:29, 48.07s/it]2019-12-30 23:58:34.376 | INFO     | __main__:search_tweet_from_query:82 - query = Benguela Railway


到头了


 24%|██▍       | 26/108 [33:43<58:12, 42.59s/it]  2019-12-30 23:59:04.163 | INFO     | __main__:search_tweet_from_query:82 - query = Abuja-Kaduna Railway


到头了


 25%|██▌       | 27/108 [34:21<55:37, 41.20s/it]2019-12-30 23:59:42.123 | INFO     | __main__:search_tweet_from_query:82 - query = Khartoum-Port Sudan Railway


到头了


 26%|██▌       | 28/108 [34:35<43:52, 32.91s/it]2019-12-30 23:59:55.682 | INFO     | __main__:search_tweet_from_query:82 - query = Djibouti-Ethiopia Railway


到头了


 27%|██▋       | 29/108 [34:57<39:12, 29.78s/it]2019-12-31 00:00:18.174 | INFO     | __main__:search_tweet_from_query:82 - query = Vientane-Boten Railway


到头了


 28%|██▊       | 30/108 [36:41<1:07:27, 51.89s/it]2019-12-31 00:02:01.636 | INFO     | __main__:search_tweet_from_query:82 - query = Savannakhet-Lao Bao Railway


TimeoutException


 29%|██▊       | 31/108 [36:55<52:05, 40.59s/it]  2019-12-31 00:02:15.870 | INFO     | __main__:search_tweet_from_query:82 - query = Bangkok-Nong Khai Railway


到头了


 30%|██▉       | 32/108 [37:15<43:44, 34.53s/it]2019-12-31 00:02:36.268 | INFO     | __main__:search_tweet_from_query:82 - query = Bangkok-Chiang Mai Railway


到头了


 31%|███       | 33/108 [37:37<38:20, 30.68s/it]2019-12-31 00:02:57.956 | INFO     | __main__:search_tweet_from_query:82 - query = Kuala Lumpur-Singapore High Speed Rail


到头了


 31%|███▏      | 34/108 [38:03<36:04, 29.26s/it]2019-12-31 00:03:23.893 | INFO     | __main__:search_tweet_from_query:82 - query = Jakarta-Bandung Railway


到头了


 32%|███▏      | 35/108 [38:52<43:00, 35.34s/it]2019-12-31 00:04:13.443 | INFO     | __main__:search_tweet_from_query:82 - query = East Coast Railway


到头了


 33%|███▎      | 36/108 [40:50<1:11:56, 59.95s/it]2019-12-31 00:06:10.821 | INFO     | __main__:search_tweet_from_query:82 - query = Gemas-Johor Bahru Railway


到头了


 34%|███▍      | 37/108 [41:09<56:35, 47.82s/it]  2019-12-31 00:06:30.319 | INFO     | __main__:search_tweet_from_query:82 - query = Dawei Port


到头了


 35%|███▌      | 38/108 [41:34<47:46, 40.94s/it]2019-12-31 00:06:55.224 | INFO     | __main__:search_tweet_from_query:82 - query = Gujarat Rural Roads (MMGSY) Project


到头了


 36%|███▌      | 39/108 [43:18<1:08:41, 59.74s/it]2019-12-31 00:08:38.810 | INFO     | __main__:search_tweet_from_query:82 - query = Nurek Hydropower Rehabilitation Project


TimeoutException


 37%|███▋      | 40/108 [43:31<51:51, 45.75s/it]  2019-12-31 00:08:51.926 | INFO     | __main__:search_tweet_from_query:82 - query = Batumi Bypass Road Project


到头了


 38%|███▊      | 41/108 [43:45<40:22, 36.15s/it]2019-12-31 00:09:05.680 | INFO     | __main__:search_tweet_from_query:82 - query = Natural Gas Project


到头了


 39%|███▉      | 42/108 [49:24<2:19:56, 127.22s/it]2019-12-31 00:14:45.397 | INFO     | __main__:search_tweet_from_query:82 - query = Tarbela 5 Hydropower Extension Project


到头了


 40%|███▉      | 43/108 [49:44<1:42:59, 95.06s/it] 2019-12-31 00:15:05.423 | INFO     | __main__:search_tweet_from_query:82 - query = M4 Motorway


到头了


 41%|████      | 44/108 [51:13<1:39:11, 93.00s/it]2019-12-31 00:16:33.610 | INFO     | __main__:search_tweet_from_query:82 - query = Dushanbe-Uzbekistan Border Road Improvement


到头了


 42%|████▏     | 45/108 [51:28<1:13:18, 69.81s/it]2019-12-31 00:16:49.310 | INFO     | __main__:search_tweet_from_query:82 - query = Nenskra Hydropower Plant


到头了


 43%|████▎     | 46/108 [51:50<57:18, 55.45s/it]  2019-12-31 00:17:11.269 | INFO     | __main__:search_tweet_from_query:82 - query = Amaravati Sustainable Capital City


到头了


 44%|████▎     | 47/108 [52:11<45:47, 45.03s/it]2019-12-31 00:17:31.986 | INFO     | __main__:search_tweet_from_query:82 - query = Madhya Pradesh Rural Connectivity Project


到头了


 44%|████▍     | 48/108 [52:33<38:02, 38.04s/it]2019-12-31 00:17:53.693 | INFO     | __main__:search_tweet_from_query:82 - query = Mumbai Metro Line 4


到头了


 45%|████▌     | 49/108 [53:20<40:03, 40.74s/it]2019-12-31 00:18:40.730 | INFO     | __main__:search_tweet_from_query:82 - query = Sahiwal 2x660MW Coal-fired Power Plant


到头了


 46%|████▋     | 50/108 [53:40<33:19, 34.48s/it]2019-12-31 00:19:00.612 | INFO     | __main__:search_tweet_from_query:82 - query = UEP 100MW Wind Farm


到头了


 47%|████▋     | 51/108 [53:54<26:58, 28.39s/it]2019-12-31 00:19:14.785 | INFO     | __main__:search_tweet_from_query:82 - query = Sachal 50MW Wind Farm


到头了


 48%|████▊     | 52/108 [54:14<24:10, 25.91s/it]2019-12-31 00:19:34.911 | INFO     | __main__:search_tweet_from_query:82 - query = Peshawar-Karachi Motorway


到头了


 49%|████▉     | 53/108 [54:50<26:41, 29.12s/it]2019-12-31 00:20:11.537 | INFO     | __main__:search_tweet_from_query:82 - query = Havelian Dry Port


到头了


 50%|█████     | 54/108 [55:20<26:18, 29.23s/it]2019-12-31 00:20:41.026 | INFO     | __main__:search_tweet_from_query:82 - query = Gwadar International Airport


到头了


 51%|█████     | 55/108 [56:02<29:09, 33.00s/it]2019-12-31 00:21:22.826 | INFO     | __main__:search_tweet_from_query:82 - query = Myitsone Dam


到头了


 52%|█████▏    | 56/108 [56:53<33:23, 38.53s/it]2019-12-31 00:22:14.245 | INFO     | __main__:search_tweet_from_query:82 - query = Balloki Power Plant


到头了


 53%|█████▎    | 57/108 [57:41<35:01, 41.20s/it]2019-12-31 00:23:01.694 | INFO     | __main__:search_tweet_from_query:82 - query = Gadani Power Project


到头了


 54%|█████▎    | 58/108 [58:06<30:23, 36.46s/it]2019-12-31 00:23:27.086 | INFO     | __main__:search_tweet_from_query:82 - query = Hakla–Dera Ismail Khan Motorway


到头了


 55%|█████▍    | 59/108 [58:28<26:08, 32.00s/it]2019-12-31 00:23:48.689 | INFO     | __main__:search_tweet_from_query:82 - query = Khunjerab Railway


到头了


 56%|█████▌    | 60/108 [58:47<22:28, 28.09s/it]2019-12-31 00:24:07.631 | INFO     | __main__:search_tweet_from_query:82 - query = M5 Motorway


到头了


 56%|█████▋    | 61/108 [1:00:31<40:00, 51.07s/it]2019-12-31 00:25:52.320 | INFO     | __main__:search_tweet_from_query:82 - query = M8 Motorway


到头了


 57%|█████▋    | 62/108 [1:01:02<34:30, 45.02s/it]2019-12-31 00:26:23.229 | INFO     | __main__:search_tweet_from_query:82 - query = Matiari–Lahore Transmission Line


到头了


 58%|█████▊    | 63/108 [1:01:26<29:00, 38.67s/it]2019-12-31 00:26:47.099 | INFO     | __main__:search_tweet_from_query:82 - query = Orange Line Lahore Metro


到头了


 59%|█████▉    | 64/108 [1:01:54<26:05, 35.59s/it]2019-12-31 00:27:15.492 | INFO     | __main__:search_tweet_from_query:82 - query = Pak-China Technical and Vocational Institute


到头了


 60%|██████    | 65/108 [1:02:31<25:41, 35.85s/it]2019-12-31 00:27:51.942 | INFO     | __main__:search_tweet_from_query:82 - query = Pakistan Port Qasim Power Project


到头了


 61%|██████    | 66/108 [1:02:59<23:32, 33.63s/it]2019-12-31 00:28:20.385 | INFO     | __main__:search_tweet_from_query:82 - query = Quaid-e-Azam Solar Park


到头了


 62%|██████▏   | 67/108 [1:03:37<23:53, 34.97s/it]2019-12-31 00:28:58.503 | INFO     | __main__:search_tweet_from_query:82 - query = Karakoram Highway


到头了


 63%|██████▎   | 68/108 [1:06:12<47:19, 70.99s/it]2019-12-31 00:31:33.515 | INFO     | __main__:search_tweet_from_query:82 - query = Sahiwal Coal Power Project


到头了


 64%|██████▍   | 69/108 [1:06:48<39:11, 60.30s/it]2019-12-31 00:32:08.896 | INFO     | __main__:search_tweet_from_query:82 - query = Suki Kinari Hydropower Project


到头了


 65%|██████▍   | 70/108 [1:07:18<32:29, 51.31s/it]2019-12-31 00:32:39.206 | INFO     | __main__:search_tweet_from_query:82 - query = Nigcomsat Satellites


到头了


 66%|██████▌   | 71/108 [1:07:40<26:12, 42.50s/it]2019-12-31 00:33:01.143 | INFO     | __main__:search_tweet_from_query:82 - query = MNC Lido City


到头了


 67%|██████▋   | 72/108 [1:08:08<22:50, 38.06s/it]2019-12-31 00:33:28.853 | INFO     | __main__:search_tweet_from_query:82 - query = Harare Airport Expansion


到头了


 68%|██████▊   | 73/108 [1:08:29<19:19, 33.13s/it]2019-12-31 00:33:50.478 | INFO     | __main__:search_tweet_from_query:82 - query = Gilgit KIU Hydropower


到头了


 69%|██████▊   | 74/108 [1:08:43<15:29, 27.34s/it]2019-12-31 00:34:04.313 | INFO     | __main__:search_tweet_from_query:82 - query = Cacho 50MW Wind Power Project


到头了


 69%|██████▉   | 75/108 [1:08:57<12:47, 23.26s/it]2019-12-31 00:34:18.043 | INFO     | __main__:search_tweet_from_query:82 - query = Rahimyar Khan Power Plant


到头了


 70%|███████   | 76/108 [1:09:11<10:51, 20.37s/it]2019-12-31 00:34:31.666 | INFO     | __main__:search_tweet_from_query:82 - query = Kohala Hydel Project


到头了


 71%|███████▏  | 77/108 [1:09:33<10:48, 20.93s/it]2019-12-31 00:34:53.911 | INFO     | __main__:search_tweet_from_query:82 - query = Phandar Hydropower Station


到头了


 72%|███████▏  | 78/108 [1:09:46<09:17, 18.57s/it]2019-12-31 00:35:06.985 | INFO     | __main__:search_tweet_from_query:82 - query = Karachi Circular Railway


到头了


 73%|███████▎  | 79/108 [1:10:44<14:42, 30.44s/it]2019-12-31 00:36:05.112 | INFO     | __main__:search_tweet_from_query:82 - query = Greater Peshawar Mass Transit


到头了


 74%|███████▍  | 80/108 [1:11:03<12:38, 27.11s/it]2019-12-31 00:36:24.438 | INFO     | __main__:search_tweet_from_query:82 - query = Quetta Mass Transit


到头了


 75%|███████▌  | 81/108 [1:11:36<12:55, 28.73s/it]2019-12-31 00:36:56.973 | INFO     | __main__:search_tweet_from_query:82 - query = Keti BUnder Sea Port Project


到头了


 76%|███████▌  | 82/108 [1:11:49<10:28, 24.16s/it]2019-12-31 00:37:10.453 | INFO     | __main__:search_tweet_from_query:82 - query = Rashakai Economic Zone


到头了


 77%|███████▋  | 83/108 [1:12:20<10:51, 26.05s/it]2019-12-31 00:37:40.903 | INFO     | __main__:search_tweet_from_query:82 - query = China Special Economic Zone Dhabeji


到头了


 78%|███████▊  | 84/108 [1:12:39<09:35, 23.97s/it]2019-12-31 00:38:00.018 | INFO     | __main__:search_tweet_from_query:82 - query = Bostan Industrial Zone


到头了


 79%|███████▊  | 85/108 [1:13:00<08:50, 23.07s/it]2019-12-31 00:38:20.984 | INFO     | __main__:search_tweet_from_query:82 - query = Allama Iqbal Industrial City


到头了


 80%|███████▉  | 86/108 [1:13:29<09:10, 25.03s/it]2019-12-31 00:38:50.581 | INFO     | __main__:search_tweet_from_query:82 - query = ICT Model Industrial Zone


到头了


 81%|████████  | 87/108 [1:13:43<07:31, 21.48s/it]2019-12-31 00:39:03.791 | INFO     | __main__:search_tweet_from_query:82 - query = Mirpur Special Economic Zone


到头了


 81%|████████▏ | 88/108 [1:14:04<07:09, 21.49s/it]2019-12-31 00:39:25.287 | INFO     | __main__:search_tweet_from_query:82 - query = Mohmand Marble City


到头了


 82%|████████▏ | 89/108 [1:14:45<08:35, 27.14s/it]2019-12-31 00:40:05.612 | INFO     | __main__:search_tweet_from_query:82 - query = Moqpondass Special Economic Zone


到头了


 83%|████████▎ | 90/108 [1:14:57<06:51, 22.84s/it]2019-12-31 00:40:18.431 | INFO     | __main__:search_tweet_from_query:82 - query = Haifa Port


到头了


 84%|████████▍ | 91/108 [1:15:18<06:14, 22.04s/it]2019-12-31 00:40:38.601 | INFO     | __main__:search_tweet_from_query:82 - query = Port of Piraeus


到头了


 85%|████████▌ | 92/108 [1:16:33<10:08, 38.05s/it]2019-12-31 00:41:54.024 | INFO     | __main__:search_tweet_from_query:82 - query = Kumport Terminal


到头了


 86%|████████▌ | 93/108 [1:16:55<08:18, 33.21s/it]2019-12-31 00:42:15.932 | INFO     | __main__:search_tweet_from_query:82 - query = Suez Canal Economic Zone


到头了


 87%|████████▋ | 94/108 [1:17:36<08:17, 35.50s/it]2019-12-31 00:42:56.782 | INFO     | __main__:search_tweet_from_query:82 - query = Kyaukpyu Deep Sea Tanker Port


到头了


 88%|████████▊ | 95/108 [1:19:20<12:08, 56.00s/it]2019-12-31 00:44:40.620 | INFO     | __main__:search_tweet_from_query:82 - query = Kyaukpyu Special Economic Zone


TimeoutException


 89%|████████▉ | 96/108 [1:19:39<08:59, 44.96s/it]2019-12-31 00:44:59.807 | INFO     | __main__:search_tweet_from_query:82 - query = Port Aktau


到头了


 90%|████████▉ | 97/108 [1:20:04<07:09, 39.03s/it]2019-12-31 00:45:25.015 | INFO     | __main__:search_tweet_from_query:82 - query = “Khorgos – Eastern Gate”


到头了


 91%|█████████ | 98/108 [1:20:23<05:29, 32.93s/it]2019-12-31 00:45:43.701 | INFO     | __main__:search_tweet_from_query:82 - query = Khalifa Port Terminal 2


到头了


 92%|█████████▏| 99/108 [1:20:43<04:22, 29.20s/it]2019-12-31 00:46:04.213 | INFO     | __main__:search_tweet_from_query:82 - query = Greater Peshawar Region Mass Transit


到头了


 93%|█████████▎| 100/108 [1:20:58<03:19, 24.88s/it]2019-12-31 00:46:19.018 | INFO     | __main__:search_tweet_from_query:82 - query = Dhaka-Chattogram Rail Route


到头了


 94%|█████████▎| 101/108 [1:21:14<02:35, 22.15s/it]2019-12-31 00:46:34.773 | INFO     | __main__:search_tweet_from_query:82 - query = Kuala Tanjung Port


到头了


 94%|█████████▍| 102/108 [1:21:39<02:18, 23.08s/it]2019-12-31 00:47:00.042 | INFO     | __main__:search_tweet_from_query:82 - query = Kayan River Hydropower Plant


到头了


 95%|█████████▌| 103/108 [1:21:52<01:40, 20.19s/it]2019-12-31 00:47:13.485 | INFO     | __main__:search_tweet_from_query:82 - query = Lake Toba Tourism District


到头了


 96%|█████████▋| 104/108 [1:22:05<01:11, 17.90s/it]2019-12-31 00:47:26.037 | INFO     | __main__:search_tweet_from_query:82 - query = International Airport Lembeh


到头了


 97%|█████████▋| 105/108 [1:22:20<00:50, 16.91s/it]2019-12-31 00:47:40.638 | INFO     | __main__:search_tweet_from_query:82 - query = Dammam Riyadh Freight Line


到头了


 98%|█████████▊| 106/108 [1:22:37<00:34, 17.17s/it]2019-12-31 00:47:58.419 | INFO     | __main__:search_tweet_from_query:82 - query = Hassyan Clean Coal Project


到头了


 99%|█████████▉| 107/108 [1:23:02<00:19, 19.56s/it]2019-12-31 00:48:23.542 | INFO     | __main__:search_tweet_from_query:82 - query = Muse-Mandalay Railway


到头了


100%|██████████| 108/108 [1:23:31<00:00, 46.40s/it]

到头了
[]





In [25]:
len(finish_query_list)

2

In [18]:
browser.refresh()
time.sleep(4)

##### 采集用户

In [22]:
finish_query_list = []
bad_query_list = []
MAX_USER_SIZE = 50
special_list = [
    "belt and road",
    "One Belt one road",
    'Belt and Road Initiative'
]
search_user_from_query(browser,special_list,finish_query_list)


  0%|          | 0/2 [00:00<?, ?it/s][A2019-12-31 10:34:58.260 | INFO     | __main__:search_user_from_query:6 - query = One Belt one road


parsing uid = beltandroadnow
parsing uid = BirKusakBirYol
parsing uid = 1GadgetBoy
parsing uid = TheBeltandRoad
parsing uid = OBORVC
parsing uid = OneBeltOneRoad_
parsing uid = OneBelt_OneRoad
parsing uid = obormalaysia
parsing uid = SilkRoadLAC
parsing uid = OneBeltProject
parsing uid = VasilGelev
parsing uid = beltandroadnow
parsing uid = BirKusakBirYol
parsing uid = 1GadgetBoy
parsing uid = TheBeltandRoad
parsing uid = OBORVC
parsing uid = OneBeltOneRoad_
parsing uid = OneBelt_OneRoad
parsing uid = obormalaysia
parsing uid = SilkRoadLAC
parsing uid = OneBeltProject
parsing uid = VasilGelev



 50%|█████     | 1/2 [00:14<00:14, 14.90s/it][A2019-12-31 10:35:13.159 | INFO     | __main__:search_user_from_query:6 - query = Belt and Road Initiative


到头了
parsing uid = BeltNRoad
parsing uid = Belt_Road_China
parsing uid = beltroadnews
parsing uid = CPECWire
parsing uid = TorrinWilkins
parsing uid = khaleefah30
parsing uid = BRItv_Plus
parsing uid = TJMa_beijing
parsing uid = VasilGelev
parsing uid = jeremy_garlick
parsing uid = jordiyang_srcic
parsing uid = BeltandRoadBRC
parsing uid = BeltNRoad
parsing uid = Belt_Road_China
parsing uid = beltroadnews
parsing uid = CPECWire
parsing uid = TorrinWilkins
parsing uid = khaleefah30
parsing uid = BRItv_Plus
parsing uid = TJMa_beijing
parsing uid = VasilGelev
parsing uid = jeremy_garlick
parsing uid = jordiyang_srcic
parsing uid = BeltandRoadBRC
parsing uid = OBOReurope
parsing uid = eyckfreymann



100%|██████████| 2/2 [00:29<00:00, 14.68s/it][A

到头了
[]





##### 采集用户主页推文

In [None]:
finish_user_list = []
bad_query_list = []

query_user_list = [u for u in user_table.find() if u['query']!='']# query 不为空的user

print(len(query_user_list))

MAX_TWEET_SIZE = 1000
time_interval = "2019年1月1日" # 默认截止到今日
search_tweet_from_profile_v2(browser,query_user_list,finish_user_list)


  0%|          | 0/41 [00:00<?, ?it/s][A2019-12-31 10:41:00.348 | INFO     | __main__:search_tweet_from_profile_v2:3 - user = ErikSolheim


41


41
