In [42]:
import re
import time
import datetime
import pymongo
from tqdm import tqdm
import copy
from loguru import logger

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from selenium.common.exceptions import NoSuchElementException,StaleElementReferenceException,TimeoutException

In [None]:
class Tweet:

    def __init__(self, query, uid, ptime, pcontent, padditional, nb_reply, nb_retweet, nb_favorite):
        self.query = query
        self.uid = uid
        self.ptime = ptime
        self.pcontent = pcontent
        self.padditional = padditional  # 转发推文，文章链接，图片，视频
        self.nb_retweet = nb_retweet  # nbr of retweet
        self.nb_favorite = nb_favorite  # nbr of favorite
        self.nb_reply = nb_reply    # nbr of reply

    def __repr__(self):
        return "Tweet={}\nQuery={}".format(self.pcontent, self.query)


class User:

    def __init__(self, profile_url):
        self.profile_url = profile_url
        self.ID = profile_url.split('/')[-1]
        self.name = ''
        self.avatar = ''
        self.query = ''# query相关的大V用户
        self.intro = ''
        

    def __repr__(self):
        return "User {}".format(self.ID)


def compare_time(time1,time2):
    s_time = time.mktime(time.strptime(time1,'%Y年%m月%d日'))
    e_time = time.mktime(time.strptime(time2,'%Y年%m月%d日'))
    return int(s_time) - int(e_time)
    
def convert_time(x):
    '''
    for x in ['20分钟','1小时','1天', '10月10日','2018年10月1日']:
        print(convert_time(x))
    '''
    now = datetime.datetime.now()
    pattern = r'\d{4}年\d+月\d+日'
    if re.match(pattern, x):
        return x
    pattern = r'\d+月\d+日'
    if re.match(pattern, x):
        return "{}年".format(now.year)+x
    return "{}年{}月{}日".format(now.year, now.month, now.day)

def is_non_result(browser):
    '''
    判断结果是否为空
    '''
#     result_div_xpath = "//div[@id='react-root']"
#     wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
#     try:
#         result_div = browser.find_element_by_xpath(result_div_xpath)
#         return "未找到结果" in result_div.text
#     except NoSuchElementException as e:
    return "未找到结果" in browser.find_element_by_tag_name('body').text

def get_search_input_v1(browser):
    # 定位搜索框
    search_input_xpath = "//input[@placeholder='搜索 Twitter']"
    wait.until(EC.presence_of_element_located((By.XPATH, search_input_xpath)))
    search_input = browser.find_element_by_xpath(search_input_xpath)
    return search_input

# def get_search_input_v2(browser):
#     # 请求主站
#     browser.get('https://twitter.com/search-home')
#     # 定位搜索框
#     search_input_id = 'search-home-input'
#     wait.until(EC.presence_of_element_located((By.ID, search_input_id)))
#     search_input = browser.find_element_by_id(search_input_id)
#     return search_input


def extract_reply_retweet_favorite(element):
    t = []
    for x in element.find_elements_by_xpath('./div')[:3]:
        if x.text.strip() == '':
            t.append(0)
        else:
            t.append(int(x.text.strip()))
    return tuple(t)


#### query -> 推文爬取

In [55]:
def parse_tweet_result_div(result_div,query):
    count = 0
    for div in result_div:
        user, tweet = div.find_elements_by_xpath('./div')
        profile_url = user.find_element_by_tag_name(
            'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
        a, *b_c, d = tweet.find_elements_by_xpath('./div')  # 按照div分为>=3层
        ptime = a.find_elements_by_tag_name('a')[-1].text
        ptime = convert_time(ptime)
        nb_reply, nb_retweet, nb_favorite = 0,0,0
        try:
            nb_reply, nb_retweet, nb_favorite = extract_reply_retweet_favorite(
                d)
        except:
            nb_reply, nb_retweet, nb_favorite = 0, 0, 0
        pcontent = b_c[0].text
        padditional = []
        if len(b_c) > 1:
            for x in b_c[1:]:
                try:
                    a = x.find_element_by_tag_name('a').get_attribute('href')
                    padditional.append(a)
                except NoSuchElementException as e:
                    padditional.append(x.text.strip())
        user = User(profile_url)
        tweet = Tweet(query, uid, ptime, pcontent, padditional,
                      nb_reply, nb_retweet, nb_favorite)
        # save to databse
        user_dict = user.__dict__
        user_dict['_id']=user_dict['ID']
        if user_table.update_one({'_id': user_dict['_id']}, {'$set': user_dict}, upsert=True) and tweet_table.insert_one(tweet.__dict__):
            count += 1
    return count

def crawl_tweet(browser, query):
    count = 0
    result_div_xpath = '//div[@data-testid="tweet"]'
    wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
    result_div = browser.find_elements_by_xpath(result_div_xpath)
    last_div = result_div[-1]
    # 解析结果
    count += parse_tweet_result_div(result_div,query)
    while count < MAX_TWEET_SIZE:
#         logger.info("{}/{}".format(count,MAX_TWEET_SIZE))
        result_div_xpath = '//div[@data-testid="tweet"]'
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        result_div = browser.find_elements_by_xpath(result_div_xpath)
        last_div = result_div[-1]
        try:
            count += parse_tweet_result_div(result_div,query)
        except StaleElementReferenceException as e:
            time.sleep(2)
            continue
        
        # 翻页
        try_times = 0
        old_height = browser.execute_script("return document.body.scrollHeight;")
        while True:
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            wait.until(EC.presence_of_element_located(
                (By.XPATH, result_div_xpath)))
            result_div = browser.find_elements_by_xpath(result_div_xpath)
            if result_div[-1] != last_div:
                last_div = result_div[-1]
                break
            time.sleep(3)
            new_height = browser.execute_script("return document.body.scrollHeight;")
            if old_height == new_height:
                try_times += 1
            if try_times >= 3:
                count = MAX_TWEET_SIZE # 到头了停止翻页采集该query
                print('到头了')
                break

def search_tweet_from_query(browser,query_list,finish_query_list):
    '''
    更加query采集推文
    '''
    for query in tqdm(query_list):
        logger.info('query = {}'.format(query))
        browser.get('https://twitter.com/explore')

        # 定位搜索框
        if browser.current_url == 'https://twitter.com/explore':
            search_input = get_search_input_v1(browser)
        else:
            print('error')
            return
        # 搜索query
        search_input.clear()
        search_input.send_keys(query)
        search_input.send_keys(Keys.ENTER)

        # 获取结果
        if is_non_result(browser):
            bad_query_list.append(query)
            continue
        time.sleep(1)
        try:
            crawl_tweet(browser, query)
        except TimeoutException as e:
            print('TimeoutException')
            continue
        finish_query_list.append(query)
    print(bad_query_list)


#### query -> 爬取相关用户

In [63]:
def search_user_from_query(browser,query_list,finish_query_list):
    '''
    根据query采集maxsize用户列表
    '''
    for query in tqdm(query_list):
        logger.info('query = {}'.format(query))
        browser.get('https://twitter.com/explore')

        # 定位搜索框
        if browser.current_url == 'https://twitter.com/explore':
            search_input = get_search_input_v1(browser)
        else:
            print('error')
            return
        # 搜索query
        search_input.clear()
        search_input.send_keys(query)
        search_input.send_keys(Keys.ENTER)

        # 获取结果
        if is_non_result(browser):
            bad_query_list.append(query)
            continue
        time.sleep(2)
        # 请求用户结果页面
        browser.get(browser.current_url + '&f=user')
        
        try:
            crawl_user(browser,query)
        except TimeoutException as e:
            print('TimeoutException')
            continue
        finish_query_list.append(query)
    
    print(bad_query_list)

def parse_user_result_div(result_div,query):
    count = 0
    for t in result_div:
        left, right = t.find_elements_by_xpath('./div/div')
        profile_url = left.find_element_by_tag_name(
            'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
        print('parsing uid = {}'.format(uid))
        avatar = left.find_element_by_tag_name('img').get_attribute('src')
        a, *b = right.find_elements_by_xpath('./div')  # 按照div分为>=3层
        uname= a.text.split('\n')[0]
        intro = ''
        if len(b)!=0:
            intro = b[-1].text.strip()
        
        user = User(profile_url)
        user.ID = uid
        user.avatar = avatar
        user.name = uname
        user.intro = intro
        user.query = query
        
        user_dict = user.__dict__
        user_dict['_id']=user_dict['ID']

        # save to databse
        if user_table.update_one({'_id': user_dict['_id']}, {'$set': user_dict}, upsert=True):
            count += 1
    return count

def crawl_user(browser, query):
    count = 0
    result_div_xpath = '//div[@data-testid="UserCell"]'
    wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
    result_div = browser.find_elements_by_xpath(result_div_xpath)
    last_div = result_div[-1]
    count += parse_user_result_div(result_div,query)
    
    while count < MAX_USER_SIZE:
#         logger.info("{}/{}".format(count,MAX_USER_SIZE))
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        result_div = browser.find_elements_by_xpath(result_div_xpath)
        last_div = result_div[-1]
        
        try:
            count += parse_user_result_div(result_div,query)
            time.sleep(2)
        except StaleElementReferenceException as e:
            time.sleep(2)
            continue
        
        # 翻页
        try_times = 0
        old_height = browser.execute_script("return document.body.scrollHeight;")
        while True:
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            wait.until(EC.presence_of_element_located(
                (By.XPATH, result_div_xpath)))
            result_div = browser.find_elements_by_xpath(result_div_xpath)
            if result_div[-1] != last_div:
                last_div = result_div[-1]
                break
            time.sleep(3)
            new_height = browser.execute_script("return document.body.scrollHeight;")
            if old_height == new_height:
                try_times += 1
            if try_times >= 3:
                count = MAX_TWEET_SIZE # 到头了停止翻页采集该query
                print('到头了')
                break

#### 爬取特定用户的推文（时间间隔内）

In [64]:
def search_tweet_from_profile(browser,query_user_list,finish_user_list):
    for user in tqdm(query_user_list):
        logger.info('user = {}'.format(u['_id']))
        user_profile = 'https://twitter.com/'+ u['_id']
        browser.get(user_profile)

        # 获取结果
        if is_non_result(browser):
            bad_query_list.append(query)
            continue
        time.sleep(1)
        try:
            crawl_tweet2(browser,query)
        except TimeoutException as e:
            print('TimeoutException')
            continue
        
        finish_user_list.append(user)
    print(bad_query_list)

def crawl_tweet2(browser, query):
    count = 0
    result_div_xpath = '//div[@data-testid="tweet"]'
    wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
    result_div = browser.find_elements_by_xpath(result_div_xpath)
    last_div = result_div[-1]
    # 解析结果
    count += parse_tweet_from_profile(result_div,query)
    while count < MAX_TWEET_SIZE:
#         logger.info("{}/{}".format(count,MAX_TWEET_SIZE))
        result_div_xpath = '//div[@data-testid="tweet"]'
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        result_div = browser.find_elements_by_xpath(result_div_xpath)
        last_div = result_div[-1]
        try:
            count += parse_tweet_from_profile(result_div,query)
        except StaleElementReferenceException as e:
            time.sleep(2)
            continue
        
        # 翻页
        try_times = 0
        old_height = browser.execute_script("return document.body.scrollHeight;")
        while True:
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            wait.until(EC.presence_of_element_located(
                (By.XPATH, result_div_xpath)))
            result_div = browser.find_elements_by_xpath(result_div_xpath)
            if result_div[-1] != last_div:
                last_div = result_div[-1]
                break
            time.sleep(3)
            new_height = browser.execute_script("return document.body.scrollHeight;")
            if old_height == new_height:
                try_times += 1
            if try_times >= 3:
                count = MAX_TWEET_SIZE # 到头了停止翻页采集该query
                print('到头了')
                break

                
def parse_tweet_from_profile(result_div,query):
    count = 0
    top = 0 # 置顶是个数目
    # 如果存在置顶推文则不考虑时间
    try:
        t = browser.find_elements_by_xpath('//div[@class="css-1dbjc4n r-1habvwh r-1iusvr4 r-16y2uox r-5f2r5o"]')
        top = len(t)
    except NoSuchElementException as e:
        pass
    
    for div in result_div:
        user, tweet = div.find_elements_by_xpath('./div')
        profile_url = user.find_element_by_tag_name(
            'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
        a, *b_c, d = tweet.find_elements_by_xpath('./div')  # 按照div分为>=3层
        ptime = a.find_elements_by_tag_name('a')[-1].text
        ptime = convert_time(ptime)
        
        # 无置顶推文则按照时间过滤
        top -= 1
        if top<0 and compare_time(ptime,time_interval) < 0:
            print('触发时间截止')
            return MAX_TWEET_SIZE # 使其直接达到数目规模，从而停止外层循环
        
        nb_reply, nb_retweet, nb_favorite = 0,0,0
        try:
            nb_reply, nb_retweet, nb_favorite = extract_reply_retweet_favorite(
                d)
        except:
            nb_reply, nb_retweet, nb_favorite = 0, 0, 0
        pcontent = b_c[0].text
        padditional = []
        if len(b_c) > 1:
            for x in b_c[1:]:
                try:
                    a = x.find_element_by_tag_name('a').get_attribute('href')
                    padditional.append(a)
                except NoSuchElementException as e:
                    padditional.append(x.text.strip())
        tweet = Tweet(query, uid, ptime, pcontent, padditional,
                      nb_reply, nb_retweet, nb_favorite)

        if tweet_table.insert_one(tweet.__dict__):
            count += 1
    return count

#### 启动浏览器并登陆

In [35]:
client = pymongo.MongoClient("mongodb://10.108.17.25:27017/")
twitter_db = client["twitter"]
user_table = twitter_db['user']
tweet_table = twitter_db['tweet']

# 打开浏览器
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 100)

# 人工登录
browser.get('https://twitter.com/')

In [65]:
browser.refresh()
time.sleep(2)

##### 采集推文

In [61]:
import pandas as pd

df = pd.read_csv('./projects.csv',encoding='utf-8')
df.columns = ['Project','Country','Type']
query_list = [x.strip() for x in df['Project'].tolist() if len(x.split()) <= 5]
query_list = query_list

# df = pd.read_csv('./policies.csv',encoding='utf-8')
# df.columns = ['P','_','__']
# query_list = [x.strip() for x in df['P'].tolist() if len(x.split()) <= 10]
# query_list = query_list
len(query_list)

108

In [None]:
# finish_query_list = []
# bad_query_list = []
# MAX_TWEET_SIZE = 1000
# search_tweet_from_query(browser,query_list,finish_query_list)

MAX_TWEET_SIZE = 1000
special_list = ["the belt and road",'One Belt one road',"the Silk Road",'the Silk Road Economic Belt']
search_tweet_from_query(browser,special_list[1:],finish_query_list)

In [62]:
len(finish_query_list)

11

##### 采集用户

In [None]:
finish_query_list = []
bad_query_list = []
MAX_USER_SIZE = 20
search_user_from_query(browser,query_list,finish_query_list)







  0%|          | 0/108 [00:00<?, ?it/s][A[A[A[A[A[A2019-12-21 12:27:27.368 | INFO     | __main__:search_user_from_query:6 - query = Padma Rail Link






  1%|          | 1/108 [01:43<3:05:05, 103.79s/it][A[A[A[A[A[A2019-12-21 12:29:11.156 | INFO     | __main__:search_user_from_query:6 - query = Lower Sesan Two Hydropower Dam


TimeoutException








  2%|▏         | 2/108 [03:27<3:03:30, 103.87s/it][A[A[A[A[A[A2019-12-21 12:30:55.225 | INFO     | __main__:search_user_from_query:6 - query = Central Asia–China gas pipeline


TimeoutException








  3%|▎         | 3/108 [05:12<3:02:05, 104.05s/it][A[A[A[A[A[A2019-12-21 12:32:39.690 | INFO     | __main__:search_user_from_query:6 - query = Doraleh Multi-Purpose Port


TimeoutException
parsing uid = DoralehS
parsing uid = DoralehS








  4%|▎         | 4/108 [05:29<2:15:12, 78.01s/it] [A[A[A[A[A[A2019-12-21 12:32:56.931 | INFO     | __main__:search_user_from_query:6 - query = Khorgos Gateway Dry Port


到头了








  5%|▍         | 5/108 [07:13<2:27:29, 85.92s/it][A[A[A[A[A[A2019-12-21 12:34:41.306 | INFO     | __main__:search_user_from_query:6 - query = Forest City


TimeoutException
parsing uid = knm0q0
parsing uid = FCRSD
parsing uid = FCIndianFB
parsing uid = ForestCityTDC
parsing uid = ForestCityIA
parsing uid = ForestCityPD
parsing uid = ForestCityFooty
parsing uid = ForestCtComicon
parsing uid = forestcityvelo
parsing uid = ForestCityDerby
parsing uid = LakeForestCA
parsing uid = ForestCityOwls
parsing uid = FCfilmfestival
parsing uid = knm0q0
parsing uid = FCRSD
parsing uid = FCIndianFB
parsing uid = ForestCityTDC
parsing uid = ForestCityIA
parsing uid = ForestCityPD
parsing uid = ForestCityFooty
parsing uid = ForestCtComicon
parsing uid = forestcityvelo
parsing uid = ForestCityDerby
parsing uid = LakeForestCA
parsing uid = ForestCityOwls
parsing uid = FCfilmfestival
parsing uid = ForestCityLovrs
parsing uid = EppingForestCol
parsing uid = ForestCityRaces
parsing uid = OurCityForest
parsing uid = FCNGolf
parsing uid = ForestCityCook
parsing uid = CLE_Jared
parsing uid = FC_Summit
parsing uid = ForestLake_MN
parsing uid = ForestCityAFC
parsin







  6%|▌         | 6/108 [07:30<1:50:33, 65.04s/it][A[A[A[A[A[A2019-12-21 12:34:57.624 | INFO     | __main__:search_user_from_query:6 - query = Melaka Gateway


parsing uid = GatewayMelaka
parsing uid = jelitasara
parsing uid = MelakaGateway
parsing uid = GatewayMelaka
parsing uid = jelitasara
parsing uid = MelakaGateway








  6%|▋         | 7/108 [07:50<1:26:53, 51.61s/it][A[A[A[A[A[A2019-12-21 12:35:17.918 | INFO     | __main__:search_user_from_query:6 - query = Pakistan-China – Fiber Optic Project


到头了








  7%|▋         | 8/108 [09:35<1:52:51, 67.71s/it][A[A[A[A[A[A2019-12-21 12:37:03.194 | INFO     | __main__:search_user_from_query:6 - query = Diamer-Bhasha Dam


TimeoutException








  8%|▊         | 9/108 [11:19<2:09:45, 78.65s/it][A[A[A[A[A[A2019-12-21 12:38:47.350 | INFO     | __main__:search_user_from_query:6 - query = Gwadar Port


TimeoutException
parsing uid = port_gwadar
parsing uid = Gwadarpost
parsing uid = GwadarPort1
parsing uid = gwadar_port
parsing uid = GwadarPortCity_
parsing uid = gwadarport
parsing uid = gwadar_portcity
parsing uid = GwadarFreePort
parsing uid = GwadarPortCity
parsing uid = Gwadar_Sea_Port
parsing uid = GWADAR_LOGISTIC
parsing uid = gwadarcentral
parsing uid = gwadarport13
parsing uid = PortGwadar
parsing uid = Gwadarport_
parsing uid = port_gwadar
parsing uid = Gwadarpost
parsing uid = GwadarPort1
parsing uid = gwadar_port
parsing uid = GwadarPortCity_
parsing uid = gwadarport
parsing uid = gwadar_portcity
parsing uid = GwadarFreePort
parsing uid = GwadarPortCity
parsing uid = Gwadar_Sea_Port
parsing uid = GWADAR_LOGISTIC
parsing uid = gwadarcentral
parsing uid = gwadarport13
parsing uid = PortGwadar
parsing uid = Gwadarport_
parsing uid = GovGwadar
parsing uid = Gwadarport2030
parsing uid = SaiyedShahzad








  9%|▉         | 10/108 [11:38<1:39:07, 60.69s/it][A[A[A[A[A[A2019-12-21 12:39:06.151 | INFO     | __main__:search_user_from_query:6 - query = Belgrade-Montenegro Bar Port Motorway


到头了
parsing uid = grad_beograd
parsing uid = beogradEU
parsing uid = grad_beograd
parsing uid = beogradEU








 10%|█         | 11/108 [11:55<1:16:53, 47.56s/it][A[A[A[A[A[A2019-12-21 12:39:23.057 | INFO     | __main__:search_user_from_query:6 - query = Sino-Thai – High-Speed Railway


到头了








 11%|█         | 12/108 [13:39<1:43:06, 64.45s/it][A[A[A[A[A[A2019-12-21 12:41:06.917 | INFO     | __main__:search_user_from_query:6 - query = Colombo South Harbour


TimeoutException








 12%|█▏        | 13/108 [15:23<2:00:52, 76.34s/it][A[A[A[A[A[A2019-12-21 12:42:51.022 | INFO     | __main__:search_user_from_query:6 - query = Port City Colombo


TimeoutException
parsing uid = PortCityColombo
parsing uid = Kassapas
parsing uid = ColomboPortCity
parsing uid = PortCityColombo
parsing uid = Kassapas
parsing uid = ColomboPortCity








 13%|█▎        | 14/108 [15:41<1:31:55, 58.67s/it][A[A[A[A[A[A2019-12-21 12:43:08.452 | INFO     | __main__:search_user_from_query:6 - query = Hambantota Port


到头了








 14%|█▍        | 15/108 [17:25<1:52:15, 72.43s/it][A[A[A[A[A[A2019-12-21 12:44:52.987 | INFO     | __main__:search_user_from_query:6 - query = Single Gauge Trans-Asian Railway


TimeoutException








 15%|█▍        | 16/108 [19:09<2:05:32, 81.88s/it][A[A[A[A[A[A2019-12-21 12:46:36.902 | INFO     | __main__:search_user_from_query:6 - query = Karuma Hydropower Project


TimeoutException








 16%|█▌        | 17/108 [20:53<2:14:11, 88.48s/it][A[A[A[A[A[A2019-12-21 12:48:20.778 | INFO     | __main__:search_user_from_query:6 - query = Pap Angren Railway


TimeoutException








 17%|█▋        | 18/108 [22:37<2:19:38, 93.09s/it][A[A[A[A[A[A2019-12-21 12:50:04.647 | INFO     | __main__:search_user_from_query:6 - query = Budapest–Belgrade Railway


TimeoutException
parsing uid = grad_beograd
parsing uid = beogradEU
parsing uid = grad_beograd
parsing uid = beogradEU








 18%|█▊        | 19/108 [22:53<1:43:59, 70.10s/it][A[A[A[A[A[A2019-12-21 12:50:21.110 | INFO     | __main__:search_user_from_query:6 - query = Yamal LNG Project


到头了








 19%|█▊        | 20/108 [24:38<1:58:05, 80.52s/it][A[A[A[A[A[A2019-12-21 12:52:05.933 | INFO     | __main__:search_user_from_query:6 - query = Tehran-Mashhad Railway


TimeoutException








 19%|█▉        | 21/108 [26:22<2:07:05, 87.65s/it][A[A[A[A[A[A2019-12-21 12:53:50.212 | INFO     | __main__:search_user_from_query:6 - query = Lagos-Calabar Railway


TimeoutException








 20%|██        | 22/108 [28:07<2:12:51, 92.69s/it][A[A[A[A[A[A2019-12-21 12:55:34.656 | INFO     | __main__:search_user_from_query:6 - query = Lagos-Kano Railway


TimeoutException








 21%|██▏       | 23/108 [29:51<2:16:15, 96.18s/it][A[A[A[A[A[A2019-12-21 12:57:18.981 | INFO     | __main__:search_user_from_query:6 - query = Chad-Cameroon & Chad-Sudan Railway


TimeoutException








 22%|██▏       | 24/108 [31:36<2:18:14, 98.75s/it][A[A[A[A[A[A2019-12-21 12:59:03.716 | INFO     | __main__:search_user_from_query:6 - query = Addis Ababa Light Rail


TimeoutException
parsing uid = LightRailAddis
parsing uid = ercaalrts
parsing uid = LightRailAddis
parsing uid = ercaalrts








 23%|██▎       | 25/108 [31:52<1:42:26, 74.05s/it][A[A[A[A[A[A2019-12-21 12:59:20.139 | INFO     | __main__:search_user_from_query:6 - query = Benguela Railway


到头了








 24%|██▍       | 26/108 [33:36<1:53:24, 82.98s/it][A[A[A[A[A[A2019-12-21 13:01:03.957 | INFO     | __main__:search_user_from_query:6 - query = Abuja-Kaduna Railway


TimeoutException








 25%|██▌       | 27/108 [35:20<2:00:36, 89.34s/it][A[A[A[A[A[A2019-12-21 13:02:48.138 | INFO     | __main__:search_user_from_query:6 - query = Khartoum-Port Sudan Railway


TimeoutException








 26%|██▌       | 28/108 [37:04<2:04:58, 93.74s/it][A[A[A[A[A[A2019-12-21 13:04:32.135 | INFO     | __main__:search_user_from_query:6 - query = Djibouti-Ethiopia Railway


TimeoutException








 27%|██▋       | 29/108 [38:48<2:07:30, 96.84s/it][A[A[A[A[A[A2019-12-21 13:06:16.210 | INFO     | __main__:search_user_from_query:6 - query = Vientane-Boten Railway


TimeoutException








 28%|██▊       | 30/108 [40:32<2:08:40, 98.98s/it][A[A[A[A[A[A2019-12-21 13:08:00.180 | INFO     | __main__:search_user_from_query:6 - query = Savannakhet-Lao Bao Railway


TimeoutException








 29%|██▊       | 31/108 [42:16<2:08:55, 100.46s/it][A[A[A[A[A[A2019-12-21 13:09:44.100 | INFO     | __main__:search_user_from_query:6 - query = Bangkok-Nong Khai Railway


TimeoutException
parsing uid = JeffMcClung
parsing uid = Air1Lauren
parsing uid = ThisIsJLiv
parsing uid = iamedee
parsing uid = JCraig_Dallas
parsing uid = bjacksondigital
parsing uid = MarriedMornings
parsing uid = air1radio
parsing uid = JeffMcClung
parsing uid = Air1Lauren
parsing uid = ThisIsJLiv
parsing uid = iamedee
parsing uid = JCraig_Dallas
parsing uid = bjacksondigital
parsing uid = MarriedMornings
parsing uid = air1radio








 30%|██▉       | 32/108 [42:34<1:35:47, 75.63s/it] [A[A[A[A[A[A2019-12-21 13:10:01.786 | INFO     | __main__:search_user_from_query:6 - query = Bangkok-Chiang Mai Railway


到头了








 31%|███       | 33/108 [44:18<1:45:12, 84.17s/it][A[A[A[A[A[A2019-12-21 13:11:45.888 | INFO     | __main__:search_user_from_query:6 - query = Kuala Lumpur-Singapore High Speed Rail


TimeoutException








 31%|███▏      | 34/108 [46:02<1:51:14, 90.20s/it][A[A[A[A[A[A2019-12-21 13:13:30.160 | INFO     | __main__:search_user_from_query:6 - query = Jakarta-Bandung Railway


TimeoutException
parsing uid = jakarta52325993
parsing uid = info_DKI
parsing uid = kelkebonwaru
parsing uid = jakartagoid
parsing uid = PemkotBandung
parsing uid = jakarta52325993
parsing uid = info_DKI
parsing uid = kelkebonwaru
parsing uid = jakartagoid
parsing uid = PemkotBandung








 32%|███▏      | 35/108 [46:19<1:23:00, 68.23s/it][A[A[A[A[A[A2019-12-21 13:13:47.109 | INFO     | __main__:search_user_from_query:6 - query = East Coast Railway


到头了
parsing uid = eastcoastrail
parsing uid = das_nirakar1963
parsing uid = EastShramik
parsing uid = Trainhelp1
parsing uid = ECoastRailway
parsing uid = rpfecor1
parsing uid = jpmrail
parsing uid = National_Rail
parsing uid = eastcoastrail
parsing uid = das_nirakar1963
parsing uid = EastShramik
parsing uid = Trainhelp1
parsing uid = ECoastRailway
parsing uid = rpfecor1
parsing uid = jpmrail
parsing uid = National_Rail








 33%|███▎      | 36/108 [46:37<1:03:36, 53.01s/it][A[A[A[A[A[A2019-12-21 13:14:04.606 | INFO     | __main__:search_user_from_query:6 - query = Gemas-Johor Bahru Railway


到头了








 34%|███▍      | 37/108 [48:21<1:20:48, 68.29s/it][A[A[A[A[A[A2019-12-21 13:15:48.568 | INFO     | __main__:search_user_from_query:6 - query = Dawei Port


TimeoutException








 35%|███▌      | 38/108 [50:05<1:32:06, 78.95s/it][A[A[A[A[A[A2019-12-21 13:17:32.381 | INFO     | __main__:search_user_from_query:6 - query = Gujarat Rural Roads (MMGSY) Project


TimeoutException








 36%|███▌      | 39/108 [51:49<1:39:26, 86.47s/it][A[A[A[A[A[A2019-12-21 13:19:16.385 | INFO     | __main__:search_user_from_query:6 - query = Nurek Hydropower Rehabilitation Project


TimeoutException








 37%|███▋      | 40/108 [53:33<1:43:58, 91.74s/it][A[A[A[A[A[A2019-12-21 13:21:00.443 | INFO     | __main__:search_user_from_query:6 - query = Batumi Bypass Road Project


TimeoutException








 38%|███▊      | 41/108 [55:17<1:46:36, 95.47s/it][A[A[A[A[A[A2019-12-21 13:22:44.606 | INFO     | __main__:search_user_from_query:6 - query = Natural Gas Project


TimeoutException
parsing uid = barbarosdemir
parsing uid = PlattsGas
parsing uid = tanapofficial
parsing uid = NESupplyEnhance
parsing uid = Cameron_LNG
parsing uid = naturalgasENRG
parsing uid = AtlSunProject
parsing uid = PPONews
parsing uid = powerimpossible
parsing uid = barbarosdemir
parsing uid = PlattsGas
parsing uid = tanapofficial
parsing uid = NESupplyEnhance
parsing uid = Cameron_LNG
parsing uid = naturalgasENRG
parsing uid = AtlSunProject
parsing uid = PPONews
parsing uid = powerimpossible








 39%|███▉      | 42/108 [55:35<1:19:29, 72.26s/it][A[A[A[A[A[A2019-12-21 13:23:02.713 | INFO     | __main__:search_user_from_query:6 - query = Tarbela 5 Hydropower Extension Project


到头了








 40%|███▉      | 43/108 [57:19<1:28:34, 81.76s/it][A[A[A[A[A[A2019-12-21 13:24:46.634 | INFO     | __main__:search_user_from_query:6 - query = M4 Motorway


TimeoutException
parsing uid = TheM4Motorway
parsing uid = M4motorway
parsing uid = LlangyfelachPri
parsing uid = MadejskiHotel
parsing uid = westconnex
parsing uid = TheM4Motorway
parsing uid = M4motorway
parsing uid = LlangyfelachPri
parsing uid = MadejskiHotel
parsing uid = westconnex








 41%|████      | 44/108 [57:36<1:06:26, 62.29s/it][A[A[A[A[A[A2019-12-21 13:25:03.481 | INFO     | __main__:search_user_from_query:6 - query = Dushanbe-Uzbekistan Border Road Improvement


到头了








 42%|████▏     | 45/108 [59:20<1:18:35, 74.85s/it][A[A[A[A[A[A2019-12-21 13:26:47.647 | INFO     | __main__:search_user_from_query:6 - query = Nenskra Hydropower Plant


TimeoutException








 43%|████▎     | 46/108 [1:01:06<1:26:55, 84.12s/it][A[A[A[A[A[A2019-12-21 13:28:33.389 | INFO     | __main__:search_user_from_query:6 - query = Amaravati Sustainable Capital City


TimeoutException








 44%|████▎     | 47/108 [1:02:49<1:31:34, 90.07s/it][A[A[A[A[A[A2019-12-21 13:30:17.348 | INFO     | __main__:search_user_from_query:6 - query = Madhya Pradesh Rural Connectivity Project


TimeoutException








 44%|████▍     | 48/108 [1:04:34<1:34:20, 94.34s/it][A[A[A[A[A[A2019-12-21 13:32:01.659 | INFO     | __main__:search_user_from_query:6 - query = Mumbai Metro Line 4


TimeoutException








 45%|████▌     | 49/108 [1:06:18<1:35:38, 97.27s/it][A[A[A[A[A[A2019-12-21 13:33:45.748 | INFO     | __main__:search_user_from_query:6 - query = Sahiwal 2x660MW Coal-fired Power Plant


TimeoutException








 46%|████▋     | 50/108 [1:08:02<1:36:01, 99.33s/it][A[A[A[A[A[A2019-12-21 13:35:29.906 | INFO     | __main__:search_user_from_query:6 - query = UEP 100MW Wind Farm


TimeoutException








 47%|████▋     | 51/108 [1:09:46<1:35:39, 100.70s/it][A[A[A[A[A[A2019-12-21 13:37:13.791 | INFO     | __main__:search_user_from_query:6 - query = Sachal 50MW Wind Farm


TimeoutException








 48%|████▊     | 52/108 [1:11:30<1:34:57, 101.74s/it][A[A[A[A[A[A2019-12-21 13:38:57.953 | INFO     | __main__:search_user_from_query:6 - query = Peshawar-Karachi Motorway


TimeoutException








 49%|████▉     | 53/108 [1:13:14<1:33:51, 102.40s/it][A[A[A[A[A[A2019-12-21 13:40:41.894 | INFO     | __main__:search_user_from_query:6 - query = Havelian Dry Port


TimeoutException








 50%|█████     | 54/108 [1:14:58<1:32:32, 102.83s/it][A[A[A[A[A[A2019-12-21 13:42:25.730 | INFO     | __main__:search_user_from_query:6 - query = Gwadar International Airport


TimeoutException








 51%|█████     | 55/108 [1:16:42<1:31:06, 103.14s/it][A[A[A[A[A[A2019-12-21 13:44:09.590 | INFO     | __main__:search_user_from_query:6 - query = Myitsone Dam


TimeoutException








 52%|█████▏    | 56/108 [1:18:26<1:29:36, 103.40s/it][A[A[A[A[A[A2019-12-21 13:45:53.585 | INFO     | __main__:search_user_from_query:6 - query = Balloki Power Plant


TimeoutException








 53%|█████▎    | 57/108 [1:20:10<1:28:01, 103.56s/it][A[A[A[A[A[A2019-12-21 13:47:37.518 | INFO     | __main__:search_user_from_query:6 - query = Gadani Power Project


TimeoutException








 54%|█████▎    | 58/108 [1:21:54<1:26:23, 103.66s/it][A[A[A[A[A[A2019-12-21 13:49:21.427 | INFO     | __main__:search_user_from_query:6 - query = Hakla–Dera Ismail Khan Motorway


TimeoutException








 55%|█████▍    | 59/108 [1:23:38<1:24:45, 103.79s/it][A[A[A[A[A[A2019-12-21 13:51:05.519 | INFO     | __main__:search_user_from_query:6 - query = Khunjerab Railway


TimeoutException








 56%|█████▌    | 60/108 [1:25:22<1:23:03, 103.82s/it][A[A[A[A[A[A2019-12-21 13:52:49.390 | INFO     | __main__:search_user_from_query:6 - query = M5 Motorway


TimeoutException
parsing uid = TrafficNewsM5
parsing uid = HenburyGC
parsing uid = M5TrafficUK
parsing uid = westconnex
parsing uid = BWTiverton
parsing uid = TrafficNewsM5
parsing uid = HenburyGC
parsing uid = M5TrafficUK
parsing uid = westconnex
parsing uid = BWTiverton








 56%|█████▋    | 61/108 [1:25:38<1:00:51, 77.68s/it] [A[A[A[A[A[A2019-12-21 13:53:06.099 | INFO     | __main__:search_user_from_query:6 - query = M8 Motorway


到头了
parsing uid = M8motorway
parsing uid = M8_Scotland
parsing uid = M8motorway
parsing uid = M8_Scotland








 57%|█████▋    | 62/108 [1:25:55<45:29, 59.33s/it]  [A[A[A[A[A[A2019-12-21 13:53:22.602 | INFO     | __main__:search_user_from_query:6 - query = Matiari–Lahore Transmission Line


到头了








 58%|█████▊    | 63/108 [1:27:39<54:34, 72.76s/it][A[A[A[A[A[A2019-12-21 13:55:06.711 | INFO     | __main__:search_user_from_query:6 - query = Orange Line Lahore Metro


TimeoutException








 59%|█████▉    | 64/108 [1:29:23<1:00:14, 82.15s/it][A[A[A[A[A[A2019-12-21 13:56:50.757 | INFO     | __main__:search_user_from_query:6 - query = Pak-China Technical and Vocational Institute


TimeoutException








 60%|██████    | 65/108 [1:31:07<1:03:36, 88.76s/it][A[A[A[A[A[A2019-12-21 13:58:34.950 | INFO     | __main__:search_user_from_query:6 - query = Pakistan Port Qasim Power Project


TimeoutException








 61%|██████    | 66/108 [1:32:51<1:05:20, 93.33s/it][A[A[A[A[A[A2019-12-21 14:00:18.953 | INFO     | __main__:search_user_from_query:6 - query = Quaid-e-Azam Solar Park


TimeoutException


['Doraleh Multi-Purpose Port',
 'Forest City',
 'Melaka Gateway',
 'Gwadar Port',
 'Belgrade-Montenegro Bar Port Motorway',
 'Port City Colombo']

##### 采集用户主页推文

In [198]:
finish_user_list = []
bad_query_list = []

query_user_list = [u for u in user_table.find({"query":query})][:1]# query 不为空的user

MAX_TWEET_SIZE = 50
time_interval = "2019年1月1日" # 默认截止到今日
# search_tweet_from_profile(browser,query_user_list,finish_user_list)