In [160]:
import re
import time
import datetime
import pymongo
from tqdm import tqdm
import copy
from loguru import logger

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from selenium.common.exceptions import NoSuchElementException,StaleElementReferenceException

class Tweet:
    """Record of one scraped tweet, tagged with the query that surfaced it."""

    def __init__(self, query, uid, ptime, pcontent, padditional, nb_reply, nb_retweet, nb_favorite):
        # Attribute creation order is kept stable: ``__dict__`` of this object
        # is what gets written to the database.
        self.query, self.uid = query, uid
        self.ptime, self.pcontent = ptime, pcontent
        # Extras attached to the tweet: quoted tweet, article link, image, video.
        self.padditional = padditional
        # Engagement counters, stored as retweets, favorites, replies.
        self.nb_retweet, self.nb_favorite, self.nb_reply = (
            nb_retweet, nb_favorite, nb_reply)

    def __repr__(self):
        return "\n".join((
            "Tweet={}".format(self.pcontent),
            "Query={}".format(self.query),
        ))


class User:
    """Twitter account identified by its profile URL."""

    def __init__(self, profile_url):
        self.profile_url = profile_url
        # The account handle is whatever follows the last slash of the URL.
        self.ID = profile_url.rsplit('/', 1)[-1]
        # Filled in later by the scraper; ``query`` is the search query this
        # (influential) account was found for.
        self.name = self.avatar = self.query = self.intro = ''

    def __repr__(self):
        return "User " + "{}".format(self.ID)


def compare_time(time1,time2):
    """Return the difference in whole seconds between two dates (time1 - time2).

    Both arguments are strings formatted as ``'%Y年%m月%d日'`` and are
    interpreted in local time. The result is negative when ``time1`` is the
    earlier date. Raises ValueError when a string does not match the format.
    """
    date_format = '%Y年%m月%d日'
    start_epoch, end_epoch = (
        int(time.mktime(time.strptime(stamp, date_format)))
        for stamp in (time1, time2)
    )
    return start_epoch - end_epoch
    
def convert_time(x, now=None):
    '''
    Normalise a displayed tweet timestamp to a ``YYYY年M月D日`` date string.

    Handled inputs:
      * a full date such as '2018年10月1日' is returned unchanged;
      * a date without a year such as '10月10日' gets the year of ``now``;
      * a relative age such as '5秒', '20分钟', '1小时' or '1天' is subtracted
        from ``now`` so that stamps crossing midnight map to the right day;
      * anything else falls back to the date of ``now``.

    :param x: timestamp text as shown on the page.
    :param now: reference ``datetime.datetime``; defaults to the current
        local time. Exposed mainly so results are reproducible.
    :return: date string formatted as '{year}年{month}月{day}日'.

    for x in ['20分钟','1小时','1天', '10月10日','2018年10月1日']:
        print(convert_time(x))
    '''
    if now is None:
        now = datetime.datetime.now()

    if re.match(r'\d{4}年\d+月\d+日', x):
        return x
    if re.match(r'\d+月\d+日', x):
        return "{}年".format(now.year) + x

    # Relative age: previously these all collapsed to today's date, which is
    # wrong whenever the age reaches back past midnight.
    timedelta_keyword = {'秒': 'seconds', '分钟': 'minutes', '小时': 'hours', '天': 'days'}
    relative = re.match(r'(\d+)\s*(秒|分钟|小时|天)', x)
    if relative:
        amount, unit = relative.groups()
        now = now - datetime.timedelta(**{timedelta_keyword[unit]: int(amount)})
    return "{}年{}月{}日".format(now.year, now.month, now.day)

def is_non_result(browser):
    '''
    Tell whether the current page reports an empty search result.

    Waits (via the module-level ``wait``) for the React root container to be
    present, then checks whether its text contains the "no results match the
    search" banner.

    :param browser: Selenium WebDriver showing the page to inspect.
    :return: True when the banner text is present, False otherwise
        (including when the container cannot be found).
    '''
    result_div_xpath = "//div[@id='react-root']"
    wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
    try:
        # find_element(By.XPATH, ...) instead of find_element_by_xpath: the
        # latter was removed in Selenium 4.3, the former exists in 3.x too.
        result_div = browser.find_element(By.XPATH, result_div_xpath)
    except NoSuchElementException:
        return False
    return '没有符合搜索条件的结果' in result_div.text

def get_search_input_v1(browser):
    """Locate and return the search box element on the current page.

    Waits (via the module-level ``wait``) until an ``<input>`` whose
    placeholder is '搜索 Twitter' is present, then looks it up on ``browser``.

    :param browser: Selenium WebDriver showing the explore page.
    :return: the WebElement of the search input.
    """
    search_input_xpath = "//input[@placeholder='搜索 Twitter']"
    wait.until(EC.presence_of_element_located((By.XPATH, search_input_xpath)))
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; this form is available in Selenium 3 as well.
    return browser.find_element(By.XPATH, search_input_xpath)

# def get_search_input_v2(browser):
#     # 请求主站
#     browser.get('https://twitter.com/search-home')
#     # 定位搜索框
#     search_input_id = 'search-home-input'
#     wait.until(EC.presence_of_element_located((By.ID, search_input_id)))
#     search_input = browser.find_element_by_id(search_input_id)
#     return search_input


# Multipliers for abbreviated counters such as "5K" or "1.2万".
_COUNT_MULTIPLIERS = {
    '': 1,
    'K': 10 ** 3, 'M': 10 ** 6, 'B': 10 ** 9,
    '千': 10 ** 3, '万': 10 ** 4, '亿': 10 ** 8,
}


def _parse_engagement_count(text):
    """Convert one displayed counter ('', '12', '1,234', '1.2万', '5K') to an int."""
    cleaned = text.strip().replace(',', '')
    if cleaned == '':
        return 0
    matched = re.fullmatch(r'(\d+(?:\.\d+)?)\s*([KkMmBb千万亿]?)', cleaned)
    if matched is None:
        # Same exception type int() raised before, so callers keep working.
        raise ValueError('unrecognised counter text: {!r}'.format(text))
    number, suffix = matched.groups()
    if suffix == '' and '.' not in number:
        return int(number)
    return int(round(float(number) * _COUNT_MULTIPLIERS[suffix.upper()]))


def extract_reply_retweet_favorite(element):
    """Read the reply / retweet / favorite counters from a tweet's action bar.

    The first three direct ``<div>`` children of ``element`` are taken as the
    counters, in that order. An empty label counts as 0. Grouped or
    abbreviated labels ('1,234', '1.2万', '5K') are expanded; previously they
    raised ValueError, which made callers fall back to zero for all counters.

    :param element: WebElement wrapping the action bar.
    :return: tuple of ints, one per counter found (at most three).
    :raises ValueError: when a label is not a recognisable number.
    """
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, removed
    # in Selenium 4.3; this form is available in Selenium 3 as well.
    counters = element.find_elements(By.XPATH, './div')[:3]
    return tuple(_parse_engagement_count(counter.text) for counter in counters)


#### query -> 推文爬取

In [143]:
def parse_tweet_result_div(result_div,query):
    """Parse tweet containers from a search page and store them.

    For every element of ``result_div`` the author profile URL, the time
    stamp, the text, any attachments and the engagement counters are read;
    the author is upserted into ``user_table`` and the tweet is inserted into
    ``tweet_table``.

    :param result_div: iterable of WebElements, one per tweet container. Each
        is expected to hold exactly two child ``<div>``s (author column, tweet
        column), and the tweet column at least three (header, text, ...,
        action bar).
    :param query: search query the tweets were found for.
    :return: number of tweets stored.
    :raises ValueError / IndexError: when a container does not have the
        expected layout.
    """
    count = 0
    for div in result_div:
        # find_element(s)(By..., ...) replaces the find_element(s)_by_* helpers,
        # which were removed in Selenium 4.3.
        user_column, tweet_column = div.find_elements(By.XPATH, './div')
        profile_url = user_column.find_element(
            By.TAG_NAME, 'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
        # The tweet column is split into >= 3 layers: header, body part(s), action bar.
        header, *body_sections, action_bar = tweet_column.find_elements(By.XPATH, './div')
        ptime = convert_time(header.find_elements(By.TAG_NAME, 'a')[-1].text)

        try:
            nb_reply, nb_retweet, nb_favorite = extract_reply_retweet_favorite(
                action_bar)
        except Exception as error:
            # Counters are best effort: keep the tweet, but say why the
            # numbers are missing instead of swallowing everything silently.
            logger.warning('could not read counters for {}: {!r}'.format(uid, error))
            nb_reply, nb_retweet, nb_favorite = 0, 0, 0

        pcontent = body_sections[0].text
        padditional = []
        for extra in body_sections[1:]:
            try:
                link = extra.find_element(By.TAG_NAME, 'a').get_attribute('href')
                padditional.append(link)
            except NoSuchElementException:
                padditional.append(extra.text.strip())

        user = User(profile_url)
        tweet = Tweet(query, uid, ptime, pcontent, padditional,
                      nb_reply, nb_retweet, nb_favorite)

        # save to database
        # Only the fields known here go into $set. The blank placeholders are
        # written on insert only; putting them into $set erased name / avatar /
        # query / intro of users that had already been stored.
        user_table.update_one(
            {'_id': user.ID},
            {
                '$set': {'profile_url': user.profile_url, 'ID': user.ID},
                '$setOnInsert': {
                    'name': user.name,
                    'avatar': user.avatar,
                    'query': user.query,
                    'intro': user.intro,
                },
            },
            upsert=True,
        )
        tweet_table.insert_one(tweet.__dict__)
        # Both calls signal failure by raising, so reaching this line means success.
        count += 1
    return count

def crawl_tweet(browser, query, max_scroll_attempts=60, scroll_pause=0.5):
    """Scroll through a tweet search result page and store what is found.

    Repeats "parse the rendered tweets, then scroll to the bottom" until
    ``MAX_TWEET_SIZE`` tweets were counted or scrolling stops producing new
    content.

    Changes compared with the earlier version:
      * the first batch is parsed once (it used to be parsed before the loop
        and again in the first iteration, storing it twice);
      * scrolling gives up after ``max_scroll_attempts`` tries instead of
        spinning forever at the end of the result list;
      * an empty lookup result no longer raises IndexError.

    NOTE(review): every rendered tweet element is handed to the parser on
    each pass, so tweets that stay rendered across a scroll are parsed again.

    :param browser: Selenium WebDriver showing the result page.
    :param query: search query, forwarded to the parser.
    :param max_scroll_attempts: scrolls to try before assuming the end.
    :param scroll_pause: seconds to pause after each scroll.
    :return: None
    """
    result_div_xpath = '//div[@data-testid="tweet"]'
    count = 0
    while count < MAX_TWEET_SIZE:
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath,
        # which was removed in Selenium 4.3.
        result_div = browser.find_elements(By.XPATH, result_div_xpath)
        if not result_div:
            # The list was re-rendered between the wait and the lookup; retry.
            time.sleep(scroll_pause)
            continue
        last_div = result_div[-1]

        # Parse what is currently rendered.
        try:
            count += parse_tweet_result_div(result_div, query)
        except StaleElementReferenceException:
            # The page re-rendered while parsing; give it a moment and retry.
            time.sleep(2)
            continue
        logger.info("{}/{}".format(count, MAX_TWEET_SIZE))
        if count >= MAX_TWEET_SIZE:
            break

        # Next page: scroll until the last rendered tweet changes.
        for _ in range(max_scroll_attempts):
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(scroll_pause)
            result_div = browser.find_elements(By.XPATH, result_div_xpath)
            if result_div and result_div[-1] != last_div:
                break
        else:
            logger.info("no new tweets after scrolling, stopping at {}/{}".format(
                count, MAX_TWEET_SIZE))
            return

def search_tweet_from_query(browser,query_list):
    '''
    Collect tweets for every query in ``query_list``.

    Each query is typed into the search box of the explore page. Queries
    whose result page reports no match are appended to the module-level
    ``bad_query_list``; the others are handed to ``crawl_tweet``. The run
    stops early (after printing 'error') when the browser does not end up on
    the explore page. ``bad_query_list`` is printed once all queries are done.
    '''
    explore_url = 'https://twitter.com/explore'
    for query in tqdm(query_list):
        logger.info('query = {}'.format(query))
        browser.get(explore_url)

        # The search box can only be located on the explore page itself.
        if browser.current_url != explore_url:
            print('error')
            return
        search_input = get_search_input_v1(browser)

        # Submit the query.
        search_input.clear()
        for keys in (query, Keys.ENTER):
            search_input.send_keys(keys)

        # Inspect the outcome.
        if is_non_result(browser):
            bad_query_list.append(query)
        else:
            time.sleep(1)
            crawl_tweet(browser, query)
    print(bad_query_list)
    

#### query -> 爬取相关用户

In [184]:
def search_user_from_query(browser,query_list):
    '''
    Collect the users related to every query in ``query_list``.

    Each query is typed into the search box of the explore page. Queries
    whose result page reports no match are appended to the module-level
    ``bad_query_list``; for the others the user tab of the results
    (current URL + '&f=user') is opened and handed to ``crawl_user``. The run
    stops early (after printing 'error') when the browser does not end up on
    the explore page. ``bad_query_list`` is printed once all queries are done.
    '''
    explore_url = 'https://twitter.com/explore'
    for query in tqdm(query_list):
        logger.info('query = {}'.format(query))
        browser.get(explore_url)

        # The search box can only be located on the explore page itself.
        if browser.current_url != explore_url:
            print('error')
            return
        search_input = get_search_input_v1(browser)

        # Submit the query.
        search_input.clear()
        for keys in (query, Keys.ENTER):
            search_input.send_keys(keys)

        # Inspect the outcome.
        if is_non_result(browser):
            bad_query_list.append(query)
        else:
            time.sleep(1)
            # Switch to the user results page.
            user_results_url = browser.current_url + '&f=user'
            browser.get(user_results_url)
            crawl_user(browser, query)

    print(bad_query_list)

def parse_user_result_div(result_div,query):
    """Parse user cells from a user search page and upsert them.

    For every element of ``result_div`` the profile URL, avatar, display name
    and introduction are read and written to ``user_table`` keyed by the
    account handle.

    :param result_div: iterable of WebElements, one per user cell. Each is
        expected to expose exactly two ``./div/div`` children (avatar side,
        text side).
    :param query: search query the users were found for; stored on the user.
    :return: number of users stored.
    :raises ValueError: when a cell does not have the expected layout.
    """
    count = 0
    for cell in result_div:
        # find_element(s)(By..., ...) replaces the find_element(s)_by_* helpers,
        # which were removed in Selenium 4.3.
        left, right = cell.find_elements(By.XPATH, './div/div')
        profile_url = left.find_element(
            By.TAG_NAME, 'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
        print('parsing uid = {}'.format(uid))
        avatar = left.find_element(By.TAG_NAME, 'img').get_attribute('src')

        # First block carries the name lines; the last remaining block, when
        # there is one, is used as the introduction.
        name_block, *detail_blocks = right.find_elements(By.XPATH, './div')
        uname = name_block.text.split('\n')[0]
        intro = detail_blocks[-1].text.strip() if detail_blocks else ''

        user = User(profile_url)
        user.ID = uid
        user.avatar = avatar
        user.name = uname
        user.intro = intro
        user.query = query

        user_dict = user.__dict__
        user_dict['_id'] = user_dict['ID']

        # save to database
        # update_one signals failure by raising; its result object is always
        # truthy, so the former ``if`` around it never filtered anything.
        user_table.update_one(
            {'_id': user_dict['_id']}, {'$set': user_dict}, upsert=True)
        count += 1
    return count

def crawl_user(browser, query, max_scroll_attempts=60, scroll_pause=0.5):
    """Scroll through a user search result page and store what is found.

    Repeats "parse the rendered user cells, then scroll to the bottom" until
    ``MAX_USER_SIZE`` users were counted or scrolling stops producing new
    content.

    Changes compared with the earlier version:
      * the first batch is parsed once (it used to be parsed before the loop
        and again in the first iteration, counting it twice);
      * scrolling gives up after ``max_scroll_attempts`` tries instead of
        spinning forever at the end of the result list;
      * an empty lookup result no longer raises IndexError.

    NOTE(review): every rendered cell is handed to the parser on each pass,
    so cells that stay rendered across a scroll are parsed and counted again.

    :param browser: Selenium WebDriver showing the user result page.
    :param query: search query, forwarded to the parser.
    :param max_scroll_attempts: scrolls to try before assuming the end.
    :param scroll_pause: seconds to pause after each scroll.
    :return: None
    """
    result_div_xpath = '//div[@data-testid="UserCell"]'
    count = 0
    while count < MAX_USER_SIZE:
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath,
        # which was removed in Selenium 4.3.
        result_div = browser.find_elements(By.XPATH, result_div_xpath)
        if not result_div:
            # The list was re-rendered between the wait and the lookup; retry.
            time.sleep(scroll_pause)
            continue
        last_div = result_div[-1]

        # Parse what is currently rendered.
        try:
            count += parse_user_result_div(result_div, query)
        except StaleElementReferenceException:
            # The page re-rendered while parsing; give it a moment and retry.
            time.sleep(2)
            continue
        logger.info("{}/{}".format(count, MAX_USER_SIZE))
        if count >= MAX_USER_SIZE:
            break

        # Next page: scroll until the last rendered cell changes.
        for _ in range(max_scroll_attempts):
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(scroll_pause)
            result_div = browser.find_elements(By.XPATH, result_div_xpath)
            if result_div and result_div[-1] != last_div:
                break
        else:
            logger.info("no new users after scrolling, stopping at {}/{}".format(
                count, MAX_USER_SIZE))
            return

#### 爬取特定用户的推文（时间间隔内）

In [193]:
def search_tweet_from_profile(browser,query_user_list, query=None):
    """Collect tweets from the profile page of every user in ``query_user_list``.

    Fixes compared with the earlier version:
      * the loop variable was ``user`` while the body read an undefined ``u``;
      * the query was taken from a global instead of from the call or the
        user record;
      * the bad list received the query rather than the profile that had
        nothing to show.

    :param browser: Selenium WebDriver (already logged in).
    :param query_user_list: iterable of user documents; each needs an ``_id``
        holding the account handle and may carry a ``query`` field.
    :param query: query to tag the collected tweets with. When omitted, the
        ``query`` field of each user document is used ('' if absent).
    :return: None. Handles without results are appended to the module-level
        ``bad_query_list``, which is printed at the end.
    """
    for user in tqdm(query_user_list):
        uid = user['_id']
        logger.info('user = {}'.format(uid))
        browser.get('https://twitter.com/' + uid)

        # Inspect the outcome.
        if is_non_result(browser):
            bad_query_list.append(uid)
            continue
        time.sleep(1)
        user_query = query if query is not None else user.get('query', '')
        crawl_tweet2(browser, user_query)
    print(bad_query_list)

def crawl_tweet2(browser, query, max_scroll_attempts=60, scroll_pause=0.5):
    """Scroll through a profile page and store the tweets found on it.

    Repeats "parse the rendered tweets, then scroll to the bottom" until
    ``MAX_TWEET_SIZE`` was reached (``parse_tweet_from_profile`` also reports
    that value when it hits the time cutoff) or scrolling stops producing new
    content.

    Changes compared with the earlier version:
      * the first batch is parsed once (it used to be parsed before the loop
        and again in the first iteration, storing it twice);
      * scrolling gives up after ``max_scroll_attempts`` tries instead of
        spinning forever at the end of the timeline;
      * an empty lookup result no longer raises IndexError.

    NOTE(review): every rendered tweet element is handed to the parser on
    each pass, so tweets that stay rendered across a scroll are parsed again.

    :param browser: Selenium WebDriver showing the profile page.
    :param query: query to tag the tweets with, forwarded to the parser.
    :param max_scroll_attempts: scrolls to try before assuming the end.
    :param scroll_pause: seconds to pause after each scroll.
    :return: None
    """
    result_div_xpath = '//div[@data-testid="tweet"]'
    count = 0
    while count < MAX_TWEET_SIZE:
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath,
        # which was removed in Selenium 4.3.
        result_div = browser.find_elements(By.XPATH, result_div_xpath)
        if not result_div:
            # The list was re-rendered between the wait and the lookup; retry.
            time.sleep(scroll_pause)
            continue
        last_div = result_div[-1]

        # Parse what is currently rendered.
        try:
            count += parse_tweet_from_profile(result_div, query)
        except StaleElementReferenceException:
            # The page re-rendered while parsing; give it a moment and retry.
            time.sleep(2)
            continue
        logger.info("{}/{}".format(count, MAX_TWEET_SIZE))
        if count >= MAX_TWEET_SIZE:
            break

        # Next page: scroll until the last rendered tweet changes.
        for _ in range(max_scroll_attempts):
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(scroll_pause)
            result_div = browser.find_elements(By.XPATH, result_div_xpath)
            if result_div and result_div[-1] != last_div:
                break
        else:
            logger.info("no new tweets after scrolling, stopping at {}/{}".format(
                count, MAX_TWEET_SIZE))
            return
def parse_tweet_from_profile(result_div,query):
    """Parse tweet containers from a profile page and store them.

    Works like the search-page parser, with a time cutoff: once a tweet older
    than the module-level ``time_interval`` is met, parsing stops and
    ``MAX_TWEET_SIZE`` is returned so that the calling crawl loop ends.
    The first ``top`` containers skip the cutoff check, where ``top`` is the
    number of elements matching the pinned-tweet marker on the page
    (read through the module-level ``browser``).

    :param result_div: iterable of WebElements, one per tweet container. Each
        is expected to hold exactly two child ``<div>``s (author column, tweet
        column), and the tweet column at least three (header, text, ...,
        action bar).
    :param query: query to tag the stored tweets with.
    :return: number of tweets stored, or ``MAX_TWEET_SIZE`` when the time
        cutoff was hit.
    :raises ValueError / IndexError: when a container does not have the
        expected layout.
    """
    count = 0
    # Number of pinned tweets: those are exempt from the time cutoff.
    # NOTE(review): the marker is matched through generated CSS class names -
    # confirm it still identifies pinned tweets.
    # find_elements returns an empty list when nothing matches, so the former
    # try/except NoSuchElementException around it could never trigger.
    pinned_marker_xpath = '//div[@class="css-1dbjc4n r-1habvwh r-1iusvr4 r-16y2uox r-5f2r5o"]'
    top = len(browser.find_elements(By.XPATH, pinned_marker_xpath))

    for div in result_div:
        # find_element(s)(By..., ...) replaces the find_element(s)_by_* helpers,
        # which were removed in Selenium 4.3.
        user_column, tweet_column = div.find_elements(By.XPATH, './div')
        profile_url = user_column.find_element(
            By.TAG_NAME, 'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
        # The tweet column is split into >= 3 layers: header, body part(s), action bar.
        header, *body_sections, action_bar = tweet_column.find_elements(By.XPATH, './div')
        ptime = convert_time(header.find_elements(By.TAG_NAME, 'a')[-1].text)

        # Past the pinned tweets, filter by time.
        top -= 1
        if top < 0 and compare_time(ptime, time_interval) < 0:
            print('触发时间截止')
            # Report the size limit as reached so that the outer loop stops.
            return MAX_TWEET_SIZE

        try:
            nb_reply, nb_retweet, nb_favorite = extract_reply_retweet_favorite(
                action_bar)
        except Exception as error:
            # Counters are best effort: keep the tweet, but say why the
            # numbers are missing instead of swallowing everything silently.
            logger.warning('could not read counters for {}: {!r}'.format(uid, error))
            nb_reply, nb_retweet, nb_favorite = 0, 0, 0

        pcontent = body_sections[0].text
        padditional = []
        for extra in body_sections[1:]:
            try:
                link = extra.find_element(By.TAG_NAME, 'a').get_attribute('href')
                padditional.append(link)
            except NoSuchElementException:
                padditional.append(extra.text.strip())

        tweet = Tweet(query, uid, ptime, pcontent, padditional,
                      nb_reply, nb_retweet, nb_favorite)

        # insert_one signals failure by raising; its result object is always
        # truthy, so the former ``if`` around it never filtered anything.
        tweet_table.insert_one(tweet.__dict__)
        count += 1
    return count

#### 启动浏览器并登陆

In [108]:
# Connect to MongoDB and pick the collections the scraper functions read and
# write through the module-level names ``user_table`` and ``tweet_table``.
client = pymongo.MongoClient("mongodb://10.108.17.25:27017/")
twitter_db = client["twitter"]
user_table = twitter_db['user']
tweet_table = twitter_db['tweet']

# Open the browser. ``wait`` is the shared explicit wait (100 second timeout)
# that the scraper functions use as a module-level name.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 100)

# Log in manually in the opened window before running the cells below.
browser.get('https://twitter.com/')

In [109]:
# Reload the page (run after the manual login) and pause for two seconds.
browser.refresh()
time.sleep(2)

##### 采集推文

In [42]:
# Settings for collecting tweets by query.
# ``bad_query_list`` receives the queries whose result page reports no match;
# ``MAX_TWEET_SIZE`` is the count at which the tweet crawl loops stop.
bad_query_list = []
MAX_TWEET_SIZE = 50
query_list = ['the belt and road']
# Uncomment to start the crawl:
# search_tweet_from_query(browser,query_list)

##### 采集用户

In [110]:
# Settings for collecting users by query.
# ``bad_query_list`` is reset here; ``MAX_USER_SIZE`` is the count at which
# the user crawl loop stops.
bad_query_list = []
MAX_USER_SIZE = 10
query_list = ['the belt and road']
# Uncomment to start the crawl:
# search_user_from_query(browser,query_list)

##### 采集用户主页推文

In [198]:
# Settings for collecting tweets from the profile pages of stored users.
bad_query_list = []
query = "the belt and road"
# All users stored for this query are fetched, then only the first is kept.
query_user_list = [u for u in user_table.find({"query":query})][:1]

MAX_TWEET_SIZE = 50
# Cutoff date in '%Y年%m月%d日' form: the profile parser stops at the first
# non-pinned tweet older than this. (Original note: defaults to "up to today".)
time_interval = "2019年1月1日" # cutoff date
# Uncomment to start the crawl:
# search_tweet_from_profile(browser,query_user_list)