In [9]:
import csv
import os
import re
import time
from getpass import getpass

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [10]:
def replace_chars(string, old, new):
    """Apply a series of substring replacements to ``string``.

    Parameters
    ----------
    string : str
        Text to transform.
    old, new : str or sequence of str
        * Two plain strings are treated as ONE replacement pair, so
          ``replace_chars(text, '\\n', '**')`` turns every newline into ``**``.
          (Indexing the strings character by character, as was done before,
          silently truncated a multi-character replacement to its first
          character.)
        * Otherwise ``old[i]`` is replaced by ``new[i]`` for every index of
          ``old``, in order; each replacement sees the result of the
          previous one.

    Returns
    -------
    str
        The transformed text.

    Raises
    ------
    IndexError
        If sequences are given and ``new`` is shorter than ``old``.
    """
    if isinstance(old, str) and isinstance(new, str):
        # str.replace('', x) would insert x between every character; an empty
        # search string means "nothing to replace".
        if not old:
            return string
        return string.replace(old, new)

    for idx, target in enumerate(old):
        string = string.replace(target, new[idx])
    return string

def remove_non_utf8(text):
    """Return ``text`` with every non-ASCII character removed.

    Despite the function name, the pattern matches any character outside the
    range U+0000..U+007F, so all non-ASCII characters (accented letters,
    emoji, typographic quotes, ...) are dropped, not only invalid UTF-8.
    """
    return re.sub(r'[^\x00-\x7F]+', '', text)

def _optional_text(driver, css_selector):
    """Return the text of the first element matching ``css_selector``, or None.

    None is returned both when no element matches and when the matched
    element has empty text. Any other WebDriver error propagates.
    """
    # Local import keeps this helper self-contained.
    from selenium.common.exceptions import NoSuchElementException

    try:
        text = driver.find_element(By.CSS_SELECTOR, css_selector).text
    except NoSuchElementException:
        return None
    return text or None


def get_poet_details(driver):
    """Scrape the poet profile loaded in ``driver`` and append it to poets.csv.

    The appended row has the columns, in order: user_id, followers,
    following, message_out, message_in, membership, rank, region, gold,
    silver, bronze, hm. Missing rank/region are written as ``None``.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session that has already navigated to the poet's profile.

    Returns
    -------
    int
        The id assigned to this poet (number of rows in poets.csv plus one).

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If a required element does not appear within 10 seconds.
    FileNotFoundError
        If poets.csv does not exist yet (it is read to derive the next id).
    """
    wait = WebDriverWait(driver, 10)

    def wait_for_text(by, selector):
        # Block until the element is present in the DOM, then return its text.
        return wait.until(EC.presence_of_element_located((by, selector))).text

    # Defaults used when the corresponding badge is absent from the page.
    gold, silver, bronze, hm, membership = 0, 0, 0, 0, 'Free'
    pos = 0

    # Next user id = current number of data rows in the CSV + 1.
    user_id = pd.read_csv('poets.csv').shape[0] + 1

    # NOTE(review): `name` is the visible heading text and is interpolated
    # into href selectors below; this assumes the heading equals the
    # username used in URLs -- confirm for names containing spaces.
    name = wait_for_text(By.CSS_SELECTOR, 'h1.notop')

    # Followers / following: link text is "<count> <label>"; strip the label
    # and thousands separators before converting.
    followers_selector = 'followers'
    followers_text = wait_for_text(By.PARTIAL_LINK_TEXT, followers_selector)
    followers = int(replace_chars(followers_text, [followers_selector, ','], ['', '']))

    following_selector = 'following'
    following_text = wait_for_text(By.PARTIAL_LINK_TEXT, following_selector)
    following = int(replace_chars(following_text, [following_selector, ','], ['', '']))

    # Comments written by the poet / received by the poet.
    message_out_selector = 'a[href="/comment/by/{}"]'.format(name)
    message_out_text = wait_for_text(By.CSS_SELECTOR, message_out_selector)
    message_out = int(replace_chars(message_out_text, [','], ['']))

    message_in_selector = 'on me'
    message_in_text = wait_for_text(By.PARTIAL_LINK_TEXT, message_in_selector)
    message_in = int(replace_chars(message_in_text, [message_in_selector, ','], ['', '']))

    # Medal counts are read from the last line of the second `.well`
    # element's text, split on spaces. Presumably they appear in the same
    # order as the medal icons -- verify against the page markup.
    medal_types_won = driver.find_elements(By.CSS_SELECTOR, '.well i.sprite')
    numbers = driver.execute_script("return document.getElementsByClassName('well')[1].innerText;")
    numbers = [x for x in numbers.split('\n')[-1].split(' ') if x != ""]

    for medal in medal_types_won:
        classes = medal.get_attribute("class")

        if 'tr-gold' in classes:
            gold = int(replace_chars(numbers[pos], [','], ['']))
            pos += 1

        if 'tr-silver' in classes:
            silver = int(replace_chars(numbers[pos], [','], ['']))
            pos += 1

        if 'tr-bronze' in classes:
            bronze = int(replace_chars(numbers[pos], [','], ['']))
            pos += 1

        if 'tr-honorable' in classes:
            hm = int(replace_chars(numbers[pos], [','], ['']))
            pos += 1

        # Membership tier is encoded in the icon class, e.g. "sprite leaf-<tier>".
        if "leaf" in classes:
            membership = replace_chars(classes, ['sprite ', 'leaf-'], ['', ''])

    # Optional profile fields: absent (or empty) elements yield None.
    region = _optional_text(driver, 'span[itemprop="region"]')
    rank = _optional_text(driver, 'a[href="/home/level/{}"]'.format(name))

    values = [user_id, followers, following, message_out, message_in,
              membership, rank, region, gold, silver, bronze, hm]

    # Rows are appended as "\n" + row (the file's existing layout). csv.writer
    # quotes fields containing commas or quotes, so e.g. a region such as
    # "City, Country" no longer splits into two columns. UTF-8 matches the
    # default encoding pandas uses when reading the file back.
    with open("poets.csv", "a", encoding="utf-8") as file:
        file.write("\n")
        csv.writer(file, lineterminator="").writerow([str(x) for x in values])

    return user_id

In [11]:
def get_poems(driver, link, user_id):
    """Scrape one poem page and append the poem and its comments to CSV.

    Appends one row to poems.csv (poem_id, user_id, title, poem, likes) and
    one row per retained comment to comments.csv (comment_id, poem_id,
    comment). In all text fields non-ASCII characters are dropped and commas
    become ``*``; in the poem body each newline becomes ``**``.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used to load ``link``.
    link : str
        URL of the poem page; the text after the last ``-by-`` is taken as
        the poet's username.
    user_id : int
        Id of the poet, stored alongside the poem.

    Raises
    ------
    IndexError
        If the page has fewer than two ``.poem_body div`` elements.
    selenium.common.exceptions.NoSuchElementException
        If the title or like counter is missing.
    """
    driver.get(link)

    user = link.split('-by-')[-1]

    # Next ids = current number of data rows in each CSV + 1.
    poem_id = pd.read_csv('poems.csv').shape[0] + 1
    comment_id = pd.read_csv('comments.csv').shape[0] + 1

    # Poem title: ASCII only, commas replaced with *.
    title = driver.find_element(By.CSS_SELECTOR, 'a.nocolor.fn').text
    title = replace_chars(remove_non_utf8(title), [','], ['*'])

    # Poem body: the second matching div holds the text (presumably the first
    # is a wrapper -- verify against the page markup).
    poem = driver.find_elements(By.CSS_SELECTOR, ".poem_body div")[1].text
    # List arguments make the newline marker the full two-character "**";
    # passing plain strings would index '**'[0] and write a single '*'.
    poem = replace_chars(remove_non_utf8(poem), [',', '\n'], ['*', '**'])

    # Like count; drop thousands separators so the value stays one CSV field.
    likes = driver.find_element(By.CSS_SELECTOR, "a.like_above .num").text
    likes = replace_chars(likes, [','], [''])

    comment_elems = driver.find_elements(By.CSS_SELECTOR, ".media-body div")

    # Rows are appended as "\n" + row (the files' existing layout). csv.writer
    # quotes fields that contain quotes, so text starting with '"' can no
    # longer corrupt the row when the file is read back.
    with open("poems.csv", "a") as file:
        file.write("\n")
        csv.writer(file, lineterminator="").writerow(
            [str(x) for x in (poem_id, user_id, title, poem, likes)]
        )

    with open("comments.csv", "a") as file:
        comment_writer = csv.writer(file, lineterminator="")
        for elem in comment_elems:
            # Drop the last line of the block and flatten the rest to one line.
            text = " ".join(elem.text.split('\n')[:-1])

            # Skip empty blocks and any block mentioning the poet's username.
            if user in text or len(text) <= 1:
                continue

            # Discard everything up to the first '-' (remaining '-' become
            # spaces), then clean like the other text fields. No newline
            # handling is needed: newlines were removed by the join above.
            text = " ".join(text.split('-')[1:])
            text = replace_chars(remove_non_utf8(text), [','], ['*'])

            file.write("\n")
            comment_writer.writerow([str(comment_id), str(poem_id), text])

            comment_id += 1

In [12]:
def scroll_and_get_poems(driver, user_id, max_poems=20, stall_timeout=30):
    """Collect poem links from the loaded profile page and scrape each poem.

    The page is scrolled one viewport at a time while poem links
    (``a.nocolor.fn``) are gathered. Collection stops when the number of
    links reaches the poet's poem count or ``max_poems``, or when no new link
    has appeared for ``stall_timeout`` seconds. Every collected link is then
    passed to ``get_poems``.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session already showing the poet's profile.
    user_id : int
        Id of the poet, forwarded to ``get_poems``.
    max_poems : int, optional
        Stop collecting once at least this many links are found (default 20).
        Slightly more may be collected if one pass over the page finds
        several at once.
    stall_timeout : float, optional
        Seconds without a new link after which collection gives up, so a page
        that stops loading can no longer cause an endless loop (default 30).
    """
    poem_links = []

    viewport_height = driver.execute_script("return window.innerHeight")
    scroll_target = 0

    # The active tab label ends in "(<count>)"; tolerate thousands separators.
    total = driver.find_element(By.CSS_SELECTOR, 'div.big_nav.toggle_tabs a.btn--link.get.active').text
    poem_count = int(total.split('(')[1][:-1].replace(',', ''))

    # implicitly_wait configures a driver-wide lookup timeout; it does not
    # pause by itself, so it is set once rather than on every iteration.
    driver.implicitly_wait(10)

    last_progress = time.monotonic()
    while True:
        found_new = False
        for tile in driver.find_elements(By.CSS_SELECTOR, 'a.nocolor.fn'):
            link = tile.get_attribute('href')
            # Anchors without an href yield None; skip them.
            if link and link not in poem_links:
                poem_links.append(link)
                found_new = True

        if found_new:
            last_progress = time.monotonic()
            # Occasional progress output, only when the count changed.
            if len(poem_links) % 10 == 1:
                print(len(poem_links))

        if len(poem_links) >= poem_count or len(poem_links) >= max_poems:
            break

        if time.monotonic() - last_progress > stall_timeout:
            print("no new poem links for {} s; continuing with {} of {}".format(
                stall_timeout, len(poem_links), poem_count))
            break

        # Scroll one viewport further and give lazily loaded tiles a moment
        # to appear before the next pass.
        scroll_target += viewport_height
        driver.execute_script("window.scrollTo(0, {});".format(scroll_target))
        time.sleep(0.5)

    print("****************** link collection done **********")

    for link in poem_links:
        get_poems(driver, link, user_id)

    print(user_id, "  ****************** poems collection done **********")

In [13]:
def scrap(username, password, page_no, stop_page,
          start_url='https://allpoetry.com/favorite/followers/kevin'):
    """Log in and scrape every poet on listing pages ``page_no`` .. ``stop_page - 1``.

    For each poet on a listing page the profile is opened, its details are
    stored via ``get_poet_details`` and its poems via
    ``scroll_and_get_poems``. A failure for one poet is printed and the next
    poet is processed.

    Parameters
    ----------
    username, password : str
        Login credentials typed into the login form.
    page_no : int
        1-based listing page to start from; reached by clicking "next"
        ``page_no - 1`` times.
    stop_page : int
        Exclusive upper bound: scraping stops before this page.
    start_url : str, optional
        Listing page to open first (defaults to the original followers list).
    """
    driver = webdriver.Chrome()
    try:
        driver.get(start_url)
        wait = WebDriverWait(driver, 10)

        # Log in with username and password.
        driver.find_element(By.ID, 'user_name').send_keys(username)
        driver.find_element(By.ID, 'user_password').send_keys(password)
        driver.find_element(By.CSS_SELECTOR, '.media-body input.btn').send_keys(Keys.RETURN)

        # Fast-forward to the requested start page.
        for _ in range(page_no - 1):
            wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'next_page'))).click()

        listing_window = driver.current_window_handle

        while page_no < stop_page:
            wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'lines')))
            poets = driver.find_elements(By.CSS_SELECTOR, '.lines .itm')

            # Profile URL slugs: poet names with spaces replaced by underscores.
            poet_ids = [
                poet.find_element(By.TAG_NAME, 'a').text.replace(' ', '_')
                for poet in poets
            ]

            # Visit poets in a separate tab so the listing page (and its
            # "next" button) is still loaded when pagination continues.
            driver.switch_to.new_window('tab')
            try:
                for poet_id in poet_ids:
                    try:
                        driver.get('https://allpoetry.com/{}'.format(poet_id))
                        user_id = get_poet_details(driver)
                        scroll_and_get_poems(driver, user_id)
                    except Exception as e:
                        # Best effort: report the failure and move on.
                        print(e)
            finally:
                driver.close()
                driver.switch_to.window(listing_window)

            print(page_no, ' Done ************************************')
            page_no = page_no + 1

            # Only advance when another page will actually be scraped.
            if page_no < stop_page:
                wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'next_page'))).click()
    finally:
        # Always release the browser, even after an error or interrupt.
        driver.quit()

In [14]:
# Credentials are not stored in the notebook: they are read from the
# ALLPOETRY_USERNAME / ALLPOETRY_PASSWORD environment variables, falling back
# to an interactive prompt (getpass hides the password while it is typed).
# NOTE(review): the password previously committed here should be rotated.
scrap(
    os.environ.get('ALLPOETRY_USERNAME') or input('AllPoetry username: '),
    os.environ.get('ALLPOETRY_PASSWORD') or getpass('AllPoetry password: '),
    1,
    79,
)

21
****************** link collection done **********
Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=120.0.6099.225)
Stacktrace:
	GetHandleVerifier [0x00007FF602122142+3514994]
	(No symbol) [0x00007FF601D40CE2]
	(No symbol) [0x00007FF601BE76AA]
	(No symbol) [0x00007FF601BDFB39]
	(No symbol) [0x00007FF601BD25DD]
	(No symbol) [0x00007FF601BD16FD]
	(No symbol) [0x00007FF601BD0746]
	(No symbol) [0x00007FF601BD06EB]
	(No symbol) [0x00007FF601BCEE3D]
	(No symbol) [0x00007FF601BCF603]
	(No symbol) [0x00007FF601BEA0FD]
	(No symbol) [0x00007FF601C72E01]
	(No symbol) [0x00007FF601C55FEA]
	(No symbol) [0x00007FF601C728F6]
	(No symbol) [0x00007FF601C55D93]
	(No symbol) [0x00007FF601C24BDC]
	(No symbol) [0x00007FF601C25C64]
	GetHandleVerifier [0x00007FF60214E16B+3695259]
	GetHandleVerifier [0x00007FF6021A6737+4057191]
	GetHandleVerifier [0x00007FF60219E4E3+4023827]
	GetHandleVerifier [0x00007FF601E704F9+689705]
	(No symbol) [0x00007FF601D4C048]
	(No symbol) [0x00007F

Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=120.0.6099.225)
Stacktrace:
	GetHandleVerifier [0x00007FF602122142+3514994]
	(No symbol) [0x00007FF601D40CE2]
	(No symbol) [0x00007FF601BE76AA]
	(No symbol) [0x00007FF601BDFB39]
	(No symbol) [0x00007FF601BD25DD]
	(No symbol) [0x00007FF601BD16FD]
	(No symbol) [0x00007FF601BD0746]
	(No symbol) [0x00007FF601BD06EB]
	(No symbol) [0x00007FF601BCEE3D]
	(No symbol) [0x00007FF601BCF603]
	(No symbol) [0x00007FF601BEA0FD]
	(No symbol) [0x00007FF601C72E01]
	(No symbol) [0x00007FF601C55FEA]
	(No symbol) [0x00007FF601C728F6]
	(No symbol) [0x00007FF601C55D93]
	(No symbol) [0x00007FF601C24BDC]
	(No symbol) [0x00007FF601C25C64]
	GetHandleVerifier [0x00007FF60214E16B+3695259]
	GetHandleVerifier [0x00007FF6021A6737+4057191]
	GetHandleVerifier [0x00007FF60219E4E3+4023827]
	GetHandleVerifier [0x00007FF601E704F9+689705]
	(No symbol) [0x00007FF601D4C048]
	(No symbol) [0x00007FF601D48044]
	(No symbol) [0x00007FF601D481C9]
	(No sym

Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=120.0.6099.225)
Stacktrace:
	GetHandleVerifier [0x00007FF602122142+3514994]
	(No symbol) [0x00007FF601D40CE2]
	(No symbol) [0x00007FF601BE76AA]
	(No symbol) [0x00007FF601BDFB39]
	(No symbol) [0x00007FF601BD25DD]
	(No symbol) [0x00007FF601BD16FD]
	(No symbol) [0x00007FF601BD0746]
	(No symbol) [0x00007FF601BD06EB]
	(No symbol) [0x00007FF601BCEE3D]
	(No symbol) [0x00007FF601BCF603]
	(No symbol) [0x00007FF601BEA0FD]
	(No symbol) [0x00007FF601C72E01]
	(No symbol) [0x00007FF601C55FEA]
	(No symbol) [0x00007FF601C728F6]
	(No symbol) [0x00007FF601C55D93]
	(No symbol) [0x00007FF601C24BDC]
	(No symbol) [0x00007FF601C25C64]
	GetHandleVerifier [0x00007FF60214E16B+3695259]
	GetHandleVerifier [0x00007FF6021A6737+4057191]
	GetHandleVerifier [0x00007FF60219E4E3+4023827]
	GetHandleVerifier [0x00007FF601E704F9+689705]
	(No symbol) [0x00007FF601D4C048]
	(No symbol) [0x00007FF601D48044]
	(No symbol) [0x00007FF601D481C9]
	(No sym

Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=120.0.6099.225)
Stacktrace:
	GetHandleVerifier [0x00007FF602122142+3514994]
	(No symbol) [0x00007FF601D40CE2]
	(No symbol) [0x00007FF601BE76AA]
	(No symbol) [0x00007FF601BDFB39]
	(No symbol) [0x00007FF601BD25DD]
	(No symbol) [0x00007FF601BD16FD]
	(No symbol) [0x00007FF601BD0746]
	(No symbol) [0x00007FF601BD06EB]
	(No symbol) [0x00007FF601BCEE3D]
	(No symbol) [0x00007FF601BCF603]
	(No symbol) [0x00007FF601BEA0FD]
	(No symbol) [0x00007FF601C72E01]
	(No symbol) [0x00007FF601C55FEA]
	(No symbol) [0x00007FF601C728F6]
	(No symbol) [0x00007FF601C55D93]
	(No symbol) [0x00007FF601C24BDC]
	(No symbol) [0x00007FF601C25C64]
	GetHandleVerifier [0x00007FF60214E16B+3695259]
	GetHandleVerifier [0x00007FF6021A6737+4057191]
	GetHandleVerifier [0x00007FF60219E4E3+4023827]
	GetHandleVerifier [0x00007FF601E704F9+689705]
	(No symbol) [0x00007FF601D4C048]
	(No symbol) [0x00007FF601D48044]
	(No symbol) [0x00007FF601D481C9]
	(No sym

Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=120.0.6099.225)
Stacktrace:
	GetHandleVerifier [0x00007FF602122142+3514994]
	(No symbol) [0x00007FF601D40CE2]
	(No symbol) [0x00007FF601BE76AA]
	(No symbol) [0x00007FF601BDFB39]
	(No symbol) [0x00007FF601BD25DD]
	(No symbol) [0x00007FF601BD16FD]
	(No symbol) [0x00007FF601BD0746]
	(No symbol) [0x00007FF601BD06EB]
	(No symbol) [0x00007FF601BCEE3D]
	(No symbol) [0x00007FF601BCF603]
	(No symbol) [0x00007FF601BEA0FD]
	(No symbol) [0x00007FF601C72E01]
	(No symbol) [0x00007FF601C55FEA]
	(No symbol) [0x00007FF601C728F6]
	(No symbol) [0x00007FF601C55D93]
	(No symbol) [0x00007FF601C24BDC]
	(No symbol) [0x00007FF601C25C64]
	GetHandleVerifier [0x00007FF60214E16B+3695259]
	GetHandleVerifier [0x00007FF6021A6737+4057191]
	GetHandleVerifier [0x00007FF60219E4E3+4023827]
	GetHandleVerifier [0x00007FF601E704F9+689705]
	(No symbol) [0x00007FF601D4C048]
	(No symbol) [0x00007FF601D48044]
	(No symbol) [0x00007FF601D481C9]
	(No sym

Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=120.0.6099.225)
Stacktrace:
	GetHandleVerifier [0x00007FF602122142+3514994]
	(No symbol) [0x00007FF601D40CE2]
	(No symbol) [0x00007FF601BE76AA]
	(No symbol) [0x00007FF601BDFB39]
	(No symbol) [0x00007FF601BD25DD]
	(No symbol) [0x00007FF601BD16FD]
	(No symbol) [0x00007FF601BD0746]
	(No symbol) [0x00007FF601BD06EB]
	(No symbol) [0x00007FF601BCEE3D]
	(No symbol) [0x00007FF601BCF603]
	(No symbol) [0x00007FF601BEA0FD]
	(No symbol) [0x00007FF601C72E01]
	(No symbol) [0x00007FF601C55FEA]
	(No symbol) [0x00007FF601C728F6]
	(No symbol) [0x00007FF601C55D93]
	(No symbol) [0x00007FF601C24BDC]
	(No symbol) [0x00007FF601C25C64]
	GetHandleVerifier [0x00007FF60214E16B+3695259]
	GetHandleVerifier [0x00007FF6021A6737+4057191]
	GetHandleVerifier [0x00007FF60219E4E3+4023827]
	GetHandleVerifier [0x00007FF601E704F9+689705]
	(No symbol) [0x00007FF601D4C048]
	(No symbol) [0x00007FF601D48044]
	(No symbol) [0x00007FF601D481C9]
	(No sym

Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=120.0.6099.225)
Stacktrace:
	GetHandleVerifier [0x00007FF602122142+3514994]
	(No symbol) [0x00007FF601D40CE2]
	(No symbol) [0x00007FF601BE76AA]
	(No symbol) [0x00007FF601BDFB39]
	(No symbol) [0x00007FF601BD25DD]
	(No symbol) [0x00007FF601BD16FD]
	(No symbol) [0x00007FF601BD0746]
	(No symbol) [0x00007FF601BD06EB]
	(No symbol) [0x00007FF601BCEE3D]
	(No symbol) [0x00007FF601BCF603]
	(No symbol) [0x00007FF601BEA0FD]
	(No symbol) [0x00007FF601C72E01]
	(No symbol) [0x00007FF601C55FEA]
	(No symbol) [0x00007FF601C728F6]
	(No symbol) [0x00007FF601C55D93]
	(No symbol) [0x00007FF601C24BDC]
	(No symbol) [0x00007FF601C25C64]
	GetHandleVerifier [0x00007FF60214E16B+3695259]
	GetHandleVerifier [0x00007FF6021A6737+4057191]
	GetHandleVerifier [0x00007FF60219E4E3+4023827]
	GetHandleVerifier [0x00007FF601E704F9+689705]
	(No symbol) [0x00007FF601D4C048]
	(No symbol) [0x00007FF601D48044]
	(No symbol) [0x00007FF601D481C9]
	(No sym

TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF602122142+3514994]
	(No symbol) [0x00007FF601D40CE2]
	(No symbol) [0x00007FF601BE76AA]
	(No symbol) [0x00007FF601C31860]
	(No symbol) [0x00007FF601C3197C]
	(No symbol) [0x00007FF601C74EE7]
	(No symbol) [0x00007FF601C5602F]
	(No symbol) [0x00007FF601C728F6]
	(No symbol) [0x00007FF601C55D93]
	(No symbol) [0x00007FF601C24BDC]
	(No symbol) [0x00007FF601C25C64]
	GetHandleVerifier [0x00007FF60214E16B+3695259]
	GetHandleVerifier [0x00007FF6021A6737+4057191]
	GetHandleVerifier [0x00007FF60219E4E3+4023827]
	GetHandleVerifier [0x00007FF601E704F9+689705]
	(No symbol) [0x00007FF601D4C048]
	(No symbol) [0x00007FF601D48044]
	(No symbol) [0x00007FF601D481C9]
	(No symbol) [0x00007FF601D388C4]
	BaseThreadInitThunk [0x00007FFA5D3B7344+20]
	RtlUserThreadStart [0x00007FFA5DB626B1+33]


In [None]:
# pd.read_csv('poets.csv')