In [1]:
import time
import json
import os
from bs4 import BeautifulSoup
import bs4
from urllib.request import urlopen
from collections import defaultdict
import re
from helper import *

In [2]:
def download_parallel_sents(user_id, journal_id):
    """Download one journal page and return its original/corrected sentence pairs.

    Args:
        user_id: Lang-8 user id, interpolated into the journal URL.
        journal_id: Id of the journal entry, interpolated into the journal URL.

    Returns:
        The value produced by ``get_pair_sents`` (imported from ``helper``) for
        the corrections found on the page. The recorded notebook output shows a
        ``defaultdict(list)`` mapping each original sentence to one or more
        corrected sentences (a 1-N mapping).

    Raises:
        Whatever ``urlopen`` or the ``helper`` parsers raise; callers are
        expected to handle retries.
    """
    print(f"{'*'*20} {journal_id} {'*'*20}")
    url = f"https://lang-8.com/{user_id}/journals/{journal_id}"
    # BeautifulSoup reads the whole response while constructing the tree, so
    # the connection can be closed as soon as parsing is done. Without the
    # ``with`` the socket stays open until garbage collection, which adds up
    # when scraping many journals in a loop.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, "html.parser")
    corrections = get_corrections(soup)
    return get_pair_sents(corrections)

In [3]:
def scrape_metadata(output_file, todo_users, done_users_list, total_count=100,
                    todo_file=None, max_retries=10):
    """Crawl user profiles breadth-wise through friend links and append them to a file.

    For every user popped from ``todo_users`` the friends page and the journals
    page are downloaded, parsed with the ``helper`` functions and appended to
    ``output_file`` as one record::

        <user_id> ID
        key: value          (one line per profile item)
        friends: [...]
        documents: [...]

    Args:
        output_file: Path of the UTF-8 text file records are appended to.
        todo_users: Mutable set of user ids still to scrape. It is consumed
            (``pop``) and extended with newly discovered friends in place.
        done_users_list: Mutable set of user ids already scraped. Successfully
            scraped users are added in place; friends found in it are not
            queued again.
        total_count: Stop once this many users have been scraped successfully.
        todo_file: Path the remaining ``todo_users`` are dumped to (JSON list)
            after each user. When omitted, the notebook-level global
            ``todo_users_file`` is used if it exists; if neither is available
            the frontier is simply not persisted.
        max_retries: Number of attempts per user before the user is skipped.

    Returns:
        None. Results are written to ``output_file`` / ``todo_file``.
    """
    # Backward compatibility: the original body read the notebook global
    # ``todo_users_file`` directly, which raised NameError on a fresh kernel.
    if todo_file is None:
        todo_file = globals().get("todo_users_file")

    user_count = 0
    while todo_users:
        current_user = todo_users.pop()
        print(current_user)

        for attempt in range(max_retries):
            try:
                with urlopen(f'https://lang-8.com/{current_user}/friends') as response:
                    friends_page_soup = BeautifulSoup(response, 'html.parser')
                with urlopen(f'https://lang-8.com/{current_user}/journals') as response:
                    documents_page_soup = BeautifulSoup(response, 'html.parser')
                profile = get_profile(friends_page_soup)
                friends = get_friends(friends_page_soup)
                print("friends: " + str(friends))
                documents = get_documents(documents_page_soup)

                # Build the whole record first and write it in one call, so a
                # failure half-way cannot leave a partial record that the next
                # retry would then duplicate.
                record_lines = ["<user_id> " + current_user + "\n"]
                record_lines.extend(f"{k}: {v}\n" for k, v in profile.items())
                record_lines.append("friends: " + str(friends) + "\n")
                record_lines.append("documents: " + str(documents) + "\n")
                with open(output_file, "a", encoding="utf-8") as fout:
                    fout.write("".join(record_lines))
                break
            except Exception as exc:
                # ``Exception`` (not a bare ``except``) so that Ctrl-C /
                # KeyboardInterrupt still stops the crawl instead of being
                # counted as a failed attempt and retried.
                print(attempt, " fail!", exc)
                time.sleep(1)
        else:
            # Every attempt failed: give up on this user and move on.
            continue

        user_count += 1
        done_users_list.add(current_user)

        # The limit check deliberately stays ahead of the frontier update, as in
        # the original: a ``total_count=1`` proof-of-concept run must not
        # overwrite the todo file of a real crawl.
        if user_count == total_count:
            break

        todo_users.update(friend for friend in friends if friend not in done_users_list)

        if todo_file is not None:
            # Persist the frontier after every user so an interrupted crawl can resume.
            with open(todo_file, "w", encoding="utf-8") as fout:
                json.dump(list(todo_users), fout)

        # Be polite to the server between users.
        time.sleep(1)

In [11]:
def read_user_doc_id_pairs(user_profile, total_count=10):
    """Yield ``(user_id, document_ids)`` pairs parsed from a scraped profile file.

    The file is expected to contain ``<user_id> ID`` lines followed later by a
    ``documents: [...]`` line, as written by ``scrape_metadata``.

    Args:
        user_profile: Path of the UTF-8 profile file to read.
        total_count: Maximum number of pairs to yield.

    Yields:
        Tuples ``(user_id, document_ids)``. ``document_ids`` is the list of
        raw items between the brackets, split on ``", "``; items keep whatever
        quoting they have in the file (e.g. ``"'123'"``). Users whose
        ``documents`` list is empty are not yielded.
    """
    count = 0
    user_id, document_ids = None, None
    # The profile file is written as UTF-8, so read it as UTF-8 instead of
    # relying on the platform default encoding.
    with open(user_profile, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            match_user = re.search(r"<user_id> (.*)", line)
            match_docs = re.search(r"documents: (\[.*\])", line)
            if match_user:
                user_id = match_user.group(1)
            if match_docs:
                raw_ids = match_docs.group(1).strip("[]").split(", ")
                # "documents: []" splits into [''], which is truthy and would
                # be yielded as a single bogus empty document id; drop blanks.
                document_ids = [doc_id for doc_id in raw_ids if doc_id.strip("'\" ")]
            if user_id and document_ids:
                count += 1
                yield user_id, document_ids
                user_id, document_ids = None, None

            if count == total_count:
                break

In [12]:
def scrape_parralel_sents(user_profile, pral_corpus, total_count=100, max_retries=10):
    """Download the parallel sentences of the journals listed in a profile file.

    Args:
        user_profile: Path of the profile file read by ``read_user_doc_id_pairs``.
        pral_corpus: Path of the UTF-8 text file that sentence pairs are
            appended to. Each pair is written as a ``<user_id> ... , <doc_id> ...``
            header line, an ``org:`` line, a ``cor:`` line and a blank line.
        total_count: Limit passed to ``read_user_doc_id_pairs`` (number of
            users) and also the number of successfully scraped journals after
            which this function stops.
        max_retries: Number of attempts per journal before it is skipped.

    Returns:
        None. Results are appended to ``pral_corpus``.
    """
    doc_count = 0
    for user_id, doc_ids in read_user_doc_id_pairs(user_profile, total_count):
        for raw_journal_id in doc_ids:
            # Ids come from the ``str(list)`` form in the profile file and still
            # carry their single quotes.
            journal_id = raw_journal_id.strip("'")
            if not journal_id:
                # An empty "documents: []" entry would otherwise request the
                # journal index page instead of a journal.
                continue

            for attempt in range(max_retries):
                try:
                    sent_pairs = download_parallel_sents(user_id=user_id, journal_id=journal_id)
                    print(user_id, journal_id)
                    print(sent_pairs)
                    # Format everything before touching the file so a failure
                    # cannot leave half a journal that a retry then duplicates.
                    chunks = []
                    for k, v in sent_pairs.items():
                        chunks.append(f"<user_id> {user_id} , <doc_id> {journal_id}\n")
                        chunks.append(f"org: {k}\n")
                        chunks.append(f"cor: {v}\n")
                        chunks.append("\n")
                    # Learner text is not ASCII; do not depend on the platform
                    # default encoding.
                    with open(pral_corpus, 'a', encoding="utf-8") as fout:
                        fout.write("".join(chunks))
                    doc_count += 1
                    break
                except Exception as exc:
                    # ``Exception`` rather than a bare ``except`` so Ctrl-C
                    # interrupts the run instead of being retried.
                    print(attempt, "fail", exc)
                    time.sleep(1)

            if doc_count == total_count:
                # ``return`` rather than ``break``: a ``break`` only leaves the
                # per-user loop, after which the counter overshoots the limit
                # and scraping continues unbounded for the remaining users.
                return

In [24]:
### POC fixtures: one known user and one of that user's journals ###

test_user_id = "191274"
test_doc_id = "271352583976235145865979226599667907075"
# The two links are derived from the ids above so they cannot drift apart.
user_link = f"https://lang-8.com/{test_user_id}/friends"
journal_link = f"https://lang-8.com/{test_user_id}/journals/{test_doc_id}"

In [None]:
### POC: scrape the metadata of a single user into a test file ###

scrape_metadata(
    "./data/test_user_meta.txt",
    {test_user_id},
    done_users_list=set(),
    total_count=1,
)

In [20]:
### POC: scrape the parallel sentences of one document listed in the test profile ###

scrape_parralel_sents(
    user_profile="./data/test_user_meta.txt",
    pral_corpus="./data/test_paral_sents.txt",
    total_count=1,
)

******************** 271352583976235145865979226599667907075 ********************
191274 271352583976235145865979226599667907075
defaultdict(<class 'list'>, {"I don't like to be asked a question what sort of clothes wearing.": ["I don't like to be asked  what sort of clothes I'm wearing."], 'If I had seen people wearing T-shirt dress, they must be so cute and fashonable people.': ['If I had seen people wearing T-shirt dresses, I would think they must be so cute and fashionable people.'], 'I can image that clothes like a pajama for me.': ['I can image that kind of clothing would be like  pajamas for me.'], 'Those are casual and suit in resort places.': ['Those are casual and suit places like resorts.']})


In [21]:
### Run this section to scrape corpus
# Sets up (or resumes) the crawl state: the users already written to the
# profile file and the users still waiting to be scraped.

user_profile = "./data/lang-8-users.txt"
todo_users_file = "./data/todo_users.txt"

# Seed users, used only when no saved todo list exists yet.
todo_users = set(["213725","673707"])
done_users_list = set()

if os.path.exists(user_profile):
    # ``with`` closes the handle; the original left this file open.
    with open(user_profile, encoding='utf-8') as f:
        for line in f:
            matched = re.search(r"<user_id> (.*)", line.strip())
            if matched:
                done_users_list.add(matched.group(1))

if os.path.exists(todo_users_file):
    with open(todo_users_file, encoding="utf-8") as f:
        # The saved list can still name users that were scraped just before the
        # previous run stopped; drop them so they are not appended to the
        # profile file a second time.
        todo_users = set(json.load(f)) - done_users_list

In [22]:
# Full crawl: appends user records to `user_profile`, starting from `todo_users`.
scrape_metadata(
    output_file=user_profile,
    todo_users=todo_users,
    done_users_list=done_users_list,
)

213725
friends: ['253534', '125710']
253534
friends: ['213725', '128630']
673707
friends: ['125710']
125710
friends: ['1630244', '256735', '151424', '280068', '1390758', '1599821', '1578399', '1555051', '358816', '129055', '404653', '5417', '1751722', 'tomomonkey']
1751722
0  fail!


KeyboardInterrupt: 

In [23]:
# Full run: download sentence pairs for the journals listed in `user_profile`.
scrape_parralel_sents(
    user_profile=user_profile,
    pral_corpus="./data/paral_sents.txt",
)

******************** 256361636188698521397028321337896574465 ********************
213725 256361636188698521397028321337896574465
defaultdict(<class 'list'>, {'I bought it, when we got my house built last month.': ['I bought it when we got my house built last month.'], 'It was one of my wish lists, as my hands got chapped from detergent.': ['It was an item on my wish list, as my hands got chapped from detergent.'], 'After using it, I realize that it is much more cleaner than washing by hands, and what’s more, I can reduce the time I spent doing washes.': ['After using it, I realize that it got the dishes much cleaner than washing by hand and what’s more, I can reduce the time I spent doing washes.'], 'My mother told me that she doesn’t need to buy it, because at first, she likes to doing washes.': ['My mother told me that she doesn’t need to buy one, because, first of all, she likes to doing washes.', 'My mother told me that she doesn’t need to buy, because first, she likes to do the di

KeyboardInterrupt: 