In [1]:
import time
import json
import os
from bs4 import BeautifulSoup
import bs4
from urllib.request import urlopen
from collections import defaultdict
import re
from helper import *
import pandas as pd
import csv

In [2]:
def download_parallel_sents(user_id, journal_id): 
    """Get the original sentences with their corrections, it might be 1-N mapping."""
    print(f"{'*'*20} {journal_id} {'*'*20}")
    url = f"https://lang-8.com/{user_id}/journals/{journal_id}"
    soup = BeautifulSoup(urlopen(url), "html.parser")
    corrections = get_corrections(soup)
    return get_pair_sents(corrections)

In [3]:
def scrape_metadata(output_file, todo_users, done_users_list, total_count=5, mode="a"):
    """Scrape users from todo_users list and write data to output_file."""
    user_count = 0
    while todo_users:
        current_user = todo_users.pop()
        success = False
        count = 0
        result = {}
        print(current_user)
        while not success and count < 10:
            try:
                friends_page_soup = BeautifulSoup(urlopen(f'https://lang-8.com/{current_user}/friends'), 'html.parser')
                documents_page_soup = BeautifulSoup(urlopen(f'https://lang-8.com/{current_user}/journals'), 'html.parser')
                profile = get_profile(friends_page_soup)
                friends = get_friends(friends_page_soup)
                print("friends: " + str(friends))
                documents = get_documents(documents_page_soup)
                fout = open(output_file, mode=mode, encoding="utf-8")
                fout.write("<user_id> " + current_user + "\n")
                for k, v in profile.items():
                    fout.write(f"{k}: {v}\n")
                fout.write("friends: " + str(friends) + "\n")
                fout.write("documents: " + str(documents) + "\n")
                fout.close()
                user_count += 1
                success = True
            except:
                print(count, " fail!")
                count += 1
                time.sleep(1)

        if count == 10:
            continue

        for friend in friends:
            if friend not in done_users_list:
                todo_users.add(friend)
        
        fout = open(todo_users_file, "w")
        json.dump(list(todo_users),fout)
        fout.close()
        
        done_users_list.add(current_user)
        
        if user_count == total_count:
            break
            
        time.sleep(1)

In [10]:
def read_user_doc_id_pairs(user_profile, total_count=10):
    """Read the user_id and doc_ids pairs from user_profile. Specify total_count to limit the counts."""
    count = 0
    user_id, document_ids = None, None
    with open(user_profile) as f:
        for line in f:
            line = line.strip()
            match_user = re.search(r"<user_id> (.*)", line)
            match_docs = re.search(r"documents: (\[.*\])", line)
            if match_user:
                user_id = match_user.group(1)
            if match_docs:
                document_ids = match_docs.group(1).strip("[]").split(", ")
            if user_id and document_ids:
                count += 1
                yield user_id, document_ids
                user_id, document_ids = None, None
                
            if count == total_count:
                break

In [11]:
def scrape_parralel_sents(user_profile, pral_corpus, total_count=100, word_count=0):
    """Scrape parallel sentences from user_profile. Specify total_count to limit counts."""
    doc_count = 0
    for user_id, doc_ids in read_user_doc_id_pairs(user_profile, total_count):
        for journal_id in doc_ids:
            fail_count = 0
            success = False
            while not success and fail_count < 10:
                try:
                    journal_id = journal_id.strip("'")
                    sent_pairs = download_parallel_sents(user_id=user_id, journal_id=journal_id)
                    print(user_id, journal_id)
                    with open(pral_corpus, 'a', encoding="utf-8") as fout:
#                         pdb.set_trace()
                        headers = ["user_id", "journal_id", "original", "corrected"]
                        writer = csv.DictWriter(fout, headers)
                        for k, v in sent_pairs.items():
                            
                            if re.match(r"^[a-zA-Z0-9. -_?]*$", k):
                                for i in range(len(v)):
                                    if re.match(r"^[a-zA-Z0-9. -_?]*$", v[i]):
                                        row = {"user_id": user_id, "journal_id": journal_id, "original": k, "corrected": v[i]}
                                        writer.writerow(row)
                                
#                                 fout.write(f"<user_id> {user_id} , <doc_id> {journal_id}\n")
#                                 fout.write(f"org: {k}\n")
#                                 fout.write(f"cor: {v}\n")
#                                 fout.write("\n")
                                        word_count += len(k.split())
                                        print(f"org: {k}")
                                        print(word_count)
                            if word_count >= TOTAL_WORDS:
                                return word_count
                    doc_count += 1
                    success = True
                except:
                    print(fail_count, "fail")
                    fail_count += 1
                    time.sleep(1)

                    if fail_count == 10:
                        continue
                        
            if doc_count == total_count:
                return word_count
            
            time.sleep(1)
            
    return word_count

In [14]:
### Run this section to check POC, scrape one user profile ###

# test_user_id = "191274"
test_user_id = "125710"
test_doc_id = "271352583976235145865979226599667907075"
user_link = "https://lang-8.com/191274/friends"
journal_link = "https://lang-8.com/191274/journals/271352583976235145865979226599667907075"

In [15]:
### Run this section to check POC, scrape one user profile ###
scrape_metadata("./data/test_user_meta_2.txt", set([test_user_id]), done_users_list=set(), total_count=1)

125710
https://lang-8.com/125710/friends?page=2
https://lang-8.com/125710/friends?page=3
https://lang-8.com/125710/friends?page=4
https://lang-8.com/125710/friends?page=5
friends: ['431202', '1630244', '90486', '1544569', '256735', '151424', '544617', '1733255', '280068', '622783', '253112', '1390758', '234347', '1599821', '743539', '913273', '1496765', '1578399', '1555051', '358816', '129055', '404653', '5417', '1751722', 'tomomonkey', '782455', '372565', '1208440', '39238', '905740', '1314036', '194676', '918996', '95475', '13518', '221098', '1239012', '273857', '141752', '292576', '780518', '448093', '1680889', '249798', '490762', '1306939', '637526', '1461619', '1161137', '1794286', '1406336', '1628972', '967670', '146438', '212759', '222217', '413541', '1368310', '1648501', '347501', '589401', '243220', '1024028', '1467321', '1300664', 'earo', '44217', '357535', '807008', '1355657', '197578', '1150844', '1513436', '363628', '145308', '1608723', '1487010', '1509073', '594169', '178

KeyboardInterrupt: 

In [16]:
### Run this section to check POC, scrape one document with parallel sents ###

scrape_parralel_sents("./data/test_user_meta.txt", "./data/test_paral_sents.txt", total_count=1)

FileNotFoundError: [Errno 2] No such file or directory: './data/test_user_meta.txt'

In [None]:
word_count = 0
TOTAL_WORDS = 100000
LOGGING_FILE = "./data/logging.txt"
user_profile = "./data/lang-8-users.txt"
todo_users_file = "./data/todo_users.txt"
corpus_file = "./data/paral_sents.csv"

todo_users = set(["213725","673707"])
done_users_list = set()

if os.path.exists(LOGGING_FILE):
    f = open(LOGGING_FILE)
    word_count = int(f.read().split(":")[-1].strip())
    f.close()
print(f"cumulative word count: {word_count}")

if os.path.exists(user_profile):
    f = open(user_profile, encoding='utf-8')
    for line in f:
        matched = re.search(r"<user_id> (.*)", line.strip())
        if matched:
            done_users_list.add(matched.group(1))
    f.close()
print(f"Done users list: {done_users_list}")
            
if os.path.exists(todo_users_file):
    f = open(todo_users_file,encoding="utf-8")
    todo_users = set(json.load(f))
    f.close()
print(f"todo users list: {todo_users}")
    
while word_count < TOTAL_WORDS:
    scrape_metadata(user_profile, todo_users, done_users_list, total_count=1, mode="w")
    count = scrape_parralel_sents(user_profile, corpus_file, word_count=word_count)
    word_count += count
    with open(LOGGING_FILE, 'w') as fout:
        fout.write(f"Number of words collected: {word_count}")

cumulative word count: 0
Done users list: set()
todo users list: {'213725', '673707'}
213725
friends: ['253534', '125710', 'phong', '4']
******************** 256361636188698521397028321337896574465 ********************
213725 256361636188698521397028321337896574465
org: I bought it, when we got my house built last month.
11
org: It was one of my wish lists, as my hands got chapped from detergent.
25
org: I came across an article the other day and they say doing washes is of good importance to remove your stresses.
46
org: But for me, doing washes is just an unpleasant job to give me a lot of band-aid.
63
******************** 66790772351198343205338104156873537443 ********************
213725 66790772351198343205338104156873537443
org: I went to see my old friends from college .
73
org: It ended up taking more than 2hours to get there.
83
org: It ended up taking more than 2hours to get there.
93
org: After we both got married and had own family, we gradually stop keeping touch than befor