In [1]:
import time
import json
import os
from bs4 import BeautifulSoup
import bs4
from urllib.request import urlopen
from collections import defaultdict
import re
from helper import *
import pandas as pd
import csv
import traceback

In [17]:
# Target corpus size in words: scraping stops once the running word count reaches this value.
TOTAL_WORDS = 1010000
# Text file holding the running word count, re-read at start-up so a run can resume.
LOGGING_FILE = "./data/logging.txt"
# Metadata of the most recently scraped user (rewritten for every user); it is read back
# to discover that user's journal ids.
CURRENT_USER_FILE = "./data/current_user.txt"

# CSV of user profile rows, one appended row per scraped user.
user_profile = "./data/lang-8-users.csv"
# JSON list of user ids that are still waiting to be scraped.
todo_users_file = "./data/todo_users.txt"
# CSV of (user_id, journal_id, original, corrected) sentence rows.
corpus_file = "./data/paral_sents.csv"

In [2]:
def download_parallel_sents(user_id, journal_id):
    """Download one journal page and extract its sentence/correction pairs.

    Args:
        user_id (str): A string represents user id
        journal_id (str): A string represents journal id
    Returns:
        dictionary: The result of ``get_pair_sents`` for the page's corrections.
            Callers treat it as original sentence -> list of corrections (1-N mapping).
    Raises:
        Whatever ``urlopen`` raises on network/HTTP failure; callers retry on it.
    """
    print(f"{'*'*20} {journal_id} {'*'*20}")
    url = f"https://lang-8.com/{user_id}/journals/{journal_id}"
    # Close the HTTP response deterministically instead of leaving the socket to
    # the garbage collector; BeautifulSoup reads the whole body while it is open.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, "html.parser")
    corrections = get_corrections(soup)
    return get_pair_sents(corrections)

In [3]:
def read_user_doc_id_pairs(user_profile, total_count=10):
    """Read the user_id and doc_ids pairs from user_profile. Specify total_count to limit the counts.

    The file is expected to contain ``<user_id> ...`` lines and ``documents: [...]``
    lines; a pair is emitted as soon as both have been seen, after which both are reset.

    Args:
        user_profile (str): A filepath to the user profile
        total_count(int): Maximum number of <user_id, [journal_ids]> pairs to yield
    Returns:
        A generator yielding ``(user_id, document_ids)``. ``document_ids`` is the raw
        split of the bracketed list, so each id still carries its surrounding quotes.
    """
    count = 0
    user_id, document_ids = None, None
    # The profile file is written as UTF-8, so read it back as UTF-8 rather than
    # with the platform's default encoding.
    with open(user_profile, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            match_user = re.search(r"<user_id> (.*)", line)
            if match_user:
                user_id = match_user.group(1)

            match_docs = re.search(r"documents: (\[.*\])", line)
            if match_docs:
                document_ids = match_docs.group(1).strip("[]").split(", ")

            if user_id and document_ids:
                count += 1
                yield user_id, document_ids
                user_id, document_ids = None, None

            # Checked on every line, so the generator stops right after the last allowed pair.
            if count == total_count:
                break

In [4]:
def scrape_metadata(output_file, todo_users, done_users_list, total_count=5, mode="a"):
    """Scrape users from todo_users and write their metadata to output_file.

    For every user popped from ``todo_users`` the friends and journals pages are
    fetched (up to 10 attempts). On success one CSV row is written to
    ``output_file``, ``CURRENT_USER_FILE`` is rewritten with the full metadata,
    the user's friends are queued, and ``todo_users_file`` is refreshed.
    Users that fail all attempts are dropped from the queue without being marked done.

    Args:
        output_file (str): Filepath to store user metadata (CSV rows, no header)
        todo_users (set): User ids still to scrape; mutated in place (pop/add)
        done_users_list (set): User ids already scraped; mutated in place (add)
        total_count (int): Number of users to scrape successfully before stopping
        mode (str): The mode to open output_file
    Returns:
        None (You should expect an output file created in your defined path)
    """
    max_attempts = 10
    headers = ['Sex', 'Occupation', 'L points', 'ID', 'Nation and region', 'Location', 'Age']
    user_count = 0

    while todo_users:
        current_user = todo_users.pop()
        print(current_user)

        for attempt in range(max_attempts):
            try:
                # collect metadata of users; responses are closed as soon as they are parsed
                with urlopen(f'https://lang-8.com/{current_user}/friends') as response:
                    friends_page_soup = BeautifulSoup(response, 'html.parser')
                with urlopen(f'https://lang-8.com/{current_user}/journals') as response:
                    documents_page_soup = BeautifulSoup(response, 'html.parser')
                profile = get_profile(friends_page_soup)
                friends = get_friends(friends_page_soup)
                print("friends: " + str(friends))
                documents = get_documents(documents_page_soup)

                # write to output_file; newline="" is what the csv module requires so
                # that it controls line endings itself (avoids blank rows on Windows)
                with open(output_file, mode=mode, encoding="utf-8", newline="") as fout:
                    writer = csv.DictWriter(fout, headers)
                    writer.writerow({k: profile[k] for k in headers if k in profile})

                # keep track of current_user
                with open(CURRENT_USER_FILE, mode="w", encoding="utf-8") as fout:
                    fout.write("<user_id> " + current_user + "\n")
                    for k, v in profile.items():
                        fout.write(f"{k}: {v}\n")
                    fout.write("friends: " + str(friends) + "\n")
                    fout.write("documents: " + str(documents) + "\n")

                user_count += 1
                break

            except Exception:
                # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the run.
                print(attempt, " fail!")
                traceback.print_exc()
                time.sleep(1)
        else:
            # every attempt failed: give up on this user and move on to the next one
            continue

        # Mark the user done before queueing friends so a user who appears in
        # their own friends list is not put back on the todo list.
        done_users_list.add(current_user)

        # update the todo users list
        for friend in friends:
            if friend not in done_users_list:
                todo_users.add(friend)

        with open(todo_users_file, "w", encoding="utf-8") as fout:
            json.dump(list(todo_users), fout)

        # if reaches max number of users to scrape, stop
        if user_count == total_count:
            break

        time.sleep(1)

In [5]:
def scrape_parralel_sents(user_profile, pral_corpus, total_count=100, word_count=0):
    """Scrape parallel sentences for the users listed in user_profile.

    Each journal is downloaded with up to 10 attempts. Every (original, corrected)
    pair in which both sides pass the character filter is appended to
    ``pral_corpus`` as a CSV row, and the word count of the original sentence is
    added to the running total once per written row.

    Args:
        user_profile (str): A filepath of users profile (``<user_id>`` / ``documents:`` lines)
        pral_corpus (str): A filepath to append the parallel sentences to
        total_count (int): Number of journals to scrape; also passed to
            ``read_user_doc_id_pairs`` as its limit on user/journal-list pairs
        word_count (int): Word count accumulated so far
    Returns:
        int: Cumulative word count since scraping
    """
    max_attempts = 10
    headers = ["user_id", "journal_id", "original", "corrected"]
    # NOTE: inside the class, " -_" is a character RANGE from space to underscore,
    # so it also admits most ASCII punctuation (comma, apostrophe, ...). The pattern
    # is kept exactly as it was; tightening it would change which sentences are kept.
    allowed_chars = re.compile(r"^[a-zA-Z0-9. -_?]*$")

    doc_count = 0
    for user_id, doc_ids in read_user_doc_id_pairs(user_profile, total_count):
        for journal_id in doc_ids:
            # ids arrive still wrapped in quotes; an empty id means there is nothing to fetch
            journal_id = journal_id.strip("'")
            fail_count = 0
            success = False

            while journal_id and not success and fail_count < max_attempts:
                try:
                    # get parallel sents from user_id and journal_id pair
                    sent_pairs = download_parallel_sents(user_id=user_id, journal_id=journal_id)
                    print(f"Now scraping: {user_id}, {journal_id}")

                    # newline="" lets the csv module control line endings itself
                    with open(pral_corpus, 'a', encoding="utf-8", newline="") as fout:
                        writer = csv.DictWriter(fout, headers)
                        for original, corrections in sent_pairs.items():

                            # check if the sentence only contains the allowed characters
                            if allowed_chars.match(original):
                                for corrected in corrections:
                                    if allowed_chars.match(corrected):
                                        writer.writerow({
                                            "user_id": user_id,
                                            "journal_id": journal_id,
                                            "original": original,
                                            "corrected": corrected,
                                        })
                                        word_count += len(original.split())
                                        print(f"org: {original}")
                                        print(word_count)

                            # if reached the total number of tokens, stop
                            if word_count >= TOTAL_WORDS:
                                return word_count
                    doc_count += 1
                    success = True

                except Exception:
                    # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the run.
                    print(fail_count, "fail")
                    traceback.print_exc()
                    fail_count += 1
                    time.sleep(1)

            # if reached max number of journal counts, stop
            if doc_count == total_count:
                return word_count

            time.sleep(1)

    return word_count

### POC

In [66]:
### Sample user/journal ids and links used by the POC cells below ###

# test_user_id = "191274"
# test_doc_id = "271352583976235145865979226599667907075"
# user_link = "https://lang-8.com/191274/friends"
# journal_link = "https://lang-8.com/191274/journals/271352583976235145865979226599667907075"

In [67]:
### Run this section to check POC, scrape one user profile ###

# scrape_metadata("./data/test_user_meta.txt", set([test_user_id]), done_users_list=set(), total_count=1)

In [68]:
### Run this section to check POC, scrape one document with parallel sents ###

# scrape_parralel_sents("./data/test_user_meta.txt", "./data/test_paral_sents.txt", total_count=1)

### Scraping whole corpus

In [10]:
# Run this part to scrape whole corpus

word_count = 0

todo_users = set(["213725","673707"])
done_users_list = set()

# Check how many words have been scraped
if os.path.exists(LOGGING_FILE):
    with open(LOGGING_FILE, encoding="utf-8") as f:
        word_count = int(f.read().split(":")[-1].strip())
print(f"cumulative word count: {word_count}")

# Load the done users list
if os.path.exists(user_profile):
    with open(user_profile, encoding='utf-8', newline="") as f:
        profile_lines = f.read().splitlines()

    # Lines of the form "<user_id> ..." (the check this cell has always made).
    for line in profile_lines:
        matched = re.search(r"<user_id> (.*)", line.strip())
        if matched:
            done_users_list.add(matched.group(1))

    # scrape_metadata writes user_profile as header-less CSV rows whose 4th field is
    # 'ID', so the "<user_id>" search alone never finds those users.
    # NOTE(review): assumes the profile's 'ID' value equals the user id used in URLs — confirm.
    ID_COLUMN = 3
    for row in csv.reader(profile_lines):
        if len(row) > ID_COLUMN and row[ID_COLUMN].strip():
            done_users_list.add(row[ID_COLUMN].strip())
print(f"Done users list: {done_users_list}")

# load the todo users list
if os.path.exists(todo_users_file):
    with open(todo_users_file, encoding="utf-8") as f:
        todo_users = set(json.load(f))
print(f"todo users list: {todo_users}")

# Scrape the whole corpus with TOTAL_WORDS
while word_count < TOTAL_WORDS:

    # With nobody left to scrape, scrape_metadata would return immediately and the
    # journals in CURRENT_USER_FILE would be re-scraped (and re-appended) forever.
    if not todo_users:
        print("No users left to scrape; stopping before TOTAL_WORDS was reached.")
        break

    scrape_metadata(user_profile, todo_users, done_users_list, total_count=1, mode="a+")
    count = scrape_parralel_sents(CURRENT_USER_FILE, corpus_file, word_count=word_count)

    word_count = count
    print(f"Current count is: {word_count}")

    # Keep track of total words count
    with open(LOGGING_FILE, 'w') as fout:
        fout.write(f"Number of words collected: {word_count}")

### Data Preprocessing

In [20]:
# Both scraper outputs are header-less CSV files, so column names are supplied here.
corpus_columns = ["user_id", "doc_id", "original", "corrected"]
user_columns = ["sex", "occupation", "lpoints", "user_id", "nation_region", "location", "age"]

parl_sents_df = pd.read_csv(corpus_file, header=None, names=corpus_columns)
user_df = pd.read_csv(user_profile, header=None, names=user_columns)

# Report (rows, columns) of each frame as a quick sanity check.
print("Parallel sents: ", parl_sents_df.shape)
print("User metadata: ", user_df.shape)

Parallel sents:  (77442, 4)
User metadata:  (3236, 7)


In [21]:
# Preview the first rows of the parallel-sentence frame.
parl_sents_df.head()

Unnamed: 0,user_id,doc_id,original,corrected
0,673707,132180776574210182836253618202132330403,Sun room,Sunroom
1,673707,132180776574210182836253618202132330403,I've always wanted a sun room for a long time.,I've always wanted a sunroom for a long time.
2,673707,132180776574210182836253618202132330403,It's because it can dry laundry even if it rai...,It's because I can dry the laundry there even ...
3,673707,132180776574210182836253618202132330403,"This time, our city will subsidize 10% of reno...","This time, our city will subsidize 10% of reno..."
4,673707,132180776574210182836253618202132330403,"I'm going to make a big purchase, so I want to...","I'm going to make a big purchase, so I want to..."


In [22]:
# Preview the first rows of the user-metadata frame.
user_df.head()

Unnamed: 0,sex,occupation,lpoints,user_id,nation_region,location,age
0,Female,Housewife/ Househusband,32780,673707.0,Japan,Japan,39.0
1,,,85440,876909.0,,,
2,,,62920,579569.0,,,
3,,,250,1698991.0,,,
4,,,615110,1406282.0,,,


In [23]:
# drop NANs
# Keep only rows in which both sides of the sentence pair are present.
pair_columns = ["original", "corrected"]
parl_sents_df = parl_sents_df.dropna(axis="index", how="any", subset=pair_columns)
print("Parallel sents: ", parl_sents_df.shape)

Parallel sents:  (76866, 4)


In [24]:
# De-duplicate on the (original, corrected) pair, then drop short sentences.
# The length threshold is remove_n_less_sents' own default (the earlier note on
# this cell says sentences shorter than 4 tokens are removed).
parl_sents_df = remove_n_less_sents(
    remove_duplicates(parl_sents_df, ["original", "corrected"])
)
print("Parallel sents: ", parl_sents_df.shape)

Parallel sents:  (75122, 4)


In [25]:
# Corpus statistics, measured on the original (uncorrected) sentences:
# whitespace-separated token total plus distinct user and journal counts.
tokens_per_sentence = parl_sents_df["original"].str.split().str.len()
total_tokens = sum(tokens_per_sentence)
# dropna=False counts a missing id as one distinct value, matching len(unique()).
total_users = parl_sents_df["user_id"].nunique(dropna=False)
total_docs = parl_sents_df["doc_id"].nunique(dropna=False)

print(f"Corpus size: {total_tokens}")
print(f"Number of users: {total_users}")
print(f"Number of jorunals: {total_docs}")

Corpus size: 988986
Number of users: 1519
Number of jorunals: 10565


In [27]:
# Persist the cleaned corpus. No index=... argument is passed, so pandas' default
# applies and the row index is written as the first, unnamed column.
parl_sents_df.to_csv("./data/clean_paral_sents.csv")