In [None]:
import pickle
from collections import namedtuple
from tqdm import tqdm_notebook as tqdm

import pandas as pd
from pandas.parser import CParserError
import spacy
from spacy_cld import LanguageDetector


In [None]:
# Load the spaCy model named 'en_core_web_sm' as the shared `nlp` pipeline object.
nlp = spacy.load('en_core_web_sm')
# Create the spacy_cld LanguageDetector and add it to the pipeline. This is presumably
# what populates doc._.languages, which the processing loop reads — confirm against spacy_cld.
# NOTE(review): passing a component instance to add_pipe looks like the spaCy v2 API —
# confirm the pinned spaCy version.
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

In [None]:
# Column layout shared by the CSV readers and the chunk writers in this notebook.
# "language" is the column added by the detection step, so the raw input file is
# read with every name except the last one.
output_columns = [
    "id",
    "hash1",
    "hash2",
    "user",
    "user_pic",
    "channel_url",
    "channel_id",
    "comment",
    "depth",
    "timestamp",
    "language",
]

# Output CSV that accumulates the processed chunks (input columns plus "language").
comments_path = "data/comments_right_language.csv"


def write_to_disk(chunk):
    """Append one chunk to the output CSV at ``comments_path``.

    Args:
        chunk: DataFrame to append. It is written without the index and without a
            header row, so repeated calls produce one continuous CSV.
    """
    # newline="" leaves line endings to the CSV writer. pandas requires this for file
    # objects; without it text mode turns "\r\n" into "\r\r\n" on Windows.
    with open(comments_path, "a", encoding="utf-8", newline="") as file:
        chunk.to_csv(file, index=False, header=False)

def write_to_error(chunk):
    """Append one chunk to the error CSV ``data/comments_language_broken.csv``.

    Args:
        chunk: DataFrame to append. It is written without the index and without a
            header row.
    """
    # newline="" leaves line endings to the CSV writer. pandas requires this for file
    # objects; without it text mode turns "\r\n" into "\r\r\n" on Windows.
    with open("data/comments_language_broken.csv", "a", encoding="utf-8", newline="") as file:
        chunk.to_csv(file, index=False, header=False)

# Rows per chunk. Both readers must share this value: the resume step advances
# comments_reader by one chunk for every chunk completed_reader yields.
CHUNK_SIZE = 500

# Lazy data reader into DataFrame. The raw input has no "language" column yet,
# hence every column name except the last.
comments_reader = pd.read_csv("data/comments_right.csv", chunksize=CHUNK_SIZE, names=output_columns[:-1])
# Reader to pick up where we ended. It reads the same file write_to_disk appends to.
try:
    completed_reader = pd.read_csv(comments_path, chunksize=CHUNK_SIZE, names=output_columns)
except FileNotFoundError:
    # First run: no output has been written yet, so there are no chunks to skip.
    completed_reader = iter(())
#fixed_reader = pd.read_csv("data/comments_language_fix.csv", chunksize=500, names=output_columns)
#broken_reader = pd.read_csv("data/comments_language_broken.csv", chunksize=500, names=output_columns)

In [None]:
# Fast-forward comments_reader to where the previous run stopped: consume one input
# chunk for every chunk that completed_reader finds in the output file.
already_done = tqdm(completed_reader)
for _done_chunk in already_done:
    next(comments_reader)

In [None]:
# Does the actual work: detects the language of every comment in each remaining chunk
# and appends the chunk, with a new "language" column, to the output file.
for comments in tqdm(comments_reader):
    languages = []
    # Select the text by column label. A positional lookup such as row[7] on a
    # label-indexed row is deprecated in recent pandas.
    for text in comments["comment"]:
        # str() also turns missing values (NaN) into text so nlp() always gets a string.
        doc = nlp(str(text))
        try:
            language = doc._.languages[0]
        except IndexError:
            # The detector returned no languages for this comment; mark it as unknown.
            language = "?"
        languages.append(language)
    # "language" becomes the last column, matching the order in output_columns.
    comments["language"] = languages
    write_to_disk(comments)

In [None]:
# Iterates over the completed file looking for a chunk that fails to parse, and writes
# the matching chunk of the raw input to the error file.
# Both readers are one-shot iterators. Re-create them (re-run the reader cell) before
# running this cell if an earlier cell has already consumed them.
chunk_ix = 0  # index of the chunk being read; equals the number of chunks that parsed cleanly
try:
    for _ in tqdm(completed_reader):
        # Keep the raw reader in step with the completed reader.
        next(comments_reader)
        chunk_ix += 1
except CParserError:
    # chunk_ix is bound even when the very first chunk fails, and it names the failing
    # chunk rather than the last good one.
    print("Error at {}".format(chunk_ix))
    # If the completed file failed to parse, comments_reader is now positioned on the
    # raw chunk that corresponds to the broken one.
    write_to_error(next(comments_reader))