In [1]:
import pandas as pd
import numpy as np
import re
import bs4
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [2]:
# Load the example dump; `created_at` is parsed into datetimes while reading.
df = pd.read_csv(
    'https://s3.amazonaws.com/far-right/fourchan/chan_example.csv',
    parse_dates=['created_at'],
)

In [3]:
# Column names of the loaded frame.
df.columns

Index(['parent', 'chan_id', 'created_at', 'com'], dtype='object')

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(10000, 4)

In [5]:
# Report missing values per column before filling them.
print(df.isnull().sum())
df = df.fillna('')  # 204 NA com

# `<wbr>` is a literal tag, so it is matched as plain text.
df['com'] = df['com'].str.replace('<wbr>', '', regex=False)

# `<br/?>` is a regular expression (matches both <br> and <br/>). The `regex`
# flag is spelled out because pandas >= 2.0 defaults to regex=False, which
# would treat the pattern literally and leave every line break unconverted.
df['com'] = df['com'].str.replace(r'<br/?>', '\n', regex=True)

parent          0
chan_id         0
created_at      0
com           204
dtype: int64


In [6]:
# Parse every comment's HTML with the lxml backend (the slow step of the notebook).
df['soup'] = df['com'].map(lambda markup: BeautifulSoup(markup, 'lxml'))  ## time constraint


In [7]:
# Extremely barebones matcher for bare http(s):// and www. links in plain text.
# Compiled once here instead of on every call to `parse`.
_URL_PAT = re.compile(r'((https?://.*?)(\s|$)|(www\..*?)(\s|$))')


def parse(soup):
    """Split one parsed comment into quote links, dead links, quotes, text and URLs.

    Only the direct children of ``soup.body`` are inspected:

    * bare strings and ``<p>`` tags contribute their text to ``text``;
    * ``<span class="quote">`` / ``<span class="deadlink">`` contribute their
      text, with the leading ``>`` markers removed, to ``quotes`` /
      ``deadlinks``;
    * ``<a class="quotelink">`` contributes the part of its ``href`` that
      follows the last ``#p`` (the whole ``href`` when there is no ``#p``)
      to ``quotelinks``;
    * any other ``<a>`` contributes its ``href`` to ``urls`` and its text to
      ``text``;
    * every other tag is ignored.

    Afterwards, bare links found in the collected text are moved out of the
    text and appended to ``urls``.

    Parameters
    ----------
    soup : object
        A BeautifulSoup-like object exposing ``.text`` and ``.body.contents``.

    Returns
    -------
    dict
        Keys ``'quotelinks'``, ``'deadlinks'``, ``'quotes'``, ``'urls'``
        (lists of str) and ``'text'`` (str).
    """
    if not soup.text or soup.body is None:
        return {
            'quotelinks': [],
            'deadlinks': [],
            'quotes': [],
            'text': '',
            'urls': []
        }

    quote_links = []
    dead_links = []
    quotes = []
    text = []
    urls = []

    for tag in soup.body.contents:

        # Plain text nodes are str subclasses and carry no tag attributes.
        if isinstance(tag, str):
            text.append(tag)
            continue

        if tag.name == 'p':
            text.append(tag.text)

        elif tag.name == 'span':
            class_name = tag.get('class', [])
            # lstrip, not strip: only the leading '>' markers are decoration;
            # a trailing '>' belongs to the quoted content.
            if 'quote' in class_name:
                quotes.append(tag.text.lstrip('>'))
            elif 'deadlink' in class_name:
                dead_links.append(tag.text.lstrip('>'))

        elif tag.name == 'a':
            class_name = tag.get('class', [])
            # An anchor without an href yields None; normalise to ''.
            href = tag.get('href') or ''
            if 'quotelink' in class_name:
                # str.strip('#p') removes a *set of characters* from both ends,
                # so an href such as '/thread/123#p456' came back unchanged.
                # Take what follows the last '#p' instead.
                post_id = href.rpartition('#p')[2]
                if post_id:
                    quote_links.append(post_id)
            else:
                if href:
                    urls.append(href)
                text.append(tag.text)

    item = '\n'.join(filter(None, map(str.strip, text)))

    # Groups 2 and 4 of the pattern hold the http(s) and www. link respectively.
    for url_list in _URL_PAT.findall(item):
        for url in map(str.strip, (url_list[1], url_list[3])):
            if url:
                urls.append(url)
                item = item.replace(url, '')

    return {
        'quotelinks': quote_links,
        'deadlinks': dead_links,
        'quotes': quotes,
        # Removing links can leave empty lines behind; collapse them.
        'text': re.sub(r'\n{2,}', '\n', item),
        'urls': urls
    }

In [8]:
# One result dict per comment, stored alongside the soup it came from.
df['parsed'] = df['soup'].map(parse)

In [9]:
# Parse every soup with an explicit loop so that a failing row is reported
# (position and error) instead of aborting the whole pass.
out = []
for i, item in enumerate(df['soup']):
    try:
        parsed = parse(item)
    except Exception as exc:
        print(i, exc)
        continue
    out.append(parsed)

In [10]:
def timer(f, *args, **kwargs):
    """Call ``f(*args, **kwargs)``, print the elapsed seconds, and return the result.

    Parameters
    ----------
    f : callable
        The function to time.
    *args, **kwargs
        Forwarded to ``f`` unchanged.

    Returns
    -------
    Whatever ``f`` returns. If ``f`` raises, the exception propagates and
    nothing is printed.
    """
    # Local import: `time` is not among the notebook's top-level imports.
    import time

    # perf_counter is monotonic and high-resolution; time.time() is a wall
    # clock that can jump when the system clock is adjusted.
    start = time.perf_counter()
    res = f(*args, **kwargs)
    end = time.perf_counter()
    print(end - start)
    return res

In [11]:
def looped():
    """Parse every entry of ``df.soup`` with a plain Python loop.

    A soup whose parsing raises is reported (position and error) on stdout
    and left out of the result.

    Returns
    -------
    list
        The ``parse`` results of the soups that parsed successfully, in order.
    """
    results = []
    for position, soup in enumerate(df.soup):
        try:
            record = parse(soup)
        except Exception as err:
            # Best effort: note the failure and move on to the next soup.
            print(position, err)
            continue
        results.append(record)
    return results

In [12]:
# Time the explicit-loop version; the list it returns is echoed as the cell output.
timer(looped)

0.6434569358825684


[{'deadlinks': [],
  'quotelinks': [],
  'quotes': [],
  'text': 'He is way ahead of his time.',
  'urls': []},
 {'deadlinks': [],
  'quotelinks': [],
  'quotes': [],
  'text': 'Mark is germoney',
  'urls': []},
 {'deadlinks': [],
  'quotelinks': ['114901981'],
  'quotes': [],
  'text': 'Welcome to the nanny state',
  'urls': []},
 {'deadlinks': [],
  'quotelinks': [],
  'quotes': [],
  'text': 'Why not?',
  'urls': []},
 {'deadlinks': [], 'quotelinks': [], 'quotes': [], 'text': 'Bump', 'urls': []},
 {'deadlinks': [],
  'quotelinks': ['114882241'],
  'quotes': [],
  'text': 'I hope someone murders him',
  'urls': []},
 {'deadlinks': [],
  'quotelinks': [],
  'quotes': [],
  'text': 'wait up',
  'urls': []},
 {'deadlinks': [],
  'quotelinks': [],
  'quotes': [],
  'text': 'Zoloft is shit, I took it for a year and it dindu nothin',
  'urls': []},
 {'deadlinks': [],
  'quotelinks': [],
  'quotes': [],
  'text': 'jews allowed too?',
  'urls': []},
 {'deadlinks': [],
  'quotelinks': [],
  '

In [13]:
# Time the same parse run through Series.apply, for comparison with the explicit loop.
timer(df.soup.apply, parse)

0.8260865211486816


0       {'quotelinks': [], 'deadlinks': [], 'quotes': ...
1       {'quotelinks': [], 'deadlinks': [], 'quotes': ...
2       {'quotelinks': ['114901981'], 'deadlinks': [],...
3       {'quotelinks': [], 'deadlinks': [], 'quotes': ...
4       {'quotelinks': [], 'deadlinks': [], 'quotes': ...
5       {'quotelinks': ['114882241'], 'deadlinks': [],...
6       {'quotelinks': [], 'deadlinks': [], 'quotes': ...
7       {'quotelinks': [], 'deadlinks': [], 'quotes': ...
8       {'quotelinks': [], 'deadlinks': [], 'quotes': ...
9       {'quotelinks': [], 'deadlinks': [], 'quotes': ...
10      {'quotelinks': ['114882524'], 'deadlinks': [],...
11      {'quotelinks': ['114882241'], 'deadlinks': [],...
12      {'quotelinks': ['114878658'], 'deadlinks': [],...
13      {'quotelinks': ['114886921'], 'deadlinks': [],...
14      {'quotelinks': [], 'deadlinks': [], 'quotes': ...
15      {'quotelinks': ['114878658'], 'deadlinks': [],...
16      {'quotelinks': [], 'deadlinks': [], 'quotes': ...
17      {'quot

In [14]:
# Shape of the array of distinct `parent` values, i.e. how many there are.
df.parent.unique().shape

(89,)

In [15]:
# Shape of the array of distinct `chan_id` values, i.e. how many there are.
df.chan_id.unique().shape

(10000,)

In [16]:
# Expand the per-row dicts into one column per key. Building the frame once from
# the list of dicts avoids constructing a separate pandas Series for every row,
# which is what `.apply(pd.Series)` does; the original index is kept.
pd.DataFrame(df['parsed'].tolist(), index=df.index)

Unnamed: 0,deadlinks,quotelinks,quotes,text,urls
0,[],[],[],He is way ahead of his time.,[]
1,[],[],[],Mark is germoney,[]
2,[],[114901981],[],Welcome to the nanny state,[]
3,[],[],[],Why not?,[]
4,[],[],[],Bump,[]
5,[],[114882241],[],I hope someone murders him,[]
6,[],[],[],wait up,[]
7,[],[],[],"Zoloft is shit, I took it for a year and it di...",[]
8,[],[],[],jews allowed too?,[]
9,[],[],[],Reminder: Treason is punishable by execution.,[]
