In [16]:
import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
from sklearn.cross_validation import train_test_split

# Root directory of the competition data. Defaults to the original local path;
# set the NATIVE_DATA_DIR environment variable to run the notebook elsewhere.
# os.path.join(..., '') guarantees the trailing separator that the
# `data_loc + 'name'` concatenations in this notebook rely on.
data_loc = os.path.join(
    os.environ.get('NATIVE_DATA_DIR', '/Users/rbekbolatov/data/kaggle/native/'), '')

In [17]:
# Number of rows sampled from the labels read from train.csv (see eval_labels.sample(N)).
N = 200

In [18]:
# Build a lookup from directory number (0-4) to the set of filenames listed
# for it in the corresponding 'files_in_<n>' listing.
dirs = {}

for i in range(5):
    listing_path = data_loc + 'files_in/files_in_' + str(i)
    # The listing has no header row; each line is one filename.
    files = pd.read_csv(listing_path, header=None, names=['filename']).assign(dir=i)
    dirs[i] = set(files['filename'])
    
def get_dir(filename):
    """Return the number (0-4) of the directory whose listing contains ``filename``.

    The lookup goes through the module-level ``dirs`` mapping, checking the
    directories in ascending order. Returns -1 when none of them lists the file.
    """
    matches = (dir_id for dir_id in range(5) if filename in dirs[dir_id])
    return next(matches, -1)

# Draw N labelled rows and split them evenly into train/test halves.
# Both random steps are seeded (same seed as generate_subsamples) so that a
# fresh kernel run reproduces the same sample and the same split.
eval_labels = pd.read_csv(data_loc + 'train.csv')
eval_labels_sample = eval_labels.sample(N, random_state=101)

train, test = train_test_split(eval_labels_sample, test_size=0.5, random_state=101)

In [19]:
# Load the full file listing; the CSV has no header row, so the two columns
# are named here.
all_files_path = data_loc + 'all_filenames.csv'
all_files = pd.read_csv(all_files_path, header=None, names=['file', 'size'])

In [20]:
def generate_subsamples(all_files):
    """Draw fixed-seed subsamples of ``all_files`` and save their filenames.

    Three samples of 8000, 3000 and 100 rows are drawn, each with
    ``random_state=101``. The 'file' column of each sample is written to
    'files8k.csv', 'files3k.csv' and 'files100.csv' under
    ``data_loc + 'subsamples/'``.

    Parameters
    ----------
    all_files : pandas.DataFrame
        Frame with a 'file' column; it must have at least 8000 rows, since
        ``DataFrame.sample`` is called without replacement.

    Returns
    -------
    tuple of numpy.ndarray
        ``(files_8k, files_3k, files_100)`` -- the sampled filenames. The
        arrays used to be built and then dropped (the function returned
        ``None``), so callers that ignore the result are unaffected.
    """
    subsamples_loc = data_loc + 'subsamples/'
    plan = ((8000, 'files8k.csv'), (3000, 'files3k.csv'), (100, 'files100.csv'))

    sampled_names = []
    for size, out_name in plan:
        subsample = all_files.sample(size, random_state=101)
        # save files
        subsample['file'].to_csv(subsamples_loc + out_name, index=False)
        # have filenames here
        sampled_names.append(np.asarray(subsample['file']))
    return tuple(sampled_names)

In [59]:
def get_soup(filename):
    """Read one raw HTML file and return it parsed as a BeautifulSoup tree.

    The file is located at ``data_loc + <directory number> + '/' + filename``,
    where the directory number comes from ``get_dir``. ``get_dir`` returns -1
    for unknown files, in which case the resulting path is expected not to
    exist and ``open`` raises.

    The parser is named explicitly ('lxml', the same one ``get_tag_data``
    uses) so the result does not depend on which parsers happen to be
    installed.
    """
    path = data_loc + str(get_dir(filename)) + '/' + filename
    # `with` closes the handle even if reading fails.
    with open(path) as file_handle:
        file_content = file_handle.read()
    return bs(file_content, 'lxml')

def get_paragraphs(soup):
    """Return the cleaned text of every <p> element in ``soup``.

    Each paragraph's text is encoded to ASCII (characters that cannot be
    encoded are dropped), stripped of surrounding whitespace, and every run
    of quote/pipe/newline/tab/punctuation characters matched by the pattern
    below is replaced with a single space.
    """
    separators = re.compile(r'[\'"|\n\t,.:;()\-\/]+')
    cleaned_texts = []
    for paragraph in soup.findAll('p'):
        ascii_text = paragraph.text.encode('ascii', 'ignore').strip()
        cleaned_texts.append(separators.sub(' ', ascii_text))
    return cleaned_texts

def get_title(soup):
    """Return a cleaned page title for ``soup``.

    A missing <title> element yields an empty title. A title longer than
    200 characters is replaced by the placeholder 'long_title'. Otherwise
    the title is encoded to ASCII (unencodable characters dropped), stripped
    of surrounding whitespace, and its newlines are turned into spaces.
    """
    title_tag = soup.find('title')
    raw_title = title_tag.text if title_tag else ''
    if len(raw_title) > 200:
        return 'long_title'
    ascii_title = raw_title.encode('ascii', 'ignore')
    return ascii_title.strip().replace('\n', ' ')

def get_links(soup):
    """Return ``(hrefs, texts)`` for every <a> element in ``soup``.

    ``hrefs`` holds each anchor's ``href`` attribute, or ``None`` when the
    anchor has none. The attribute is read with ``a.get('href')``: dotted
    access (``a.href``) on a BeautifulSoup tag searches for a child tag named
    'href' instead, which gave ``None`` for every link.

    ``texts`` holds each anchor's text, encoded to ASCII (unencodable
    characters dropped), stripped, with every run of the separator
    characters in the pattern below replaced by one space.

    Both lists have one entry per <a> element, in document order, so
    ``len(hrefs)`` is still the number of anchors.
    """
    links = soup.findAll('a')
    hrefs = [a.get('href') for a in links]
    texts = [re.sub(r'[\'"|\n\t,.:;()\-\/]+', ' ', a.text.encode('ascii', 'ignore').strip()) for a in links]
    return hrefs, texts
    
def get_tag_data(files):
    """Extract simple tag features for each filename in ``files``.

    Every file is read from ``data_loc + <directory number> + '/' + filename``
    (directory number from ``get_dir``), parsed with the 'lxml' parser, and
    reduced to one record: filename, cleaned title (``get_title``) and number
    of <a> elements (length of the href list from ``get_links``). Files whose
    content is empty are skipped, so the result can have fewer rows than
    ``files`` has entries.

    Parameters
    ----------
    files : iterable of str
        Filenames to process.

    Returns
    -------
    pandas.DataFrame
        Columns 'file', 'title', 'num_a'; one row per non-empty file.
    """
    records = []
    for filename in files:
        path = data_loc + str(get_dir(filename)) + '/' + filename
        # Read inside the `with`, parse outside: the handle is released
        # before the comparatively slow HTML parsing starts.
        with open(path, 'r') as f:
            file_content = f.read()
        if not file_content:
            continue
        soup = bs(file_content, 'lxml')
        title = get_title(soup)
        link_hrefs, _ = get_links(soup)
        records.append((filename, title, len(link_hrefs)))

    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('Data size: %d' % len(records))
    return pd.DataFrame(records, columns=['file', 'title', 'num_a'])



In [24]:
# Extract tag features for both halves of the split (train first, then test).
train_files = train['file']
test_files = test['file']
data_train = get_tag_data(train_files)
data_test = get_tag_data(test_files)

Data size: 100
Data size: 100


In [25]:
# Attach the extracted features to the labelled rows. No join key is given,
# so pandas joins on the columns the two frames have in common.
data_train = pd.merge(train, data_train)
data_test = pd.merge(test, data_test)

In [26]:
# Show five merged rows (positions 10 through 14).
data_train.iloc[10:15]

Unnamed: 0,file,sponsored,title,num_a
10,2294986_raw_html.txt,0,Penny Arcade - Comic - My Comeuppance,94
11,3797287_raw_html.txt,0,"Comedian Interviews, Funny Articles, Jokes & M...",92
12,3953071_raw_html.txt,0,Rockout with Rocksmith 2014 Edition - I Wanna ...,126
13,2864376_raw_html.txt,0,Easy recipe: Chocolate cream,145
14,2905436_raw_html.txt,0,Creative Compulsive Disorder: Remembering Zina...,96


In [32]:
# Load train.csv and sampleSubmission.csv from the data directory.
eval_labels_path = data_loc + 'train.csv'
lb_labels_path = data_loc + 'sampleSubmission.csv'
eval_labels = pd.read_csv(eval_labels_path)
lb_labels = pd.read_csv(lb_labels_path)

In [39]:
# Longer alternative list of example files, kept for reference:
#filenames = ['1767762_raw_html.txt', '1542621_raw_html.txt', '625398_raw_html.txt', '1554226_raw_html.txt']
filenames = ['625398_raw_html.txt', '1554226_raw_html.txt']

# Parse each example file, keeping (filename, soup) pairs in list order.
soups = []
for filename in filenames:
    soups.append((filename, get_soup(filename)))

In [40]:
# Tag names whose occurrences are collected for every parsed example file.
tags = ['a', 'p', 'div', 'script', 'img', 'ul', 'ol', 'hr', 'b', 'i']

titles = []    # ASCII-encoded, stripped <title> text of each file
tag_data = {}  # filename -> {tag name -> list of matching elements}
for filename, soup in soups:
    titles.append(soup.find('title').text.encode('ascii', 'ignore').strip())
    tag_data[filename] = dict((tag, soup.findAll(tag)) for tag in tags)

In [41]:
# One (filename, {tag name: number of elements}) pair per parsed file.
[(filename, dict((tag, len(file_tags[tag])) for tag in tags))
 for filename, file_tags in tag_data.items()]

[('625398_raw_html.txt',
  {'a': 20,
   'b': 0,
   'div': 40,
   'hr': 0,
   'i': 0,
   'img': 14,
   'ol': 0,
   'p': 40,
   'script': 11,
   'ul': 2}),
 ('1554226_raw_html.txt',
  {'a': 358,
   'b': 5,
   'div': 213,
   'hr': 0,
   'i': 0,
   'img': 209,
   'ol': 0,
   'p': 28,
   'script': 95,
   'ul': 32})]

In [60]:
# Parse one example page and display its cleaned paragraph texts together
# with what get_links returns for it.
soup = get_soup('1542621_raw_html.txt') # '1767762_raw_html.txt')
texts = get_paragraphs(soup)
# get_links returns (hrefs, texts): `ts` receives the hrefs and `rs` the link texts.
ts, rs = get_links(soup)
texts,rs,ts

(['',
  'We are surrounded by a society which seems to enjoy inflicting us with their scary birth stories ',
  '',
  'I have created this part of my website to give you a place to read POSITIVE  encouraging  uplifting birth stories  The majority of the stories are of moms using hypnosis  most of them using Hypnobabies during their births  If you enjoy birth stories  sign up for my newsletter  many will include a positive birth story!',
  '',
  '',
  'Proudly powered by WordPress                                              WordPress Theme Custom Community 2                      developed by ThemeKraft'],
 ['Pregnancy Birth and Babies',
  'Welcome',
  'Big Baby Bull',
  'Hypnosis for Birth',
  'What is Hypnosis for Birth?',
  'Comparison of Hypnobabies and HypnoBirthing',
  'What are my Options?',
  'Hypnosis for Pregnancy',
  'Epidural or Hypnobabies or Both?',
  'Calmer Baby?',
  'Birth Videos',
  'Positive Birth Stories',
  'VBAC Support',
  'Essential Oils',
  'Free Book',
  '',
  '