# Things to do to improve performance.

The first submission didn't score as well as the model did on the train set, which is not surprising!

Let's try to:

1. Extract more of the text from the HTML; the body and title alone don't seem to be enough.
2. Try over- and undersampling and see whether either strategy improves performance.

In [1]:
import numpy as np
import pandas as pd
import os
from bs4 import BeautifulSoup
import nltk
from nltk import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
import tldextract


# Download the NLTK 'punkt' and 'stopwords' resources at import time.
nltk.download('punkt')
nltk.download('stopwords')
# English stop words, held in a set so nltkPipe can test membership cheaply.
stop_words = set(stopwords.words('english'))

# Directory prefix that every CSV file name below is appended to.
data_dir = "../data/2018-08-10_AV_Innoplexus/"

# A single shared stemmer: building a new one for every document is wasted work.
_STEMMER = EnglishStemmer()


def nltkPipe(soup_text):
    """Normalise raw text into a ', '-separated string of stemmed tokens.

    The text is split with ``wordpunct_tokenize`` and lower-cased. Tokens that
    are not purely alphabetic, are a single character long, or are English
    stop words are dropped; the survivors are stemmed.

    Args:
        soup_text: Plain text, e.g. the result of BeautifulSoup's ``get_text``.

    Returns:
        The stems joined with ', ' ('' when nothing survives the filter).
    """
    lowered = (token.lower() for token in wordpunct_tokenize(soup_text))
    stems = [
        _STEMMER.stem(token)
        for token in lowered
        if token.isalpha() and len(token) > 1 and token not in stop_words
    ]
    return ', '.join(stems)

def getTitleTokens(soup):
    """Return the stemmed tokens of the page's <title>, or '' if it has none.

    Args:
        soup: Parsed BeautifulSoup document.
    """
    soup_title = soup.title
    # Identity test: bs4 objects overload ==/!=, so `is None` is the safe check.
    if soup_title is None:
        return ''
    return nltkPipe(soup_title.get_text())
    
def getBodyTokens(soup):
    """Return the stemmed tokens of the page's plain <p> paragraphs.

    Paragraphs that contain a <span> or an <a> element are skipped.

    Args:
        soup: Parsed BeautifulSoup document.

    Returns:
        ', '-separated stems ('' when no paragraph qualifies).
    """
    # find_all always returns a (possibly empty) list, never None, so no
    # None guard is needed: an empty list simply yields ''.
    plain_paragraphs = [
        para.get_text()
        for para in soup.find_all('p')
        if para.span is None and para.a is None
    ]
    return nltkPipe(' '.join(plain_paragraphs))

def getDomainTokens(domainstr):
    """Return the stemmed tokens of a host name's registered domain and suffix.

    Args:
        domainstr: Host name or URL understood by ``tldextract.extract``.
    """
    parts = tldextract.extract(domainstr)
    # Domain and suffix are comma-joined so the tokenizer keeps them apart.
    return nltkPipe(','.join((parts.domain, parts.suffix)))

def getUrlTokens(url):
    """Return the stemmed tokens of everything after the first '/' in ``url``.

    Args:
        url: URL string.

    Returns:
        ', '-separated stems, or '' when ``url`` contains no '/'.
    """
    _, slash, remainder = url.partition('/')
    if not slash:
        return ''
    # Turning the remaining slashes into spaces matches splitting on '/' and
    # re-joining every piece but the first with spaces.
    return nltkPipe(remainder.replace('/', ' '))

def getDescriptionTokens(soup):
    """Return the stemmed tokens of the text inside every <dl> element.

    Args:
        soup: Parsed BeautifulSoup document.

    Returns:
        ', '-separated stems ('' when the page has no <dl>).
    """
    # find_all returns an empty list, not None, when nothing matches, so the
    # empty case falls out naturally as ''.
    return nltkPipe(' '.join(dl.get_text() for dl in soup.find_all('dl')))

def getHeaderTokens(soup):
    """Return the stemmed tokens of the text inside every <header> element.

    Args:
        soup: Parsed BeautifulSoup document.

    Returns:
        ', '-separated stems ('' when the page has no <header>).
    """
    # find_all returns an empty list, not None, when nothing matches, so the
    # empty case falls out naturally as ''.
    return nltkPipe(' '.join(hdr.get_text() for hdr in soup.find_all('header')))
    
def getHeadTokens(soup):
    """Return the stemmed tokens of the text inside every <head> element.

    Args:
        soup: Parsed BeautifulSoup document.

    Returns:
        ', '-separated stems ('' when the page has no <head>).
    """
    # find_all returns an empty list, not None, when nothing matches, so the
    # empty case falls out naturally as ''.
    return nltkPipe(' '.join(head.get_text() for head in soup.find_all('head')))
    
def getFontTokens(soup):
    """Return the stemmed tokens of the text inside every <font> element.

    Args:
        soup: Parsed BeautifulSoup document.

    Returns:
        ', '-separated stems ('' when the page has no <font>).
    """
    # find_all returns an empty list, not None, when nothing matches, so the
    # empty case falls out naturally as ''.
    return nltkPipe(' '.join(font.get_text() for font in soup.find_all('font')))

def getTableTokens(soup):
    """Return the stemmed tokens of all table header (<th>) and cell (<td>) text.

    Header text of every table comes first, followed by the cell text.

    Args:
        soup: Parsed BeautifulSoup document.

    Returns:
        ', '-separated stems ('' when the page has no tables).
    """
    # find_all never returns None; an empty list simply produces ''.
    tables = soup.find_all('table')
    header_text = ' '.join(
        th.get_text() for table in tables for th in table.find_all('th')
    )
    cell_text = ' '.join(
        td.get_text() for table in tables for td in table.find_all('td')
    )
    # The separating space matters: without it the last header word and the
    # first cell word are fused into a single bogus token.
    return nltkPipe(header_text + ' ' + cell_text)
    
def getHrefTokens(soup):
    """Return the stemmed tokens of the link targets (``href``) on the page.

    ``href`` is an attribute of <a>, not a tag of its own, so searching for a
    tag named 'href' never matches anything. The targets are read from the
    ``href`` attribute of every <a> element that has one.

    Args:
        soup: Parsed BeautifulSoup document.

    Returns:
        ', '-separated stems ('' when the page has no links).
    """
    anchors = soup.find_all('a', href=True)
    # `or ''` guards against a value-less href attribute.
    return nltkPipe(' '.join(a.get('href') or '' for a in anchors))
    
def getListTokens(soup):
    """Return the stemmed tokens of every <li> inside an <ol> or <ul>.

    Ordered lists are processed before unordered ones.

    Args:
        soup: Parsed BeautifulSoup document.

    Returns:
        ', '-separated stems ('' when the page has no lists).
    """
    # Concatenating two find_all results always gives a list (possibly
    # empty), never None, so no guard is needed.
    lists = soup.find_all('ol') + soup.find_all('ul')
    item_text = ' '.join(
        li.get_text() for lst in lists for li in lst.find_all('li')
    )
    return nltkPipe(item_text)

def get_all_tokens(frame):
    """Build one combined token string per row of ``frame``.

    Tokens are extracted from the domain, the URL and several parts of the
    parsed HTML (title, paragraphs, <dl>, <header>, <head>, <font>, tables,
    hrefs and lists), then joined per row.

    Args:
        frame: DataFrame with 'Domain', 'Url' and 'Html' columns.

    Returns:
        A Series indexed like ``frame`` holding, for each row, the non-empty
        token strings of all sources joined with ', '.
    """
    print("Parsing domain tokens...")
    domain_tokens = frame['Domain'].apply(getDomainTokens)
    print("Parsing url tokens...")
    url_tokens = frame['Url'].apply(getUrlTokens)
    print("Parsing soup...")
    soup = frame['Html'].apply(lambda x: BeautifulSoup(x, 'html.parser'))
    print("Getting title tokens...")
    title_tokens = soup.apply(getTitleTokens)
    print("Getting body tokens...")
    body_tokens = soup.apply(getBodyTokens)
    print("Getting description tokens...")
    description_tokens = soup.apply(getDescriptionTokens)
    print("Getting header tokens...")
    header_tokens = soup.apply(getHeaderTokens)
    print("Getting head-metadata tokens...")
    head_tokens = soup.apply(getHeadTokens)
    print("Getting font tokens...")
    font_tokens = soup.apply(getFontTokens)
    print("Getting table tokens...")
    table_tokens = soup.apply(getTableTokens)
    print("Getting href tokens...")
    href_tokens = soup.apply(getHrefTokens)
    print("Getting list tokens...")
    list_tokens = soup.apply(getListTokens)
    print("Done!")
    sources = [
        title_tokens, body_tokens, domain_tokens, url_tokens,
        description_tokens, header_tokens, head_tokens, font_tokens,
        table_tokens, href_tokens, list_tokens,
    ]
    # Plain string addition would fuse the last token of one source with the
    # first token of the next ("...foo" + "bar, ..." -> "foobar"). Join with
    # the same ', ' separator the individual sources use, skipping empty ones.
    combined = [
        ', '.join(part for part in row if part)
        for row in zip(*sources)
    ]
    return pd.Series(combined, index=frame.index, dtype=object)

#Build the model
def get_html(in_df, out_file_name, chunk_size=5000, overwrite=False, test=False):
    """Tokenise the HTML of every page in ``in_df`` and store the result as CSV.

    ``html_data.csv`` (in ``data_dir``) is streamed in chunks of
    ``chunk_size`` rows. Each chunk is inner-joined to ``in_df`` on
    'Webpage_id', passed through ``get_all_tokens`` and appended to
    ``data_dir + out_file_name``.

    Args:
        in_df: Frame with 'Webpage_id', 'Domain' and 'Url' columns, plus
            'Tag' unless ``test`` is true.
        out_file_name: Output file name, relative to ``data_dir``.
        chunk_size: Number of html_data.csv rows read per chunk.
        overwrite: When true, an existing output file is emptied first. When
            false, new rows are appended after whatever it already holds.
        test: When true, no 'Tag' column is read or written.

    Returns:
        None. The tokens are only written to disk.
    """
    if test:
        keep_cols = ["Webpage_id", "Domain", "Url"]
        out_cols = ["Webpage_id", "all_tokens"]
    else:
        keep_cols = ["Webpage_id", "Domain", "Url", "Tag"]
        out_cols = ["Webpage_id", "all_tokens", "Tag"]
    out_path = data_dir + out_file_name

    if overwrite or not os.path.isfile(out_path):
        # Start from a header-only file.
        pd.DataFrame(columns=out_cols).to_csv(out_path, index=False)
    else:
        # Rows are appended without a header below, so the existing file must
        # already have exactly these columns in this order; normalise it once
        # if it does not.
        existing_cols = pd.read_csv(out_path, nrows=0).columns.tolist()
        if existing_cols != out_cols:
            existing = pd.read_csv(out_path, usecols=out_cols)
            existing[out_cols].to_csv(out_path, index=False)

    use_df = in_df[keep_cols]
    # A set makes removing the ids found in each chunk cheap.
    remaining = set(use_df['Webpage_id'].values.tolist())
    print("Getting tokens...")
    print(len(remaining), ' indices left...')

    html_reader_obj = pd.read_csv(
        data_dir + 'html_data.csv', iterator=True, chunksize=chunk_size
    )
    try:
        # A single pass over the reader: it cannot be rewound, so looping
        # "until every id is found" would spin forever on ids that are
        # missing from html_data.csv.
        for chunk in html_reader_obj:
            if not remaining:
                break
            merge_df = pd.merge(use_df, chunk, how='inner', on='Webpage_id')
            if merge_df.empty:
                continue
            merge_df['all_tokens'] = get_all_tokens(merge_df)
            # Append only this chunk instead of re-reading and rewriting the
            # whole output file every time.
            merge_df[out_cols].to_csv(
                out_path, mode='a', header=False, index=False
            )
            remaining.difference_update(merge_df['Webpage_id'].values.tolist())
            print(len(remaining), ' indices left...')
    finally:
        html_reader_obj.close()

    if remaining:
        print(len(remaining), ' indices had no match in html_data.csv')
    print("Done! You can get your file at\n" + data_dir + out_file_name)

def build_model(tokens_file='train_df_all_tokens.csv', vectorizer=None, svd=None):
    """Fit the TF-IDF -> SVD -> stacked ExtraTrees model on the training tokens.

    The token CSV written by ``get_html`` is used as input and is generated
    first when it does not exist yet. An existing file is used as-is, so make
    sure it is complete.

    Args:
        tokens_file: Name of the training token CSV inside ``data_dir``.
        vectorizer: Unfitted text vectorizer; defaults to ``TfidfVectorizer()``.
        svd: Unfitted reducer; defaults to ``TruncatedSVD(n_components=500)``.

    Returns:
        Tuple ``(pipeline, vectorizer, svd, tag_dict)``: the fitted estimator,
        the fitted transformers needed to prepare the test data, and the
        mapping from tag name to the integer label the estimator predicts.
    """
    tokens_path = data_dir + tokens_file
    if not os.path.isfile(tokens_path):
        # get_html writes to disk and returns nothing, so create the file
        # first and read it back afterwards.
        get_html(pd.read_csv(data_dir + 'train.csv'), tokens_file)
    # Take the labels from the token file itself: its row order follows
    # html_data.csv, not train.csv, so re-attaching train.csv's 'Tag' column
    # by position would pair pages with the wrong labels.
    train_df = pd.read_csv(tokens_path, usecols=['all_tokens', 'Tag'])
    # Pages without any token are stored as empty cells and read back as NaN.
    train_df['all_tokens'] = train_df['all_tokens'].fillna('')

    if vectorizer is None:
        vectorizer = TfidfVectorizer()
    if svd is None:
        svd = TruncatedSVD(n_components=500)

    #Fit_transform to tdfif matrix
    print("Transforming to tdfif_matrix...")
    tfidf_matrix = vectorizer.fit_transform(train_df['all_tokens'])
    #Prune unneeded features
    print("Performing SVD...")
    svd_array = svd.fit_transform(tfidf_matrix)

    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn releases; support both.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    vector_features = get_names()
    # Column labels only: the vocabulary terms ranked by their weight in the
    # first SVD component. They name the columns; they do not describe them.
    # The count follows the actual number of components instead of assuming 500.
    ranked = svd.components_[0].argsort()[::-1]
    eigen_features = [vector_features[i] for i in ranked][:svd_array.shape[1]]

    features = pd.DataFrame(svd_array, columns=eigen_features)

    tags = sorted(train_df['Tag'].unique().tolist())
    tag_dict = {tag: code for code, tag in enumerate(tags)}
    labels = train_df['Tag'].map(tag_dict)

    #Build the model
    print("Building the model...")
    exported_pipeline = make_pipeline(
        StackingEstimator(
            estimator=ExtraTreesClassifier(
                bootstrap=False, criterion="gini", max_features=0.2,
                min_samples_leaf=11, min_samples_split=17, n_estimators=100)
        ),
        ExtraTreesClassifier(
            bootstrap=False, criterion="entropy", max_features=0.5,
            min_samples_leaf=6, min_samples_split=9, n_estimators=100
        )
    )

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.33
    )
    print("Fitting the model...")
    exported_pipeline.fit(X_train, y_train)
    # Report how the model does on the held-out third it was not trained on.
    print("Hold-out accuracy:", exported_pipeline.score(X_test, y_test))
    print("Done!")
    return exported_pipeline, vectorizer, svd, tag_dict

def prep_test(vectorizer_obj, svd_obj, tokens_file='test_df_all_tokens.csv'):
    """Transform the test dataset into the feature frame used for predicting.

    The token CSV written by ``get_html`` is used as input and is generated
    first when it does not exist yet. An existing file is used as-is.

    Args:
        vectorizer_obj: Fitted text vectorizer.
        svd_obj: Fitted SVD reducer.
        tokens_file: Name of the test token CSV inside ``data_dir``.

    Returns:
        DataFrame of SVD features with one row per row of test.csv, in
        test.csv order.
    """
    print("Getting tokens from html...")
    test_df = pd.read_csv(data_dir+'test.csv')
    tokens_path = data_dir + tokens_file
    if not os.path.isfile(tokens_path):
        # get_html writes to disk and returns nothing, so create the file
        # first and read it back afterwards.
        get_html(test_df, tokens_file, test=True)
    tokens = pd.read_csv(tokens_path, usecols=['Webpage_id', 'all_tokens'])
    # Appending runs can leave the same page in the file more than once.
    tokens = tokens.drop_duplicates('Webpage_id')
    # The token file follows html_data.csv's row order. A left join restores
    # test.csv order and keeps pages for which no HTML was found.
    test_df_tokens = test_df[['Webpage_id']].merge(
        tokens, on='Webpage_id', how='left'
    )
    test_df_tokens['all_tokens'] = test_df_tokens['all_tokens'].fillna('')
    #Transform to tdfif matrix
    print("Transforming to tfidf matrix...")
    test_df_tdif = vectorizer_obj.transform(test_df_tokens['all_tokens'])
    #Prune unneeded features
    print("Performing SVD...")
    test_svd_array = svd_obj.transform(test_df_tdif)

    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn releases; support both.
    get_names = (getattr(vectorizer_obj, 'get_feature_names_out', None)
                 or vectorizer_obj.get_feature_names)
    vector_features = get_names()
    # Column labels only, sized to the actual number of SVD components.
    ranked = svd_obj.components_[0].argsort()[::-1]
    eigen_features = [vector_features[i] for i in ranked][:test_svd_array.shape[1]]
    #Map to dataframe
    # No 'Tag' column is added: test.csv has none, and an extra column would
    # be fed to the model as a feature.
    test_df_svd = pd.DataFrame(test_svd_array, columns=eigen_features)
    print("Done!")
    return test_df_svd

def main():
    """Train the model, prepare the test set and return the raw predictions.

    The two closing messages only announce formatting and saving; this
    function does neither. The returned predictions are exactly what
    ``model.predict`` produced, and ``tag_dict`` is unpacked but not used.
    """
    print("Getting the model, transform objects and tag-dict...")
    model, vectorizer_obj, svd_obj, tag_dict = build_model()

    print("Prepping the test dataset...")
    test_features = prep_test(vectorizer_obj, svd_obj)

    print("Making predictions...")
    predictions = model.predict(test_features)

    for message in ("Formatting predictions...",
                    "Saving predictions for submission..."):
        print(message)
    return predictions

  from numpy.core.umath_tests import inner1d


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jake\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jake\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Let's try to extract more tokens. I want to extract tokens from the description, headings, highlights, special fonts, and table and list elements.

To do this, I'll write an HTML-reading script for a subset of rows to test my parser on. Then I'll incorporate it into the get_html function and make sure I get all the tokens to train on.

In [2]:
# Load the training index from train.csv in data_dir and show five random rows.
train_df = pd.read_csv(data_dir+'train.csv')
train_df.sample(5)

Unnamed: 0,Webpage_id,Domain,Url,Tag
47212,70113,www.medbox.org,https://www.medbox.org/clinical-guidelines/mal...,guidelines
5860,8570,www.dart-europe.eu,http://www.dart-europe.eu/full.php?id=110600,thesis
30825,46028,www.ivfforums.gr,http://www.ivfforums.gr/ivf-moms-magazine/cook...,forum
33088,49622,www.childrenshospital.org,http://www.childrenshospital.org/news-and-even...,news
48957,72270,curate.nd.edu,https://curate.nd.edu/show/1c18df67n1c,thesis


In [3]:
# Load the test index from test.csv in data_dir and show five random rows.
test_df = pd.read_csv(data_dir+'test.csv')
test_df.sample(5)

Unnamed: 0,Webpage_id,Domain,Url
5012,15530,tlcr.amegroups.com,http://tlcr.amegroups.com/article/view/3192/3746
19299,58628,go.qiagen.com,https://go.qiagen.com/ngs-assay-menu?elq=00000...
14266,43055,www.mhi.interv.org,http://www.mhi.interv.org/
7810,23543,cshprotocols.cshlp.org,http://cshprotocols.cshlp.org/content/2014/3/p...
4420,13985,www.mims.co.uk,http://www.mims.co.uk/risk-abuse-gabapentin-pr...


In [5]:
# Tokenise the HTML of the training pages; get_html writes the result to train_df_all_tokens.csv in data_dir.
get_html(train_df,'train_df_all_tokens.csv')

Getting tokens...
53447  indices left...
Parsing domain tokens...
Parsing url tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Getting description tokens...
Getting header tokens...
Getting head-metadata tokens...
Getting font tokens...
Getting table tokens...
Getting href tokens...
Getting list tokens...
Done!
50065  indices left...
Parsing domain tokens...
Parsing url tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Getting description tokens...
Getting header tokens...
Getting head-metadata tokens...
Getting font tokens...
Getting table tokens...
Getting href tokens...
Getting list tokens...
Done!
46616  indices left...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Parsing domain tokens...
Parsing url tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Getting description tokens...
Getting header tokens...
Getting head-metadata tokens...
Getting font tokens...
Getting table tokens...
Getting href tokens...
Getting list tokens...
Done!
43146  indices left...
Parsing domain tokens...
Parsing url tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Getting description tokens...
Getting header tokens...
Getting head-metadata tokens...
Getting font tokens...
Getting table tokens...
Getting href tokens...
Getting list tokens...
Done!
40091  indices left...
Parsing domain tokens...
Parsing url tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Getting description tokens...
Getting header tokens...
Getting head-metadata tokens...
Getting font tokens...
Getting table tokens...
Getting href tokens...
Getting list tokens...
Done!
36762  indices left...
Parsing domain tokens...
Parsing url tokens

In [6]:
# Same for the test pages; test=True makes get_html work without a 'Tag' column.
get_html(test_df,'test_df_all_tokens.csv', test=True)

Getting tokens...
25787  indices left...
Parsing domain tokens...
Parsing url tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Getting description tokens...
Getting header tokens...
Getting head-metadata tokens...
Getting font tokens...
Getting table tokens...
Getting href tokens...
Getting list tokens...
Done!
24169  indices left...
Parsing domain tokens...
Parsing url tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Getting description tokens...
Getting header tokens...
Getting head-metadata tokens...
Getting font tokens...
Getting table tokens...
Getting href tokens...
Getting list tokens...
Done!
22618  indices left...
Parsing domain tokens...
Parsing url tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Getting description tokens...
Getting header tokens...
Getting head-metadata tokens...
Getting font tokens...
Getting table tokens...
Getting href tokens...
Getting list tokens...
Done!
21088  indices left...
Pa