In [58]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
import tldextract


# Fetch the NLTK resources used below (reported as up-to-date when already cached).
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# English stop words as a set, for fast membership tests inside nltkPipe.
stop_words = set(stopwords.words('english'))

# Folder that holds the competition CSV files, relative to this notebook.
data_dir = "../data/2018-08-10_AV_Innoplexus/"

#Normalise raw text (e.g. the result of BeautifulSoup's get_text) into stemmed tokens.
def nltkPipe(soup_text):
    """Tokenize, filter and stem ``soup_text``.

    Steps: split with ``wordpunct_tokenize``, lowercase, keep only purely
    alphabetic tokens longer than one character that are not English stop
    words, then stem each survivor with the Snowball English stemmer.

    Parameters
    ----------
    soup_text : str or None
        Raw text to process. ``None`` and the empty string yield ``''``.

    Returns
    -------
    str
        The stems joined with ``', '`` (empty string when nothing survives).
    """
    if not soup_text:
        return ''
    stemmer = EnglishStemmer()
    stems = [
        stemmer.stem(token)
        # Lowercase once, up front, so the stop-word lookup and stemmer see the same form.
        for token in (raw.lower() for raw in wordpunct_tokenize(soup_text))
        if token.isalpha() and len(token) > 1 and token not in stop_words
    ]
    return ', '.join(stems)

def getTitleTokens(soup):
    """Return the stemmed token string of the page's <title>, or '' if there is no title."""
    title_tag = soup.title
    if title_tag is None:
        return ''
    return nltkPipe(title_tag.get_text())
    
def getBodyTokens(soup):
    """Return the stemmed token string built from the page's plain <p> paragraphs.

    Paragraphs that contain a <span> or an <a> element are left out; the text
    of the remaining paragraphs is joined with spaces and sent through nltkPipe.
    """
    plain_paragraphs = []
    for paragraph in soup.find_all('p'):
        # Skip paragraphs wrapping a <span> or a link.
        if paragraph.span is not None or paragraph.a is not None:
            continue
        plain_paragraphs.append(paragraph.get_text())
    return nltkPipe(' '.join(plain_paragraphs))

def getDomainTokens(domainstr):
    """Return the token string for a domain, e.g. 'pacificcancercar, com'.

    The ``domain`` and ``suffix`` parts reported by tldextract are joined with
    a comma and passed through nltkPipe.
    """
    parts = tldextract.extract(domainstr)
    return nltkPipe(','.join((parts.domain, parts.suffix)))

def get_all_tokens(frame):
    print("Parsing domain tokens...")
    domain_tokens = frame['Domain'].apply(getDomainTokens)
    print("Parsing soup...")
    soup = frame['Html'].apply(lambda x: BeautifulSoup(x, 'html.parser'))
    print("Getting title tokens...")
    title_tokens = soup.apply(getTitleTokens)
    print("Getting body tokens...")
    body_tokens = soup.apply(getBodyTokens)
    print("Done!")
    return title_tokens + body_tokens + domain_tokens

#Load the raw HTML for the requested pages and turn it into token strings
def get_html(in_df, test=False):
    """Attach an 'all_tokens' column to the pages listed in ``in_df``.

    ``html_data.csv`` (under ``data_dir``) is streamed in chunks of 10000 rows,
    inner-joined with the requested rows on 'Webpage_id', and tokenized with
    get_all_tokens. The file is read once; reading stops as soon as every
    requested id has been matched.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Needs 'Webpage_id' and 'Domain' columns, plus 'Tag' unless ``test``.
    test : bool, default False
        When True the 'Tag' column is not required and not returned.

    Returns
    -------
    pandas.DataFrame
        Columns 'Webpage_id', ('Tag',) 'all_tokens', with a fresh 0..n-1 index.
        Rows come back in the order they are met in ``html_data.csv``, not in
        the order of ``in_df``. Ids absent from the HTML file are reported and
        left out. An empty request yields an empty frame.
    """
    keep_cols = ["Webpage_id","Domain"] if test else ["Webpage_id","Tag","Domain"]
    use_df = in_df[keep_cols]
    # A set makes the "still missing" bookkeeping linear instead of quadratic.
    remaining_ids = set(use_df['Webpage_id'].values.tolist())
    frames = []
    print("Getting tokens...")
    print(len(remaining_ids),' indices left...')
    if remaining_ids:
        html_reader_obj = pd.read_csv(data_dir+'html_data.csv',iterator=True, chunksize=10000)
        try:
            for chunk in html_reader_obj:
                merge_df = pd.merge(use_df,chunk,how='inner',on='Webpage_id')
                if merge_df.empty:
                    continue
                merge_df['all_tokens'] = get_all_tokens(merge_df)
                merge_df.drop(['Html','Domain'],axis=1,inplace=True)
                remaining_ids.difference_update(merge_df['Webpage_id'].values.tolist())
                print(len(remaining_ids),' indices left...')
                frames.append(merge_df)
                if not remaining_ids:
                    # Everything requested was found: skip the rest of the file.
                    break
        finally:
            html_reader_obj.close()
    if remaining_ids:
        # The reader is exhausted at this point; looping again could never
        # find these ids (the old while-loop spun forever here).
        print(len(remaining_ids),' indices had no match in html_data.csv')
    if frames:
        process_df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat([]) raises, so build the empty result explicitly.
        process_df = use_df.iloc[0:0].drop('Domain',axis=1).assign(all_tokens=pd.Series(dtype=object))
    print("Done!")
    return process_df

def build_model(vectorizer_obj=None, svd_obj=None):
    """Train the tag classifier and return it with the fitted transformers.

    Parameters
    ----------
    vectorizer_obj : TfidfVectorizer, optional
        Vectorizer to fit. When omitted a new one is created with the same
        settings the notebook uses, so the function does not depend on a
        global defined in a later cell.
    svd_obj : TruncatedSVD, optional
        Decomposition to fit. When omitted a new 500-component one is created
        with the notebook's settings.

    Returns
    -------
    tuple
        ``(pipeline, vectorizer, svd, tag_dict)``: the fitted estimator, the
        fitted transformers needed to prepare the test data, and the mapping
        from tag name to the integer code the estimator predicts.
    """
    if vectorizer_obj is None:
        vectorizer_obj = TfidfVectorizer(input='content', analyzer='word')
    if svd_obj is None:
        svd_obj = TruncatedSVD(n_components=500, n_iter=5, random_state=27)

    #Get tokens
    tokens_df = get_html(pd.read_csv(data_dir+'train.csv'))
    # Take the labels from get_html's output: it returns rows in html_data.csv
    # order, so labels read from train.csv would be paired with the wrong rows.
    tag_values = tokens_df['Tag'].values
    #Fit_transform to tfidf matrix
    print("Transforming to tdfif_matrix...")
    tfidf_matrix = vectorizer_obj.fit_transform(tokens_df['all_tokens'])
    #Prune unneeded features
    print("Performing SVD...")
    svd_array = svd_obj.fit_transform(tfidf_matrix)

    # get_feature_names() was removed from newer scikit-learn releases;
    # prefer get_feature_names_out() when it exists.
    get_names = getattr(vectorizer_obj, 'get_feature_names_out', None) or vectorizer_obj.get_feature_names
    vector_features = get_names()
    # Column labels only: the heaviest terms of the first component, one per
    # SVD column. The columns themselves are SVD components, not single words.
    ranked_terms = svd_obj.components_[0].argsort()[::-1][:svd_array.shape[1]]
    eigen_features = [str(vector_features[i]) for i in ranked_terms]

    train_df = pd.DataFrame(svd_array,columns=eigen_features)

    tags = sorted(pd.unique(tag_values).tolist())
    tag_dict = {tag: code for code, tag in enumerate(tags)}
    train_df['Tag_encoded'] = pd.Series(tag_values).map(tag_dict).values

    #Build the model
    print("Building the model...")
    exported_pipeline = make_pipeline(
        StackingEstimator(
            estimator=ExtraTreesClassifier(
                bootstrap=False, criterion="gini", max_features=0.2,
                min_samples_leaf=11, min_samples_split=17, n_estimators=100)
        ),
        ExtraTreesClassifier(
            bootstrap=False, criterion="entropy", max_features=0.5,
            min_samples_leaf=6, min_samples_split=9, n_estimators=100
        )
    )

    x_cols = [x for x in train_df.columns if x != "Tag_encoded"]
    X_train, X_test, y_train, y_test = train_test_split(
        train_df[x_cols],
        train_df['Tag_encoded'],
        test_size=0.33
    )
    print("Fitting the model...")
    exported_pipeline.fit(X_train, y_train)
    # Put the held-out third to use so the split is not silently discarded.
    print("Hold-out accuracy:", exported_pipeline.score(X_test, y_test))
    print("Done!")
    return exported_pipeline, vectorizer_obj, svd_obj, tag_dict

def prep_test(vectorizer_obj, svd_obj):
    """Transform the test dataset into the feature frame used for predicting.

    Parameters
    ----------
    vectorizer_obj : fitted TfidfVectorizer
        The vectorizer returned by build_model.
    svd_obj : fitted TruncatedSVD
        The decomposition returned by build_model.

    Returns
    -------
    pandas.DataFrame
        One row per test page found in the HTML file, holding only the SVD
        feature columns (labelled exactly like the training features) so it can
        be passed straight to ``predict``. The index carries 'Webpage_id'.
    """
    print("Getting tokens from html...")
    test_df = pd.read_csv(data_dir+'test.csv')
    #Get the HTML. test=True: the test set is processed without a 'Tag' column.
    test_df_tokens = get_html(test_df, test=True)
    #Transform to tfidf matrix
    print("Transforming to tfidf matrix...")
    test_df_tdif = vectorizer_obj.transform(test_df_tokens['all_tokens'])
    #Prune unneeded features
    print("Performing SVD...")
    test_svd_array = svd_obj.transform(test_df_tdif)

    # get_feature_names() was removed from newer scikit-learn releases;
    # prefer get_feature_names_out() when it exists.
    get_names = getattr(vectorizer_obj, 'get_feature_names_out', None) or vectorizer_obj.get_feature_names
    vector_features = get_names()
    ranked_terms = svd_obj.components_[0].argsort()[::-1][:test_svd_array.shape[1]]
    eigen_features = [str(vector_features[i]) for i in ranked_terms]
    #Map to dataframe. No 'Tag' column is added: there are no test labels to
    #copy, and an extra column would be fed to the model as a feature.
    test_df_svd = pd.DataFrame(
        test_svd_array,
        columns=eigen_features,
        index=pd.Index(test_df_tokens['Webpage_id'].values, name='Webpage_id'),
    )
    print("Done!")
    return test_df_svd

def main():
    """Train the model, featurise the test set and return the raw predictions.

    NOTE(review): the last two progress messages announce formatting and
    saving, but this function does neither. It returns the encoded
    predictions as produced by ``predict``, and the tag map is not applied.
    """
    print("Getting the model, transform objects and tag-dict...")
    classifier, fitted_vectorizer, fitted_svd, _tag_dict = build_model()

    print("Prepping the test dataset...")
    test_features = prep_test(fitted_vectorizer, fitted_svd)

    print("Making predictions...")
    encoded_predictions = classifier.predict(test_features)

    print("Formatting predictions...")
    print("Saving predictions for submission...")
    return encoded_predictions

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jdber\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jdber\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
token_df = pd.read_csv(data_dir+'train_with_tokens_no_html.csv',usecols=["Webpage_id","Tag","title_tokens","body_tokens"])
# The token columns appear to hold stringified lists (see the quoted tokens in
# the output), so strip the list brackets and any double quotes.
# regex=False makes the literal match explicit, and every step uses
# .str.replace: the previous trailing Series.replace('"','') only matched whole
# cell values and therefore never removed anything.
token_df['all_tokens'] = (
    (token_df['title_tokens']+token_df['body_tokens'])
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.replace('"', '', regex=False)
)
token_df = token_df.drop(['title_tokens','body_tokens'],axis=1)
token_df.sample(5)

Unnamed: 0,Webpage_id,Tag,all_tokens
16427,24663,news,"'allogen', 'transplant', 'age', 'still', 'matt..."
19326,29109,others,"'yorkshir', 'even', 'post', 'obituari', 'yorks..."
24157,36422,profile,"'bruce', 'faddegon', 'phd', 'ucsf', 'helen', '..."
25974,39212,profile,"'mit', 'nse', 'faculti', 'neil', 'todrea''kepc..."
35900,53763,others,"'exrna', 'home''advertis', 'exrna', 'accept', ..."


In [43]:
domain_df = pd.read_csv(data_dir+'train.csv',usecols=['Webpage_id','Domain'])
# getDomainTokens already performs "extract domain + suffix, then nltkPipe".
# Calling it runs tldextract once per row instead of twice and keeps the train
# features identical to what get_all_tokens builds for the test set.
domain_df['domain_features'] = domain_df['Domain'].apply(getDomainTokens)
domain_df = domain_df.drop('Domain',axis=1)
domain_df.sample(5)

Unnamed: 0,Webpage_id,domain_features
22384,33792,"pacificcancercar, com"
16995,25409,"fitnessandpow, com"
53364,79242,"mit, edu"
10588,15603,"acr, org"
35298,52862,"bmj, com"


In [47]:
# Attach the domain tokens to each page's title/body tokens.
source_df = token_df.merge(domain_df, how='inner', on='Webpage_id')
print(source_df.shape)
source_df = source_df.assign(
    all_tokens=source_df['all_tokens'] + ', ' + source_df['domain_features']
).drop('domain_features', axis=1)
source_df.sample(5)

(53447, 4)


Unnamed: 0,Webpage_id,Tag,all_tokens
3523,5144,profile,"'dr', 'john', 'capurro', 'md', 'milford', 'oh'..."
44944,67043,profile,"'koji', 'hatsukawa', 'compani', 'inform', 'tak..."
46013,68622,clinicalTrials,"'tctr', 'thai', 'clinic', 'trial', 'registri',..."
24466,36801,profile,"'societi', 'intervent', 'radiolog', 'sir', 'do..."
30944,46147,forum,"'mysteri', 'infertilityhonesti''ten', 'minut',..."


In [48]:
# TF-IDF over the word tokens of the prepared token strings.
vectorizer = TfidfVectorizer(
    analyzer='word',
    input='content',
)
# Project the TF-IDF matrix onto 500 components; the fixed seed keeps the
# projection repeatable between runs.
svd = TruncatedSVD(
    n_components=500,
    n_iter=5,
    random_state=27,
)

In [49]:
# Fit the TF-IDF vocabulary on the training token strings and build the document-term matrix.
word_vector_array = vectorizer.fit_transform(source_df['all_tokens'])

In [73]:
# Fit the truncated SVD on the TF-IDF matrix and project the training rows onto its components.
svd_array = svd.fit_transform(word_vector_array)

In [74]:
# Column labels for the SVD output: the vocabulary terms ranked by their weight
# in the first component. They are labels only; each column is an SVD
# component, not a single word.
vector_features = vectorizer.get_feature_names()
eigen_features = [vector_features[i] for i in svd.components_[0].argsort()[::-1]][:500]

train_df = pd.DataFrame(svd_array, columns=eigen_features)
train_df['Tag'] = source_df['Tag']

# Encode the tag names as integers, numbered in alphabetical order.
tags = sorted(train_df['Tag'].unique().tolist())
tag_dict = dict(zip(tags, range(len(tags))))

train_df['Tag_encoded'] = train_df['Tag'].map(tag_dict)
train_df = train_df.drop('Tag', axis=1)

In [75]:
# Peek at a random sample of the reduced features together with the encoded tag.
train_df.sample(10)

Unnamed: 0,patient,cancer,health,bayer,research,product,inform,use,gsk,develop,...,graduat,throughout,whether,among,total,sustain,mobil,forum,centr,Tag_encoded
45644,0.148401,-0.084348,-0.038424,-0.042225,-0.029757,0.004467,-0.028058,0.005072,-0.027872,0.028766,...,-0.015298,0.033753,-0.001467,-0.012642,-0.012301,-0.014438,-0.047051,-0.009287,-0.021176,7
50755,0.154386,-0.032234,-0.014983,-0.005945,-0.002034,-0.00536,-0.006871,-0.015065,0.037669,0.016059,...,-0.011327,-0.015481,-0.001123,-0.000761,-0.002274,0.011933,-0.004356,0.015494,-0.007348,4
6991,0.131928,-0.026124,-0.020698,0.004053,-0.01372,0.006033,-0.028934,-0.000562,0.005782,0.013431,...,-0.035397,-0.007723,-0.053035,-0.001402,-0.013196,-0.03081,0.031708,0.037635,-0.001763,3
30828,0.065833,-0.008795,-0.007658,0.0392,-0.017303,-0.008882,0.05697,0.000565,0.011923,0.045095,...,0.006847,-0.002478,0.00504,-0.007234,-0.015462,0.002496,-0.00301,-0.00649,-0.005172,2
19752,0.109169,0.016723,-0.007267,0.027117,-0.004144,-0.003396,0.038657,0.023709,0.005463,0.045329,...,-0.007441,-0.002396,-0.017509,0.009909,0.027611,0.013205,0.000551,0.020046,-0.026557,5
49379,0.055385,-0.036873,-0.026656,-0.023732,-0.009926,0.003048,-0.015163,0.021394,-0.020639,-0.009577,...,0.006707,-0.011304,-0.000261,0.005222,0.01545,-0.001336,0.006149,0.015433,0.009121,7
46275,0.055629,-0.085488,0.46031,-0.166245,0.007595,-0.034809,0.080248,0.043347,-0.253839,-0.088894,...,0.000372,-0.000194,8.9e-05,-0.000471,-0.000112,-1.6e-05,0.000375,0.000448,0.00026,0
36571,0.125965,-0.046169,-0.018657,-0.00327,-0.024569,-0.007449,0.009288,0.00852,-0.024646,0.044337,...,-0.023248,-0.021361,0.007438,-0.014241,-0.020298,-0.008134,-0.002147,0.010233,-0.006846,5
868,0.036869,-0.008067,-0.019351,0.000896,0.003995,-0.001342,0.034133,0.081261,0.010176,-0.043716,...,0.000709,0.010042,0.020556,-0.017408,-0.004907,0.007852,0.008303,-0.010102,-0.004037,4
30820,0.143354,-0.061969,-0.0642,-0.002835,-0.057456,-0.027892,0.075034,-0.038988,-0.055222,-0.003986,...,-0.043678,-0.002637,0.003034,0.038268,0.035648,-0.006908,0.004277,-0.000765,0.013599,2


In [53]:
# Seed both the forest and the split (same seed as the SVD cell) so the
# hold-out score reported below can be reproduced on a fresh run.
exported_pipeline = ExtraTreesClassifier(
    bootstrap=False, criterion="entropy", max_features=0.7500000000000001,
    min_samples_leaf=2, min_samples_split=2, n_estimators=100, verbose=2,
    n_jobs=2, random_state=27
)

x_cols = [x for x in train_df.columns if x != "Tag_encoded"]
X_train, X_test, y_train, y_test = train_test_split(
    train_df[x_cols],
    train_df['Tag_encoded'],
    test_size=0.33,
    random_state=27
)
print("Fitting the model...")
exported_pipeline.fit(X_train, y_train)
print("Done!")

Fitting the model...
building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   56.9s


building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80

[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  2.5min finished


Done!


In [54]:
#Sanity/quality check: predict on the held-out split and tabulate a confusion matrix.
# NOTE(review): this import would be better placed in the notebook's first import cell.
from sklearn.metrics import confusion_matrix, f1_score
preds = exported_pipeline.predict(X_test)
conf_mat = confusion_matrix(y_test, preds)

[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.3s finished


In [55]:
# Confusion matrix built from (y_test, preds): by scikit-learn's convention rows are
# the true encoded tags and columns the predicted ones.
conf_mat

array([[ 893,    3,    0,    0,    3,    6,    0,   28,    0],
       [   0, 1082,    2,    7,   17,  148,   19,  245,    4],
       [   0,    1, 1438,    0,   18,    9,    2,    6,    0],
       [   0,    6,    1,  374,    4,   40,    0,   18,    0],
       [   2,   23,   20,    9, 1857,  604,   15,  119,    0],
       [   3,   23,   11,   10,  209, 5431,   59,   68,    5],
       [   0,    1,    0,    0,    9,  183, 1511,   19,    1],
       [   2,   53,    0,   12,   14,  145,    5, 2274,    0],
       [   0,    0,    0,    0,    7,   12,    0,   18,  530]],
      dtype=int64)

In [109]:
# Weighted F1 on the held-out split ('weighted' averages the per-class scores by class support).
model_f1 = f1_score(y_test, preds, average='weighted')
model_f1

0.8707502582796638

OK, not bad. Let's generate the submission.

In [59]:
#Now turn this pipeline around on the test set: load the ids/domains and tokenize their HTML.
test_source = pd.read_csv(data_dir+"test.csv",usecols=['Webpage_id','Domain'])
test_df = get_html(test_source, test=True)

Getting tokens...
25787  indices left...
Parsing domain tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Done!
22618  indices left...
Parsing domain tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Done!
19143  indices left...
Parsing domain tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Done!
15768  indices left...
Parsing domain tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Done!
12325  indices left...
Parsing domain tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Done!
9169  indices left...
Parsing domain tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Done!
5913  indices left...
Parsing domain tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Done!
3023  indices left...
Parsing domain tokens...
Parsing soup...
Getting title tokens...
Getting body tokens...
Done!
0  indices left...
Done!


In [95]:
#Optional cache of the tokenized test set (left disabled):
#test_df.to_csv(data_dir+"test_tokens.csv",index=False)
test_df.head()

Unnamed: 0,Webpage_id,all_tokens
0,31,"isrctn, develop, valid, caregiv, qualiti, life..."
1,32,"clinic, trial, registclinicaltrialsregist, eu"
2,33,"clinic, trial, registclinicaltrialsregist, eu"
3,34,"clinic, trial, registclinicaltrialsregist, eu"
4,35,"clinic, trial, registclinicaltrialsregist, eu"


In [64]:
# NOTE(review): abandoned experiment, entirely commented out; nothing in this cell runs.
# def listWrapper(row):
#     None

# def fillEmptyTokens(frame):
#     empty_indices = frame[frame.iloc[:]['all_tokens'].str.len()==0].index.values.tolist()
#     frame.loc[empty_indices,'all_tokens'] = [[['the']] for x in empty_indices]
#     return frame

# fillEmptyTokens(test_df)

In [66]:
# NOTE(review): earlier attempt that vectorizes each row's comma-split tokens separately.
# Its result is not used by the cells below, which rely on `test_vector_arr` instead.
test_vector_array = test_df['all_tokens'].apply(lambda x: vectorizer.transform(x.split(',')))

In [84]:
# Vectorize the test token strings with the vocabulary fitted on the training data (transform only, no refit).
test_vector_arr = vectorizer.transform(test_df['all_tokens'])

In [87]:
# Project the test TF-IDF matrix with the SVD fitted on the training data.
test_svd_array = svd.transform(test_vector_arr)

In [94]:
# Wrap the projected test matrix in a frame that reuses the training column labels.
test_df_predict = pd.DataFrame(test_svd_array,columns=eigen_features)
test_df_predict.head()

Unnamed: 0,patient,cancer,health,bayer,research,product,inform,use,gsk,develop,...,produc,graduat,throughout,whether,among,total,sustain,mobil,forum,centr
0,0.173647,-0.049837,0.008551,-0.012303,-0.021304,-0.006715,0.006104,-0.023268,0.028254,0.011883,...,-0.002451,-0.006524,0.037639,0.014929,-0.012422,0.011763,0.001554,-0.001192,0.015627,-0.012872
1,0.089805,-0.084448,0.308336,-0.110524,0.026413,0.092157,0.060744,-0.029284,-0.07105,-0.051477,...,-0.019758,-0.034769,-0.026266,-0.001027,0.006043,-0.016143,0.010707,0.009766,-0.053213,0.001117
2,0.089805,-0.084448,0.308336,-0.110524,0.026413,0.092157,0.060744,-0.029284,-0.07105,-0.051477,...,-0.019758,-0.034769,-0.026266,-0.001027,0.006043,-0.016143,0.010707,0.009766,-0.053213,0.001117
3,0.089805,-0.084448,0.308336,-0.110524,0.026413,0.092157,0.060744,-0.029284,-0.07105,-0.051477,...,-0.019758,-0.034769,-0.026266,-0.001027,0.006043,-0.016143,0.010707,0.009766,-0.053213,0.001117
4,0.089805,-0.084448,0.308336,-0.110524,0.026413,0.092157,0.060744,-0.029284,-0.07105,-0.051477,...,-0.019758,-0.034769,-0.026266,-0.001027,0.006043,-0.016143,0.010707,0.009766,-0.053213,0.001117


In [99]:
# Predict the encoded tag for every test page.
submission_preds = exported_pipeline.predict(test_df_predict)

[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.4s finished


In [102]:
#get the inverted tag map (encoded integer -> tag name) for mapping predictions back
tag_dict_inverted = {v: k for k, v in tag_dict.items()}

In [108]:
# Assemble the submission: page id plus the predicted tag name, then write it out.
col_order = ["Webpage_id","Tag"]
submission_df = pd.DataFrame(
    {
        "Webpage_id": test_df['Webpage_id'].values.tolist(),
        # Decode the integer predictions back to tag names.
        "Tag": pd.Series(submission_preds, index=test_df_predict.index).map(tag_dict_inverted),
    },
    index=test_df_predict.index,
)[col_order]
submission_df.to_csv('submission_01.csv',index=False)
submission_df.head()

Unnamed: 0,Webpage_id,Tag
0,31,clinicalTrials
1,32,others
2,33,others
3,34,others
4,35,others
