# Data Preprocessing

In [17]:
%matplotlib inline
import numpy as np
import pandas as pd
import dask.dataframe as dd
from bs4 import BeautifulSoup # For Scraping HTML page
from bs4.element import Comment
import re
import tldextract

In [57]:
from sqlalchemy import create_engine # sqlalchemy = ORM library
from sqlalchemy_utils.functions import database_exists, drop_database
import os
cwd = os.getcwd()  # current working directory (not referenced by the visible cells)

# Create an engine backed by the SQLite file `avdb.db`; the three-slash URL
# form is a path relative to the current working directory.
# The `encoding='utf-8'` keyword was removed: it is deprecated since
# SQLAlchemy 1.4, rejected by 2.0, and 'utf-8' was already its default value.
db_engine = create_engine('sqlite:///avdb.db')
url = db_engine.url
print(url)
print(database_exists(url))  # True once the database file has been created
# To start from an empty database, uncomment:
# drop_database(url)
# print(database_exists(url))

sqlite:///avdb.db
True


In [3]:
# Input file locations, relative to the notebook's working directory.
train_data = 'bigdata/train.csv'
# NOTE(review): later cells spell this same path out as a literal instead of
# reusing `html_data`.
html_data = 'bigdata/train/html_data.csv'

In [4]:
# Load the labelled training rows, using the Webpage_id column as the index.
df = pd.read_csv(train_data, index_col='Webpage_id')
df.head()

Unnamed: 0_level_0,Domain,Url,Tag
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,www.fiercepharma.com,http://www.fiercepharma.com/marketing/tecfider...,news
2,www.fiercepharma.com,http://www.fiercepharma.com/pharma/novo-equipp...,news
3,www.fiercepharma.com,http://www.fiercepharma.com/pharma/another-exe...,news
4,www.fiercepharma.com,http://www.fiercepharma.com/pharma/teva-buy-bi...,news
5,www.fiercepharma.com,http://www.fiercepharma.com/marketing/actress-...,news


In [5]:
df.isnull().sum() # Sanity check: per-column count of null values in the training frame

Domain    0
Url       0
Tag       0
dtype: int64

In [6]:
# Url is kept (it is used later when inspecting rows without a title).
# To drop it instead: df.drop(columns=['Url'], inplace=True)
# Show dtypes, non-null counts and the memory footprint before any conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53447 entries, 1 to 79345
Data columns (total 3 columns):
Domain    53447 non-null object
Url       53447 non-null object
Tag       53447 non-null object
dtypes: object(3)
memory usage: 1.6+ MB


In [7]:
# Store the repetitive string columns as pandas categoricals to shrink the frame.
df['Domain'] = df['Domain'].astype('category')
df['Tag'] = df['Tag'].astype('category')
# Reported memory usage goes from 1.6+ MB to 1.2+ MB.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53447 entries, 1 to 79345
Data columns (total 3 columns):
Domain    53447 non-null category
Url       53447 non-null object
Tag       53447 non-null category
dtypes: category(2), object(1)
memory usage: 1.2+ MB


In [8]:
# List the distinct target classes alphabetically, then show how many rows each has.
tag_labels = sorted(df['Tag'].unique())
print(tag_labels)

df['Tag'].value_counts()

['clinicalTrials', 'conferences', 'forum', 'guidelines', 'news', 'others', 'profile', 'publication', 'thesis']


others            17417
news               7992
publication        7705
profile            5196
conferences        4666
forum              4503
clinicalTrials     2839
thesis             1800
guidelines         1329
Name: Tag, dtype: int64

In [9]:
# Row count per raw host name, most frequent first.
df.Domain.value_counts()

thesis.library.caltech.edu                301
ecommons.cornell.edu                      300
dspace.mit.edu                            300
curate.nd.edu                             300
academiccommons.columbia.edu              300
www.dart-europe.eu                        300
www.nice.org.uk                           230
www.ncbi.nlm.nih.gov                      226
www.australiancancertrials.gov.au         209
ctri.nic.in                               209
rctportal.niph.go.jp                      206
slctr.lk                                  203
en.search.irct.ir                         201
www.anzctr.org.au                         200
clinicaltrials.gov                        200
www.isrctn.com                            200
www.medbox.org                            200
upload.umin.ac.jp                         200
cris.nih.go.kr                            200
www.chictr.org.cn                         200
www.clinicalguidelines.gov.au             200
www.trialregister.nl              

In [11]:
import re
# df.Domain.apply(lambda x: x.replace('www[0-9].','')).value_counts().sort_index()
# df.Domain.apply(lambda x: re.sub(r'ww[0-9A-Za-z]*.','',x)) \
#          .apply(lambda x: re.sub(r'201[0-9]{1,}.','',x)) \
#          .value_counts().sort_index()

In [12]:
# Get just the domain from URLs
import tldextract

def extract_domain(url):
    """Return the `domain` field that tldextract parses out of `url`.

    `url` is handed to `tldextract.extract` unchanged; only the `.domain`
    attribute of the result is returned.
    """
    parsed = tldextract.extract(url)
    return parsed.domain

# Overwrite the Domain column with the output of extract_domain for each value,
# then show the resulting frequencies.
df.Domain = df.Domain.apply(extract_domain)
df.Domain.value_counts() # append .sort_index() to order by name instead of count

biomedcentral                  3568
bayer                          1675
mit                            1663
gsk                            1081
bmj                             776
nih                             645
clinicaltrials                  600
conferenceseries                546
sagepub                         448
blogspot                        386
amgen                           383
aacrjournals                    375
abbvie                          352
dana-farber                     352
emdgroup                        322
caltech                         322
cornell                         304
nd                              300
columbia                        300
dart-europe                     300
qiagen                          297
mskcc                           295
astrazeneca                     279
pharmaceuticalconferences       262
physiology                      259
boehringer-ingelheim            254
biogen                          238
alexion                     

In [13]:
# Sanity check: count null values in Domain after applying extract_domain.
df.Domain.isnull().sum()

0

In [14]:
# Which extracted domain names contain the substring "health"?
# (Alternative probe: df[df.Domain.str.match('chictr')])
has_health = df['Domain'].str.contains("health")
df.loc[has_health, 'Domain'].unique()

array(['uofmhealth', 'sidneyhealth', 'sovhealth', 'healthcare4ppl',
       'healthgrades', 'wkhealth', 'health', 'monashhealth', 'sahealth',
       'globalhealthaction', 'longevityandhealthspan',
       'microbecolhealthdis', 'besthealthmag', 'emedicinehealth',
       'everydayhealth', 'healthable', 'healthcareitnews',
       'healthnutnews', 'healthyfoodteam', 'healthywomen',
       'healthy-holistic-living', 'naturalhealth365', 'wakehealth',
       '1millionhealthworkers', 'bulletinhealthcare',
       'animalhealthireland', 'blissfullyhealthy', 'worldlunghealth',
       'healthimpactnews', 'thehealthwell', 'samhealth', 'browardhealth',
       'dukehealth', 'orlandohealth', 'emoryhealthcare', 'unchealthcare',
       'uhealthsystem', 'tophealthnews', 'uchealth', 'myhealthtalent',
       'irishhealth', 'ucsfhealth', 'consumerhealthdigest',
       'healthaffairs', 'onlymyhealth', 'spine-health',
       'thehealthyhomeeconomist', 'created4health', 'ghanahealthservice',
       'healthcommc

In [15]:
# Load a small sample of the HTML dump to develop the parsing helpers against.
hdf = pd.read_csv('bigdata/train/html_data.csv', nrows=3) # Read just the first 3 rows
hdf

Unnamed: 0,Webpage_id,Html
0,1,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
1,2,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
2,3,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."


In [12]:
# Disabled experiment: the code is wrapped in a string literal, so nothing in it
# runs and the cell merely evaluates to that string.
''' For Testing Purpopse Only
# Read html_data.csv using pandas in chunks.
# The chunksize parameter refers to the number of rows per chunk.
row_count = 100
txtFileReader = pd.read_csv('bigdata/train/html_data.csv', chunksize=row_count, iterator=True, index_col='Webpage_id')
ddf = txtFileReader.get_chunk()
'''

" For Testing Purpopse Only\n# Read html_data.csv using pandas in chunks.\n# The chunksize parameter refers to the number of rows per chunk.\nrow_count = 100\ntxtFileReader = pd.read_csv('bigdata/train/html_data.csv', chunksize=row_count, iterator=True, index_col='Webpage_id')\nddf = txtFileReader.get_chunk()\n"

In [16]:
# Inspect the second row (position 1) of the training frame.
df.iloc[1]

Domain                                         fiercepharma
Url       http://www.fiercepharma.com/pharma/novo-equipp...
Tag                                                    news
Name: 2, dtype: object

In [18]:
# Objective : Extract text from title tag of HTML source of web-page
def extract_title(page):
    """Return the stripped text of the first ``<title>`` element in `page`.

    Parameters
    ----------
    page : str, bytes, file-like or None
        HTML markup, passed to BeautifulSoup with the 'html.parser' backend.
        ``None`` and float NaN (how pandas represents a missing cell) are
        treated as "no page".

    Returns
    -------
    str or None
        The title text with surrounding whitespace removed, or ``None`` when
        there is no page or the page has no ``<title>`` element.
    """
    # `page != page` is true only for NaN; checked inline to avoid an extra import.
    if page is None or (isinstance(page, float) and page != page):
        return None
    soup = BeautifulSoup(page, 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        return None
    return title_tag.text.strip()

# Smoke test: the first sampled page has a title, the bare document has none.
page = hdf['Html'].iloc[0]
for sample in (page, "<html></html>"):
    print(extract_title(sample))

Tecfidera, Gilenya and Aubagio's 3-way battle for MS share is about to heat up | FiercePharma
None


In [66]:
# Objective : Extract text from HTML source of web-page
def extract_body(page):
    """Return the ``<body>`` element of `page`, or `page` itself if it has none.

    Parameters
    ----------
    page : str, bytes, file-like or None
        HTML markup. ``None`` and float NaN (a missing pandas cell) are treated
        as "no page".

    Returns
    -------
    bs4 Tag, the original `page`, or None
        The parsed ``<body>`` tag when present; the unmodified input when the
        markup has no ``<body>``; ``None`` when there is no page.
        NOTE(review): the return type is heterogeneous. To obtain plain text
        from the tag a caller would presumably use ``body.get_text()`` (or
        ``str(body)`` for the markup) -- confirm before relying on it.
    """
    # `page != page` is true only for NaN.
    if page is None or (isinstance(page, float) and page != page):
        return None
    # An encoding hint is only meaningful for undecoded input; bs4 ignores it
    # (and warns) when the markup is already a str.
    parser_kwargs = {} if isinstance(page, str) else {'from_encoding': 'utf-8'}
    soup = BeautifulSoup(page, 'html.parser', **parser_kwargs)
    body_tag = soup.find('body')
    if body_tag is None:
        return page
    return body_tag

def is_visible_content(element):
    """Tell whether a parsed text node should count as visible page text.

    A node is rejected when its parent's tag name is one of the non-rendered
    containers listed below, or when the node is a bs4 `Comment`.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check comes first, exactly as before, so the isinstance test
    # is only reached for nodes under a rendered parent.
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden

def remove_extra_spaces(str):
    """Collapse each run of whitespace in `str` to one space and trim both ends.

    Note: the parameter shadows the built-in `str` inside this function; the
    name is kept so existing keyword callers continue to work.
    """
    words = str.split()
    return " ".join(words)

def extract_text(page):
    """Return the visible text of an HTML page as one whitespace-normalised string.

    Parameters
    ----------
    page : str, bytes, file-like or None
        HTML markup, parsed with BeautifulSoup's 'html.parser' backend.
        ``None`` and float NaN (a missing pandas cell) are treated as
        "no page".

    Returns
    -------
    str or None
        Text nodes accepted by `is_visible_content`, joined with single spaces,
        with every ',' and '|' removed and whitespace runs collapsed; ``None``
        when there is no page.
    """
    # `page != page` is true only for NaN.
    if page is None or (isinstance(page, float) and page != page):
        return None
    soup = BeautifulSoup(page, 'html.parser')
    # find_all(string=True) is the current spelling of findAll(text=True):
    # it yields every text node, including nested ones.
    text_nodes = soup.find_all(string=True)
    visible_nodes = filter(is_visible_content, text_nodes)
    # remove_extra_spaces already trims, so no separate strip() is needed.
    text = " ".join(remove_extra_spaces(node) for node in visible_nodes)
    # Commas and pipes are dropped from the extracted text.
    text = text.replace(',', '').replace('|', '')
    # Empty nodes leave doubled separators behind; squeeze them.
    text = re.sub(r'\s\s+', ' ', text).strip()
    # The encode/decode round trip discards code points that cannot be encoded
    # as UTF-8 (errors='ignore').
    return text.encode('utf-8', errors='ignore').decode('utf-8').strip()

# Try extract_text on the first sampled page and display the result.
# (Alternative probe: extract_text(extract_body(hdf.head(1)['Html'].values[0])))
first_page_html = hdf['Html'].iloc[0]
text = extract_text(first_page_html)
text

'Skip to main content Twitter LinkedIn Search Top Menu DDF 2017 FierceBiotech Jobs Resources Events Subscribe Main navigation Pharma M&A Regulatory Financials Corporate Legal Manufacturing M&A Outsourcing Regulatory Supply Chain Partnering Drug Safety Marketing Regulatory DTC Advertising Digital and Social Media Data and Analytics Launches Pharma Asia M&A R&D Regulatory Sales and Marketing Financials Manufacturing Animal Health R&D M&A Regulatory Veterinarian Financials Vaccines Drug Delivery R&D Regulatory Partnering Vaccines Deals Infectious Diseases R&D Regulatory Main navigation - Mobile Pharma M&A Regulatory Financials Corporate Legal Manufacturing M&A Outsourcing Regulatory Supply Chain Partnering Drug Safety Marketing Regulatory DTC Advertising Digital and Social Media Data and Analytics Launches Pharma Asia M&A R&D Regulatory Sales and Marketing Financials Manufacturing Animal Health R&D M&A Regulatory Veterinarian Financials Vaccines Drug Delivery R&D Regulatory Partnering Vac

In [48]:
# `texts` only exists as a local inside extract_text(), so rebuild the list of
# text nodes here; otherwise this cell fails with NameError on a fresh kernel.
texts = BeautifulSoup(hdf.head(1)['Html'].values[0], 'html.parser').find_all(string=True)
print(type(texts[0]))  # expected: bs4.element.Doctype or bs4.element.NavigableString
texts[94]  # arbitrary probe; raises IndexError if the page has fewer text nodes

2

In [67]:
# OBJECTIVE: Stream html_data.csv in chunks, derive Title and Html2Text for every
# page, and append both columns to `webpage_table` in the database.
# NOTE: chunks whose first Webpage_id is below 75000 are skipped and rows are
# appended, so enabling this block as-is only processes the tail of the file.

if False:  # Disabled: the HTML has already been loaded, parsed and persisted in the DB
    html_chunks = pd.read_csv('bigdata/train/html_data.csv',
                              index_col='Webpage_id',
                              iterator=True,
                              chunksize=1000,
                              encoding='utf-8')
    for ddf in html_chunks:
        start = ddf.head(1).index.values[0]
        end = ddf.tail(1).index.values[0]
        if start >= 75000:
            print("Processing records from {0} to {1}".format(start, end))
            ddf['Title'] = ddf['Html'].apply(extract_title)
            ddf['Html2Text'] = ddf['Html'].apply(extract_text)
            # Written to the DB chunk by chunk to save time/memory.
            # (Debug alternative: ddf[['Title','Html2Text']].to_csv('bigdata/debugging.csv', encoding='utf-8'))
            ddf[['Title', 'Html2Text']].to_sql('webpage_table', db_engine, if_exists='append')

Processing records from 75001 to 76000
Processing records from 76001 to 77000
Processing records from 77001 to 78000
Processing records from 78001 to 79000
Processing records from 79001 to 79345


In [68]:
# OBJECTIVE: Sanity-check that the rows derived from html_data.csv were persisted,
# by printing the first and last few records of `webpage_table`.
# Maintenance snippets:
#   db_engine.execute('DROP TABLE IF EXISTS webpage_table')
#   select count(*) from webpage_table

# Imported under an alias because `text` is already used as a variable name in
# this notebook.
from sqlalchemy import text as sql_text

sample_query = sql_text('select * from webpage_table where Webpage_id<=5 or Webpage_id>79340')
# The connection context manager releases the connection (and its result) even
# if printing a row raises; Engine.execute() is avoided because SQLAlchemy 2.0
# no longer provides it.
with db_engine.connect() as connection:
    result = connection.execute(sample_query)
    for row in result:
        print('row: ', row)
# Q: Data pre-processing: how do we deal with titles in languages other than English?

row:  (1, "Tecfidera, Gilenya and Aubagio's 3-way battle for MS share is about to heat up | FiercePharma", 'Skip to main content Twitter LinkedIn Search Top Menu DDF 2017 FierceBiotech Jobs Resources Events Subscribe Main navigation Pharma M&A Regulatory Fi ... (7455 characters truncated) ... y About Us Contact © 2017 Questex LLC. All rights reserved. 275 Grove Street Suite 2-130 Newton MA 02466 Reproduction in whole or part is prohibited.')
row:  (2, 'Novo equipped to weather the storm in the U.S. diabetes market, CEO says | FiercePharma', 'Skip to main content Twitter LinkedIn Search Top Menu DDF 2017 FierceBiotech Jobs Resources Events Subscribe Main navigation Pharma M&A Regulatory Fi ... (5559 characters truncated) ... y About Us Contact © 2017 Questex LLC. All rights reserved. 275 Grove Street Suite 2-130 Newton MA 02466 Reproduction in whole or part is prohibited.')
row:  (3, "Another exec departs troubled Endo--and this time, it's for another drugmaker | FiercePharma", 'Skip to

In [69]:
# Read the processed data (Title, Html2Text) back from the DB into a DataFrame.
# This load used to sit behind a hard-coded `if(False)`, which left `pdf`
# undefined for the merge with `df` further down on a fresh kernel.
# NOTE(review): the table is large; expect this cell to take a while.
pdf = pd.read_sql_table('webpage_table', db_engine, index_col='Webpage_id')
print('Shape : ', pdf.shape)
print('Head :\n', pdf.head())

Shape :  (79345, 2)
Head :
                                                         Title  \
Webpage_id                                                      
1           Tecfidera, Gilenya and Aubagio's 3-way battle ...   
2           Novo equipped to weather the storm in the U.S....   
3           Another exec departs troubled Endo--and this t...   
4           Would Teva buy Korea's Celltrion to beef up in...   
5           Restasis-maker Allergan recruits actress Maris...   

                                                    Html2Text  
Webpage_id                                                     
1           Skip to main content Twitter LinkedIn Search T...  
2           Skip to main content Twitter LinkedIn Search T...  
3           Skip to main content Twitter LinkedIn Search T...  
4           Skip to main content Twitter LinkedIn Search T...  
5           Skip to main content Twitter LinkedIn Search T...  


In [70]:
# Preview the training frame; Domain now holds the values produced by extract_domain.
df.head()

Unnamed: 0_level_0,Domain,Url,Tag
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,fiercepharma,http://www.fiercepharma.com/marketing/tecfider...,news
2,fiercepharma,http://www.fiercepharma.com/pharma/novo-equipp...,news
3,fiercepharma,http://www.fiercepharma.com/pharma/another-exe...,news
4,fiercepharma,http://www.fiercepharma.com/pharma/teva-buy-bi...,news
5,fiercepharma,http://www.fiercepharma.com/marketing/actress-...,news


In [71]:
# Join the training rows with their parsed Title / Html2Text on Webpage_id
# (default inner join, so only ids present in both frames are kept).
ndf = df.merge(pdf, on='Webpage_id')
print('Shape of new DF :', ndf.shape)
ndf.head()

Shape of new DF : (53447, 5)


Unnamed: 0_level_0,Domain,Url,Tag,Title,Html2Text
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,fiercepharma,http://www.fiercepharma.com/marketing/tecfider...,news,"Tecfidera, Gilenya and Aubagio's 3-way battle ...",Skip to main content Twitter LinkedIn Search T...
2,fiercepharma,http://www.fiercepharma.com/pharma/novo-equipp...,news,Novo equipped to weather the storm in the U.S....,Skip to main content Twitter LinkedIn Search T...
3,fiercepharma,http://www.fiercepharma.com/pharma/another-exe...,news,Another exec departs troubled Endo--and this t...,Skip to main content Twitter LinkedIn Search T...
4,fiercepharma,http://www.fiercepharma.com/pharma/teva-buy-bi...,news,Would Teva buy Korea's Celltrion to beef up in...,Skip to main content Twitter LinkedIn Search T...
5,fiercepharma,http://www.fiercepharma.com/marketing/actress-...,news,Restasis-maker Allergan recruits actress Maris...,Skip to main content Twitter LinkedIn Search T...


In [72]:
# Reorder the merged frame so the target (Tag) comes last, then persist it,
# including the new Title and Html2Text columns, as a CSV file.
column_order = ['Domain', 'Url', 'Title', 'Html2Text', 'Tag']
ndf = ndf[column_order]
ndf.to_csv('bigdata/train1.csv', encoding='utf-8')
ndf.head(5)

Unnamed: 0_level_0,Domain,Url,Title,Html2Text,Tag
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,fiercepharma,http://www.fiercepharma.com/marketing/tecfider...,"Tecfidera, Gilenya and Aubagio's 3-way battle ...",Skip to main content Twitter LinkedIn Search T...,news
2,fiercepharma,http://www.fiercepharma.com/pharma/novo-equipp...,Novo equipped to weather the storm in the U.S....,Skip to main content Twitter LinkedIn Search T...,news
3,fiercepharma,http://www.fiercepharma.com/pharma/another-exe...,Another exec departs troubled Endo--and this t...,Skip to main content Twitter LinkedIn Search T...,news
4,fiercepharma,http://www.fiercepharma.com/pharma/teva-buy-bi...,Would Teva buy Korea's Celltrion to beef up in...,Skip to main content Twitter LinkedIn Search T...,news
5,fiercepharma,http://www.fiercepharma.com/marketing/actress-...,Restasis-maker Allergan recruits actress Maris...,Skip to main content Twitter LinkedIn Search T...,news


In [73]:
ndf.isnull().sum() # Sanity check for nulls in the merged frame: 99 rows have no Title.

Domain        0
Url           0
Title        99
Html2Text     0
Tag           0
dtype: int64

In [74]:
# Webpage ids of the rows whose Title could not be extracted.
title_missing = ndf['Title'].isnull()
null_title_ids = ndf.index[title_missing].values
print(null_title_ids)

[10606 10630 10637 10641 10642 10644 10647 10651 10711 10712 10747 10881
 10941 13126 13364 17847 22180 23707 28277 28278 28827 28828 28829 28830
 28831 28832 28833 28871 28886 28890 29405 30412 30822 31165 31515 31519
 40057 40849 52767 52780 52781 54462 54463 56238 56239 57686 57688 57692
 57693 57696 57697 57703 65809 66069 66123 66139 66170 67007 70246 70248
 70263 70264 70266 70267 70270 70832 73889 73946 73952 73955 73991 74030
 74048 74070 74201 74212 74222 74229 74317 74340 74375 74382 74388 74402
 74472 74485 74572 74577 74610 74631 74718 74869 74938 74978 77662 77757
 77867 77912 77993]


In [81]:
# Look at the URLs of the title-less rows.
# Observation: these URLs return 404, "access denied", or a PDF document.
null_titles = ndf.loc[ndf['Title'].isnull()]
null_titles[['Url', 'Title']]

Unnamed: 0_level_0,Url,Title
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10606,http://journals.sagepub.com/doi/pdf/10.1177/20...,
10630,http://www.health.govt.nz/system/files/documen...,
10637,http://www.health.govt.nz/system/files/documen...,
10641,http://www.cieh.org/assets/0/72/1126/1212/1216...,
10642,https://www.health.govt.nz/system/files/docume...,
10644,http://www.health.govt.nz/system/files/documen...,
10647,https://www.karger.com/Article/Pdf/367759,
10651,https://link.springer.com/content/pdf/10.1007%...,
10711,http://files.abstractsonline.com/SUPT/101/4292...,
10712,http://www.practicalradonc.org/article/S1879-8...,


In [82]:
# Replace missing titles with empty strings so the column can be vectorised later.
# TODO: reconsider dropping these rows instead -- a page that does not exist
# probably adds no signal.
# assign() builds a new frame rather than calling fillna(inplace=True) on the
# `ndf.Title` accessor, which may operate on a temporary copy and leave `ndf`
# unchanged (e.g. under pandas Copy-on-Write).
ndf = ndf.assign(Title=ndf['Title'].fillna(''))
ndf.isnull().sum()

Domain       0
Url          0
Title        0
Html2Text    0
Tag          0
dtype: int64

## Data Modeling

In [29]:
from sklearn.model_selection import train_test_split, ShuffleSplit
# Separate target and features without mutating `ndf`: pop() removed the column
# in place, so re-running the cell raised KeyError, and `x` was just another
# name for `ndf`.
y = ndf['Tag']
x = ndf.drop(columns=['Tag'])

In [30]:
# Url is not used as a feature. drop(..., errors='ignore') keeps the cell
# re-runnable, whereas pop() raised KeyError on the second run.
x = x.drop(columns=['Url'], errors='ignore')
# 70/30 split, stratified on the target so class proportions are preserved;
# the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, shuffle=True, stratify=y
)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
# One TF-IDF vocabulary per text column, each fitted on the training split only.
vectorizer_domain = TfidfVectorizer()
vectors_domain = vectorizer_domain.fit_transform(x_train['Domain'])
print(vectors_domain.shape)

vectorizer_title = TfidfVectorizer()
vectors_title = vectorizer_title.fit_transform(x_train['Title'])
print(vectors_title.shape)

(37412, 3572)
(37412, 33830)


In [32]:
# Peek at the training features that remain after Tag and Url were removed.
x_train.head()

Unnamed: 0_level_0,Domain,Title
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1
49734,ca.gsk.com,GSK to discontinue manufacture and sale of the...
62253,www.msd.com,The Impact of Cancer
70502,www.detc.uk,News Archive - The Digital Engineering & Test ...
75620,www.health.com,Why Does Sugar Make You Thirsty? - Health
78138,www.urmc.rochester.edu,"Jefferson S. Svengsouk, M.D., M.B.A. - Univer..."


In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Project the held-out split into the vocabularies fitted on the training split
# (transform only, no refitting).
vectors_domain_test = vectorizer_domain.transform(x_test['Domain'])
vectors_title_test = vectorizer_title.transform(x_test['Title'])

# Multinomial naive Bayes with light additive smoothing.
clf = MultinomialNB(alpha=0.01)

In [34]:
# Train on the Domain features, predict the held-out split, and report macro F1.
y_pred = clf.fit(vectors_domain, y_train).predict(vectors_domain_test)

metrics.f1_score(y_test, y_pred, average='macro')

0.901020902082288

## Data Modeling : Predicting Test Data Classification

In [35]:
# Load the unlabelled test rows, indexed by Webpage_id.
testdf = pd.read_csv('bigdata/test.csv', index_col='Webpage_id')
# Url is kept here; to drop it: testdf.drop(columns=['Url'], inplace=True)
print(testdf.shape)
testdf.head()

(25787, 2)


Unnamed: 0_level_0,Domain,Url
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1
31,isrctn.com,http://www.isrctn.com/ISRCTN57801413
32,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...
33,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...
34,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...
35,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...


In [36]:
# The training Domain column was normalised with extract_domain before the
# vectoriser was fitted, while test.csv still holds raw host names
# (e.g. 'www.clinicaltrialsregister.eu'). Apply the same normalisation here so
# both sides are tokenised from the same kind of string.
test_domains = testdf.Domain.apply(extract_domain)
vectors_domain_testdf = vectorizer_domain.transform(test_domains)
y_pred_testdf = clf.predict(vectors_domain_testdf)
y_pred_testdf

array(['clinicalTrials', 'thesis', 'thesis', ..., 'forum', 'forum',
       'forum'], dtype='<U14')

In [37]:
# Assemble the Domain-based predictions into a frame keyed by Webpage_id and
# write it out as a CSV file.
data = dict(Webpage_id=testdf.index.values, Tag=y_pred_testdf)
testdf_pred = pd.DataFrame(data).set_index('Webpage_id')
testdf_pred.to_csv('bigdata/test_pred_1.csv')

In [38]:
# ndf = ndf.reset_index()
# indices = testdf.index.values
# ndf.loc[ndf.Webpage_id.isin(indices)]

In [39]:
# Add empty 'Title' and 'Html' placeholder columns to testdf; the intent is to
# fill them later from the page behind each row's Url.
for placeholder in ('Title', 'Html'):
    testdf[placeholder] = ''
# Single cells can be set with e.g. testdf.at[31, 'Title'] = ''
print(testdf.shape)
testdf.head(3)

(25787, 4)


Unnamed: 0_level_0,Domain,Url,Title,Html
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31,isrctn.com,http://www.isrctn.com/ISRCTN57801413,,
32,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...,,
33,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...,,


In [40]:
testdf.to_sql('testdf',db_engine, if_exists='replace') # Persist testdf to the DB table 'testdf', replacing any existing table, to save time/memory

In [None]:
# OBJECTIVE: Parse each test web page to extract its title (currently disabled: the code below is wrapped in a string literal and does not run)
'''
import requests
from bs4 import BeautifulSoup

# urls = testdf.Url.values
# print(urls.size) # 25787 tallies with test dataset
# for idx,row in testdf.iterrows():
#     print(idx,'\n',row)
#     break

for idx,row in testdf.iterrows():
#     page = requests.get(row.Url)
    title = extract_title(row['Html'])
    print(title)
    if (title != None): 
        testdf.at[idx,'Title'] = title
print('Done computing titles..')  
'''

In [None]:
# Preview testdf, including the Title and Html placeholder columns.
testdf.head()

In [None]:
import warnings

# Train a separate classifier on the Title features. Refitting the shared `clf`
# would silently discard the Domain model fitted earlier, so any cell re-run
# afterwards would predict with the wrong model.
clf_title = MultinomialNB(alpha=.01)
clf_title.fit(vectors_title, y_train)

# Titles that are all empty vectorise to all-zero rows, so every prediction
# would fall back to the class prior; make that visible instead of writing a
# meaningless file without notice.
if not testdf.Title.astype(bool).any():
    warnings.warn("testdf.Title is empty for every row; title-based "
                  "predictions will not be informative.")

vectors_title_testdf = vectorizer_title.transform(testdf.Title)
y_pred_testdf = clf_title.predict(vectors_title_testdf)
data = {
    'Webpage_id': testdf.index.values,
    'Tag': y_pred_testdf
}
testdf_pred = pd.DataFrame(data).set_index('Webpage_id')
testdf_pred.to_csv('bigdata/test_pred_2.csv')