# Data Preprocessing

In [91]:
%matplotlib inline
import numpy as np
import pandas as pd
import dask.dataframe as dd
from bs4 import BeautifulSoup # For Scraping HTML page

In [126]:
from sqlalchemy import create_engine # sqlalchemy = ORM library
from sqlalchemy_utils.functions import database_exists, drop_database
import os
cwd = os.getcwd() # current working  directory

# Create an engine that stores data in the local directory's
# sqlalchemy_example.db file.
db_engine = create_engine('sqlite:///avdb.db')
# db_path = os.path.realpath( os.path.dirname(str(engine.url)) )
url = db_engine.url
print(url)
print(database_exists(url))
# drop_database(url)
# print(database_exists(url))

sqlite:///avdb.db
False


In [3]:
train_data = 'bigdata/train.csv'
html_data = 'bigdata/train/html_data.csv'

In [4]:
df = pd.read_csv(train_data, index_col='Webpage_id')
df.head()

Unnamed: 0_level_0,Domain,Url,Tag
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,www.fiercepharma.com,http://www.fiercepharma.com/marketing/tecfider...,news
2,www.fiercepharma.com,http://www.fiercepharma.com/pharma/novo-equipp...,news
3,www.fiercepharma.com,http://www.fiercepharma.com/pharma/another-exe...,news
4,www.fiercepharma.com,http://www.fiercepharma.com/pharma/teva-buy-bi...,news
5,www.fiercepharma.com,http://www.fiercepharma.com/marketing/actress-...,news


In [5]:
df.isnull().sum() # No null values, for good..Sanity check ok!

Domain    0
Url       0
Tag       0
dtype: int64

In [6]:
# df.drop(columns=['Url'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53447 entries, 1 to 79345
Data columns (total 3 columns):
Domain    53447 non-null object
Url       53447 non-null object
Tag       53447 non-null object
dtypes: object(3)
memory usage: 1.6+ MB


In [7]:
# Convert columns to CATEGORY type to reduce memory footprint
df.Domain =  df.Domain.astype('category')
df.Tag = df.Tag.astype('category')
df.info() # Size reduced from 1.6+ to 1.2+

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53447 entries, 1 to 79345
Data columns (total 3 columns):
Domain    53447 non-null category
Url       53447 non-null object
Tag       53447 non-null category
dtypes: category(2), object(1)
memory usage: 1.2+ MB


In [25]:
# What are the different Target Classes, we have?

print(sorted(df.Tag.unique()))
# ['clinicalTrials', 'conferences', 'forum', 'guidelines', 'news', 'others', 'profile', 'publication', 'thesis']

df.Tag.value_counts()

['clinicalTrials', 'conferences', 'forum', 'guidelines', 'news', 'others', 'profile', 'publication', 'thesis']


others            17417
news               7992
publication        7705
profile            5196
conferences        4666
forum              4503
clinicalTrials     2839
thesis             1800
guidelines         1329
Name: Tag, dtype: int64

In [9]:
df.Domain.value_counts()

thesis.library.caltech.edu                301
ecommons.cornell.edu                      300
dspace.mit.edu                            300
curate.nd.edu                             300
academiccommons.columbia.edu              300
www.dart-europe.eu                        300
www.nice.org.uk                           230
www.ncbi.nlm.nih.gov                      226
www.australiancancertrials.gov.au         209
ctri.nic.in                               209
rctportal.niph.go.jp                      206
slctr.lk                                  203
en.search.irct.ir                         201
www.anzctr.org.au                         200
clinicaltrials.gov                        200
www.isrctn.com                            200
www.medbox.org                            200
upload.umin.ac.jp                         200
cris.nih.go.kr                            200
www.chictr.org.cn                         200
www.clinicalguidelines.gov.au             200
www.trialregister.nl              

In [10]:
# df[df.Domain.str.match('chictr')]
df[df.Domain.str.contains("health")]['Domain'].unique()

[www.uofmhealth.org, www.sidneyhealth.org, www.sovhealth.com, www.healthcare4ppl.com, www.healthgrades.com, ..., jobs.sidneyhealth.org, learnabouthealthnow.blogspot.com.cy, margaret.healthblogs.org, healthcare.utah.edu, findadoc.ariahealth.org]
Length: 112
Categories (112, object): [www.uofmhealth.org, www.sidneyhealth.org, www.sovhealth.com, www.healthcare4ppl.com, ..., learnabouthealthnow.blogspot.com.cy, margaret.healthblogs.org, healthcare.utah.edu, findadoc.ariahealth.org]

In [11]:
# Sample data from HTML DataFrame
hdf = pd.read_csv('bigdata/train/html_data.csv', nrows=10) # Read just 10 rows
hdf

Unnamed: 0,Webpage_id,Html
0,1,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
1,2,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
2,3,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
3,4,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
4,5,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
5,6,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
6,7,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
7,8,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
8,9,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
9,10,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."


In [69]:
# Read html_data.csv using pandas in chunks.
# The chunksize parameter refers to the number of rows per chunk.
row_count = 100
txtFileReader = pd.read_csv('bigdata/train/html_data.csv', chunksize=row_count, iterator=True, index_col='Webpage_id')
ddf = txtFileReader.get_chunk()

Unnamed: 0_level_0,Html
Webpage_id,Unnamed: 1_level_1
1,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
2,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
3,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
4,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
5,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
6,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
7,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
8,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
9,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."
10,"<!DOCTYPE html>\n<html lang=""en"" dir=""ltr"" xml..."


In [44]:
# import csv
# hdf = dd.read_csv(html_data, dtype='str')
# hdf = dd.read_csv(html_data, quoting=csv.QUOTE_NONE, encoding='utf-8', dtype='str')
# hdf = dd.read_csv(html_data, engine='python', encoding='utf-8')

# hdf = dd.from_pandas(pd.read_csv(html_data), chunksize=10)
# hdf.set_index(hdf.Webpage_id)
page = hdf.head(1)['Html'].values[0]

In [41]:
df.iloc[1]

Domain                                 www.fiercepharma.com
Url       http://www.fiercepharma.com/pharma/novo-equipp...
Tag                                                    news
Name: 2, dtype: object

In [89]:
# Objective : Extract text frfom title tag of HTML source of web-page
def extract_title(page):
    # parse the html using beautiful soup and store in variable `soup`
    if (page == None): 
        return None
    soup = BeautifulSoup(page, 'html.parser')
    title_tag = soup.find('title')
    if (title_tag == None):
        title = None
    else:
        title = title_tag.text.strip()
    return title

# Test method definition
print(extract_title(page))
print(extract_title("<html></html>"))

Tecfidera, Gilenya and Aubagio's 3-way battle for MS share is about to heat up | FiercePharma
None


In [136]:
# OBJECTIVE: Read html_data.csv in chunks, extract title from html-page, and add the title to train.csv as new column
def transform_row(row):
    return extract_title(row.Html)

if(False): # Set False because the HTML is already loaded,parsed and persisted in DB for convenience
    for ddf in pd.read_csv('bigdata/train/html_data.csv', index_col='Webpage_id', iterator=True , chunksize=1000):
        ddf['Title'] = ddf.apply(transform_row, axis=1)
        ddf[['Title']].to_sql('webpage_table',db_engine, if_exists='append') #Write to DB to save time/memory
#         df.merge(ddf[['Title']], how='inner', left_index=True, right_index=True)
#         df['Title'] = ddf.apply(transform_row, axis=1)

print('-'*120)
df.head()    

------------------------------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,Domain,Url,Tag
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,www.fiercepharma.com,http://www.fiercepharma.com/marketing/tecfider...,news
2,www.fiercepharma.com,http://www.fiercepharma.com/pharma/novo-equipp...,news
3,www.fiercepharma.com,http://www.fiercepharma.com/pharma/another-exe...,news
4,www.fiercepharma.com,http://www.fiercepharma.com/pharma/teva-buy-bi...,news
5,www.fiercepharma.com,http://www.fiercepharma.com/marketing/actress-...,news


In [128]:
ddf[['Title']].head() # Will only have the data from last loaded dataframe

Unnamed: 0_level_0,Title
Webpage_id,Unnamed: 1_level_1
79001,Matthew Maxwell | Department of Anesthesia
79002,Rashmi Mueller | Department of Anesthesia
79003,Magboul Magboul | Department of Anesthesia
79004,Anil Marian | Department of Anesthesia
79005,Ngoc Tran Nguyen | Department of Anesthesia


In [95]:
tmp = ddf[['Title']]
tmp.shape

(345, 1)

In [145]:
# OBJECTIVE: Sanity check of the data persistence of the data that was transformed from html_data.csv

# ddf[['Title']].to_sql('webpage_table',db_engine, if_exists='append')
# db_engine.execute('DROP TABLE IF EXISTS webpage_table')
# result = db_engine.execute('select count(*) from webpage_table')
result = db_engine.execute('select * from webpage_table where Webpage_id<=50 or Webpage_id>79340')
for row in result:
    print('row: ', row)
result.close()

row:  (1, "Tecfidera, Gilenya and Aubagio's 3-way battle for MS share is about to heat up | FiercePharma")
row:  (2, 'Novo equipped to weather the storm in the U.S. diabetes market, CEO says | FiercePharma')
row:  (3, "Another exec departs troubled Endo--and this time, it's for another drugmaker | FiercePharma")
row:  (4, "Would Teva buy Korea's Celltrion to beef up in biosimilars? It wouldn't say no | FiercePharma")
row:  (5, 'Restasis-maker Allergan recruits actress Marisa Tomei to drive dry eye awareness | FiercePharma')
row:  (6, "NICE backs 'less-effective' Otezla to give psoriatic arthritis patients a pill option | FiercePharma")
row:  (7, "Mylan takes on punching-bag role again as Clinton aims to shame 'bad actors' | FiercePharma")
row:  (8, "Failed Eylea combo trial throws a wrench in Regeneron's expansion effort | FiercePharma")
row:  (9, 'Should GSK investors be disgruntled about its insider CEO pick? If they want big changes, yes | FiercePharma')
row:  (10, 'A silver lining 

In [148]:
# Read Procecssed Data into DataFrame
pdf = pd.read_sql_table('webpage_table', db_engine, index_col='Webpage_id')
pdf.shape

(79345, 1)

In [150]:
pdf.head()

Unnamed: 0_level_0,Title
Webpage_id,Unnamed: 1_level_1
1,"Tecfidera, Gilenya and Aubagio's 3-way battle ..."
2,Novo equipped to weather the storm in the U.S....
3,Another exec departs troubled Endo--and this t...
4,Would Teva buy Korea's Celltrion to beef up in...
5,Restasis-maker Allergan recruits actress Maris...


In [152]:
df.head()

Unnamed: 0_level_0,Domain,Url,Tag
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,www.fiercepharma.com,http://www.fiercepharma.com/marketing/tecfider...,news
2,www.fiercepharma.com,http://www.fiercepharma.com/pharma/novo-equipp...,news
3,www.fiercepharma.com,http://www.fiercepharma.com/pharma/another-exe...,news
4,www.fiercepharma.com,http://www.fiercepharma.com/pharma/teva-buy-bi...,news
5,www.fiercepharma.com,http://www.fiercepharma.com/marketing/actress-...,news


In [190]:
# OBJECTIVE : Merge the Title column of pdf dataframe into the df dataframe

# A little code play for trying it out below:
raw_data = {
    'subject_id': [11, 12, 13, 14, 15],
    'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'], 
    'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']
}
df_a = pd.DataFrame(raw_data, columns = ['subject_id','first_name', 'last_name'])
df_a.set_index('subject_id', inplace=True)
df_a

raw_data = {
        'subject_id': [11, 12, 13, 14, 15, 17, 18, 19, 20, 21],
        'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]
}
df_b = pd.DataFrame(raw_data, columns = ['subject_id','test_id'])
df_b = df_b.set_index('subject_id')
df_b

pd.merge(df_a,df_b,on='subject_id')
# df_a.merge(df_b, on='subject_id')
# df_a.merge(df_b,how='inner', left_index=True, right_index=True)
# df_a.merge(df_b,how='left', left_index=True, right_index=True)
# df_a.merge(df_b,how='right', left_index=True, right_index=True)
# df_a.merge(df_b,how='outer', left_index=True, right_index=True)
# pd.concat([df_a,df_b], axis=1)

Unnamed: 0_level_0,first_name,last_name,test_id
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,Alex,Anderson,51
12,Amy,Ackerman,15
13,Allen,Ali,15
14,Alice,Aoni,61
15,Ayoung,Atiches,16


In [193]:
# new df
ndf = pd.merge(df,pdf,on='Webpage_id')
print('Shape of new DF :', ndf.shape)
ndf.head()

Shape of new DF : (53447, 4)


Unnamed: 0_level_0,Domain,Url,Tag,Title
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,www.fiercepharma.com,http://www.fiercepharma.com/marketing/tecfider...,news,"Tecfidera, Gilenya and Aubagio's 3-way battle ..."
2,www.fiercepharma.com,http://www.fiercepharma.com/pharma/novo-equipp...,news,Novo equipped to weather the storm in the U.S....
3,www.fiercepharma.com,http://www.fiercepharma.com/pharma/another-exe...,news,Another exec departs troubled Endo--and this t...
4,www.fiercepharma.com,http://www.fiercepharma.com/pharma/teva-buy-bi...,news,Would Teva buy Korea's Celltrion to beef up in...
5,www.fiercepharma.com,http://www.fiercepharma.com/marketing/actress-...,news,Restasis-maker Allergan recruits actress Maris...


In [227]:
# Persist the Merged DataFrame (having additional Title column) to CSV
ndf = ndf[['Domain', 'Url', 'Title', 'Tag']]
ndf.to_csv('bigdata/train1.csv')
ndf

Unnamed: 0_level_0,Domain,Url,Title,Tag
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,www.fiercepharma.com,http://www.fiercepharma.com/marketing/tecfider...,"Tecfidera, Gilenya and Aubagio's 3-way battle ...",news
2,www.fiercepharma.com,http://www.fiercepharma.com/pharma/novo-equipp...,Novo equipped to weather the storm in the U.S....,news
3,www.fiercepharma.com,http://www.fiercepharma.com/pharma/another-exe...,Another exec departs troubled Endo--and this t...,news
4,www.fiercepharma.com,http://www.fiercepharma.com/pharma/teva-buy-bi...,Would Teva buy Korea's Celltrion to beef up in...,news
5,www.fiercepharma.com,http://www.fiercepharma.com/marketing/actress-...,Restasis-maker Allergan recruits actress Maris...,news
6,www.fiercepharma.com,http://www.fiercepharma.com/pharma/celgene-s-o...,NICE backs 'less-effective' Otezla to give pso...,news
7,www.fiercepharma.com,http://www.fiercepharma.com/pharma/mylan-takes...,Mylan takes on punching-bag role again as Clin...,news
8,www.fiercepharma.com,http://www.fiercepharma.com/marketing/failed-e...,Failed Eylea combo trial throws a wrench in Re...,news
9,www.fiercepharma.com,http://www.fiercepharma.com/pharma/should-gsk-...,Should GSK investors be disgruntled about its ...,news
10,www.fiercepharma.com,http://www.fiercepharma.com/pharma/drug-pricin...,A silver lining to the drug-pricing uproar? Fo...,news


In [197]:
ndf.isnull().sum() # Sanity check for null existence and we got  99 null value Titles.

Domain     0
Url        0
Title     99
Tag        0
dtype: int64

In [212]:

null_title_ids = ndf[ndf.Title.isnull()].index.values
print(null_title_ids)

[10606 10630 10637 10641 10642 10644 10647 10651 10711 10712 10747 10881
 10941 13126 13364 17847 22180 23707 28277 28278 28827 28828 28829 28830
 28831 28832 28833 28871 28886 28890 29405 30412 30822 31165 31515 31519
 40057 40849 52767 52780 52781 54462 54463 56238 56239 57686 57688 57692
 57693 57696 57697 57703 65809 66069 66123 66139 66170 67007 70246 70248
 70263 70264 70266 70267 70270 70832 73889 73946 73952 73955 73991 74030
 74048 74070 74201 74212 74222 74229 74317 74340 74375 74382 74388 74402
 74472 74485 74572 74577 74610 74631 74718 74869 74938 74978 77662 77757
 77867 77912 77993]


In [218]:
# Do we have  these Webpage_Ids in html_data.csv??? No, unfortunately.
null_title_htmls = hdf[hdf['Webpage_id'].isin(null_title_ids)]
null_title_htmls # Empty list. That implies 99 rows in test data without Titles have no corresponding rows in html_data.csv

Unnamed: 0,Webpage_id,Html


In [226]:
null_titles = ndf[ndf.Title.isnull()]
null_titles['Url']
# Observation: URLs return 404 or access_dednied or a PDF doc

Webpage_id
10606    http://journals.sagepub.com/doi/pdf/10.1177/20...
10630    http://www.health.govt.nz/system/files/documen...
10637    http://www.health.govt.nz/system/files/documen...
10641    http://www.cieh.org/assets/0/72/1126/1212/1216...
10642    https://www.health.govt.nz/system/files/docume...
10644    http://www.health.govt.nz/system/files/documen...
10647            https://www.karger.com/Article/Pdf/367759
10651    https://link.springer.com/content/pdf/10.1007%...
10711    http://files.abstractsonline.com/SUPT/101/4292...
10712    http://www.practicalradonc.org/article/S1879-8...
10747    http://www.europsy.net/wp-content/uploads/2016...
10881    https://www.ics.org/Publications/ICI_4/files-b...
10941    http://deutscher-schmerzkongress2014.de/wp-con...
13126    https://www.nice.org.uk/guidance/ng18/resource...
13364    https://derbyhospitals-nhs.archive.knowledgear...
17847    http://static.smallworldlabs.com/molmedcommuni...
22180    http://www.kardiologiapolska.pl/en/d

In [249]:
from sklearn.model_selection import train_test_split, ShuffleSplit
y = ndf.pop('Tag')
x = ndf

In [251]:
# How to cleanse or fillna the Title column with None values???
# Replace None in Title column with Empty strings. Later this helps in vectorizing this column
# Change approach? I think these rows should have been purged/deleted. Because non-existant page add no business value, right?
x.Title.fillna('',inplace=True)
x.isnull().sum()

Domain    0
Title     0
dtype: int64

In [253]:
x.pop('Url')
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=42,shuffle=True,stratify=y)

In [254]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_domain = TfidfVectorizer()
vectorizer_title = TfidfVectorizer()

vectors_domain = vectorizer_domain.fit_transform(x_train.Domain)
print(vectors_domain.shape)

vectors_title = vectorizer_title.fit_transform(x_train.Title)
print(vectors_title.shape)

(37412, 3572)
(37412, 33830)


In [255]:
x_train.head()

Unnamed: 0_level_0,Domain,Title
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1
49734,ca.gsk.com,GSK to discontinue manufacture and sale of the...
62253,www.msd.com,The Impact of Cancer
70502,www.detc.uk,News Archive - The Digital Engineering & Test ...
75620,www.health.com,Why Does Sugar Make You Thirsty? - Health
78138,www.urmc.rochester.edu,"Jefferson S. Svengsouk, M.D., M.B.A. - Univer..."


In [256]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

vectors_domain_test = vectorizer_domain.transform(x_test.Domain)
vectors_title_test = vectorizer_title.transform(x_test.Title)

clf = MultinomialNB(alpha=.01)

In [257]:
clf.fit(vectors_domain, y_train)

y_pred = clf.predict(vectors_domain_test)

metrics.f1_score(y_test, y_pred, average='macro')

0.901020902082288

## Predicting Test Data Classification

In [281]:
testdf = pd.read_csv('bigdata/test.csv', index_col='Webpage_id')
# testdf.drop(columns=['Url'], inplace=True)
print(testdf.shape)
testdf.head()

(25787, 2)


Unnamed: 0_level_0,Domain,Url
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1
31,isrctn.com,http://www.isrctn.com/ISRCTN57801413
32,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...
33,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...
34,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...
35,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...


In [260]:
vectors_domain_testdf = vectorizer_domain.transform(testdf.Domain)
y_pred_testdf = clf.predict(vectors_domain_testdf)
y_pred_testdf

array(['clinicalTrials', 'thesis', 'thesis', ..., 'forum', 'forum',
       'forum'], dtype='<U14')

In [264]:
data = {
    'Webpage_id': testdf.index.values,
    'Tag': y_pred_testdf
}
testdf_pred = pd.DataFrame(data)
testdf_pred.set_index('Webpage_id', inplace=True)
testdf_pred.to_csv('bigdata/test_pred_1.csv')

In [279]:
ndf = ndf.reset_index()
indices = testdf.index.values
ndf.loc[ndf.Webpage_id.isin(indices)]

Unnamed: 0,index,Webpage_id,Domain,Title


In [300]:
testdf['Title'] = ''
# testdf = testdf.reset_index()
# testdf = testdf.set_index('Webpage_id')
# testdf
# testdf.loc[testdf.Webpage_id == 31] #['Title'] = 'test'
# testdf.at[31,'Title'] = ''
testdf.head(3)

Unnamed: 0_level_0,Domain,Url,Title
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31,isrctn.com,http://www.isrctn.com/ISRCTN57801413,
32,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...,
33,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...,


In [302]:
import requests
from bs4 import BeautifulSoup

# urls = testdf.Url.values
# print(urls.size) # 25787 tallies with test dataset
# for idx,row in testdf.iterrows():
#     print(idx,'\n',row)
#     break

for idx,row in testdf.iterrows():
    page = requests.get(row.Url)
    title = extract_title(page.text)
    print(title)
    if (title != None): 
        testdf.at[idx,'Title'] = title
print('Done computing titles..')        

ISRCTN - ISRCTN57801413: Development and validation of a caregiver quality of life questionnaire for wheezy preschool children
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Kofam |
Kofam |
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Kofam |
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Kofam | Studienregister SNCTP
ISRCTN - ISRCTN68055393: Comparison of transnasal oesophagoscopy versus standard care for patients presenting with throat symptoms
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
ISRCTN - ISRCTN16006202: Intensified household contact tracing, prevention and treatment support versus enhanced standard of care for inde

GSK drops a pair of late-stage candidates in COPD, HIV | FierceBiotech
IPOs: Ra Pharma raises $92M, as Pfizer to gain rights to Myovant | FierceBiotech
ISRCTN - ISRCTN54275632: Triveram in patients with hypertension and concomitant primary hypercholesterolemia or mixed hyperlipidemia
ISRCTN - ISRCTN13911492: Evaluating the benefits of stocking and heparin in DVT prevention
ISRCTN - ISRCTN51957280: ICARE-PREVENT: Effectiveness of an Internet based intervention for eating disorders and obesity for adolescents in school setting
Oasmia buys estrogen receptor cancer drug, raises cash  | FierceBiotech
ISRCTN - ISRCTN14315618: NNTV: NSA Nutritional supplementation Trial of fruit and vegetable extracts and Vascular function
ISRCTN - ISRCTN10605140: Pharmacy based screening of high risk individuals using stepwise methods
ISRCTN - ISRCTN46069848: Community-based heart and weight management trial ('Heart and Weightless')
Intercept’s Ocaliva gains PBC thumbs in Europe, but with caveats  | FierceBi

Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Clinical Trials Register
Kofam |
Kofam | Studienregister SNCTP
ISRCTN - ISRCTN64760291: Pharmacokinetics of two different formulations of finasteride (topical and oral) in male volunteers with androgeneic alopecia
Clinical Trials Register
ISRCTN - ISRCTN88327135: Evaluation of Chatterbooks and Chatterbooks Plus  improving reading ability in Year 7 pupils with low reading ability
ISRCTN - ISRCTN16513449: The Children and 

ISRCTN - ISRCTN45060587: An open label phase I study in healthy subjects with blood group AB to investigate the safety, tolerability and efficacy of Uniplas™ LG
Clinical Trials Register
ISRCTN - ISRCTN01751283: Pulmonary Hypertension in the Intensive Care Unit (ICU) - Swiss Survey 1
ISRCTN - ISRCTN80844630: Omega 3 fatty acid for prevention of sickle cell crisis
ISRCTN - ISRCTN46781597: The effect of pregabalin on post-operative pain and recovery after kidney transplantation [Pregabaliinin vaikutus leikkauskipuun ja toipumiseen munuaisensiirtopotilailla]
ISRCTN - ISRCTN24271997: A comparison of the need for opiate pain relief medication after elective hip replacement surgery between patients given Local Anaesthetic Infiltration to the new joint and then an infusion of either local anaesthetic or placebo around the new joint
ISRCTN - ISRCTN96352204: Music in Mind
ISRCTN - ISRCTN51295799: An investigation of the efficacy of a single dose of insulin in the prevention of excessive cutaneou

SSLError: HTTPSConnectionPool(host='media.wholefoodsmarket.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))

In [303]:
testdf.head()

Unnamed: 0_level_0,Domain,Url,Title
Webpage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31,isrctn.com,http://www.isrctn.com/ISRCTN57801413,ISRCTN - ISRCTN57801413: Development and valid...
32,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...,Clinical Trials Register
33,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...,Clinical Trials Register
34,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...,Clinical Trials Register
35,www.clinicaltrialsregister.eu,https://www.clinicaltrialsregister.eu/ctr-sear...,Clinical Trials Register


In [304]:
clf.fit(vectors_title, y_train)

vectors_title_testdf = vectorizer_title.transform(testdf.Title)
y_pred_testdf = clf.predict(vectors_title_testdf)
data = {
    'Webpage_id': testdf.index.values,
    'Tag': y_pred_testdf
}
testdf_pred = pd.DataFrame(data)
testdf_pred.set_index('Webpage_id', inplace=True)
testdf_pred.to_csv('bigdata/test_pred_2.csv')