Import basic packages

In [144]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd

Define scraping functions

In [145]:
def GetAbstract(link, timeout=30):
    """Fetch one article page and return its abstract, keywords and title.

    Parameters
    ----------
    link : str
        Full URL of the article page.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without a timeout a stalled connection would hang
        the whole scrape indefinitely.

    Returns
    -------
    tuple
        ``(abstract, keywords, title)`` where

        * ``abstract`` is the inner HTML of the
          ``<p class="articleBody_abstractText">`` element(s), joined by a
          space when there are several, or ``''`` when there are none;
        * ``keywords`` is the ``content`` attribute of the
          ``<meta name="keywords">`` tag(s), or ``''`` when there are none;
        * ``title`` is the string of the page's ``<title>`` tag, or ``None``
          when the page has no usable title.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or times out.
    """
    # connect and get text from the website
    response = requests.get(link, timeout=timeout)

    # build a soup to parse the text data
    soup = BeautifulSoup(response.text, 'html.parser')

    # find abstract: read the markup inside each matching paragraph instead of
    # slicing str(result_list) at fixed offsets, which breaks as soon as there
    # are zero or several matches
    abstract_tags = soup.find_all("p", class_="articleBody_abstractText")
    abstract = ' '.join(tag.decode_contents() for tag in abstract_tags)

    # find keywords: read the attribute directly so the result does not depend
    # on the order in which the meta tag's attributes are serialized
    keyword_tags = soup.find_all('meta', attrs={'name': 'keywords'})
    keywords = ', '.join(tag.get('content', '') for tag in keyword_tags)

    # a page without a <title> tag would otherwise raise AttributeError
    title = soup.title.string if soup.title is not None else None

    return abstract, keywords, title

In [146]:
def GetArticles(issue, timeout=30):
    """Return the links of all research articles listed on an issue page.

    Only the part of the table of contents between the "Articles" heading and
    the "Additions and Corrections" heading is searched, so perspectives and
    reviews listed before the "Articles" heading are skipped.

    Parameters
    ----------
    issue : str
        Full URL of the issue's table-of-contents page.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30).

    Returns
    -------
    list of str
        The ``href`` value of each article title link, exactly as it appears
        in the page. Empty when the page has no "Articles" section.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or times out.
    """
    # connect and get text from the website
    response = requests.get(issue, timeout=timeout)
    page = response.text

    # locate the subset of the page that contains articles; str.find returns
    # -1 when a marker is missing, which must not be used as a slice bound
    start = page.find('<div class="subject">Articles</div> </h3>')
    if start == -1:
        return []
    end = page.find('<div class="subject">Additions and Corrections</div>', start)
    if end == -1:
        end = len(page)  # no corrections section: articles run to the end of the page

    soup = BeautifulSoup(page[start:end], 'html.parser')

    # extract the link from every article title block
    outputarticles = []
    for title_div in soup.find_all('div', class_='art_title linkable'):
        # read the href attribute from the parsed tag rather than slicing the
        # serialized HTML, which assumed href was the last attribute
        linked = title_div.find(href=True)
        if linked is not None:
            outputarticles.append(linked['href'])

    return outputarticles

In [147]:
def ScrapeIssue(volume,issue):
    """Scrape every article of one ACS Nano issue into a DataFrame.

    Parameters
    ----------
    volume : int or str
        Volume number, inserted into the table-of-contents URL.
    issue : int or str
        Issue number within the volume, inserted into the URL.

    Returns
    -------
    pandas.DataFrame
        One row per article with columns ``'Title'``, ``'Abstract'`` and
        ``'Keywords'``, indexed by title. Empty (but with those columns) when
        the issue lists no articles.
    """
    # get the list of articles
    articles = GetArticles('https://pubs.acs.org/toc/ancac3/' + str(volume) + '/' + str(issue))

    # scrape each article, collecting plain records; the frame is built once at
    # the end because DataFrame.append was removed in pandas 2.0 and copies the
    # whole frame on every call
    records = []
    for article in articles:
        (ta, tk, tt) = GetAbstract('https://pubs.acs.org' + article)
        records.append({'Title': tt, 'Abstract': ta, 'Keywords': tk})

    outputdata = pd.DataFrame(records,
                              index=[record['Title'] for record in records],
                              columns=['Title', 'Abstract', 'Keywords'])

    return outputdata

Actual scraping

In [7]:
# number of issues to scrape per volume: volumes 8 through 12 have twelve
# issues each; volume 13 currently has only two.
# (the upper bound of range() is exclusive, so 13 is needed to include volume 12)
issuespervolume = {v: 12 for v in range(8, 13)}
issuespervolume[13] = 2

for v, lastissue in issuespervolume.items():
    for i in range(1, lastissue + 1):
        temp = ScrapeIssue(v, i) # scrape the issue
        temp.to_csv('ACSNano_' + str(v) + '_' + str(i) + '.csv', index = False) # save it as a csv
        time.sleep(60*10) # wait ten minutes to avoid too many requests to the ACS website

Combine into a single csv

In [34]:
# number of per-issue csv files to load for each volume: volumes 8 through 12
# have twelve issues each; volume 13 has two
issuespervolume = {v: 12 for v in range(8, 13)}
issuespervolume[13] = 2

# import all the data, one frame per issue
frames = []
for v, lastissue in issuespervolume.items():
    for i in range(1, lastissue + 1):
        # load in the data
        # NOTE(review): the files are written by to_csv with its default
        # encoding; reading them as 'unicode_escape' may garble non-ASCII
        # characters - confirm before changing
        temp = pd.read_csv('ACSNano_' + str(v) + '_' + str(i) + '.csv', encoding = 'unicode_escape')

        # set info about where the data came from
        # (v is the volume and i is the issue; volume 8 is labelled 2014)
        temp['Year'] = v + 2006
        temp['Volume'] = v
        temp['Issue'] = i

        frames.append(temp)

# combine everything in one step (DataFrame.append was removed in pandas 2.0)
b = pd.concat(frames, ignore_index = True)

# rearrange the columns
b = b[['Title', 'Abstract', 'Keywords', 'Year', 'Volume', 'Issue']]

# remove nans
b = b.dropna()

# cut the extra from the title: drops the last 30 characters
# (presumably a fixed site suffix appended to every page title - confirm)
b['Title'] = b['Title'].str[:-30]

# write to csv
b.to_csv('ACSNano_2014_2019.csv', index = False)

First attempts at interpreting the data (to be removed later)

In [66]:
# read the combined dataset back in
temp = pd.read_csv('ACSNano_2014_2019.csv', encoding = 'unicode_escape')

# split each article's keyword string on ', ' to get one list per article
keywords = temp['Keywords'].str.split(', ')
tempkeywords = keywords.tolist()

# flatten the per-article lists into one long list of keywords
listofkeywords = [word for entry in tempkeywords for word in entry]

In [150]:
from collections import Counter

# tally the occurrences of every keyword once and reuse the tally below
keywordcounts = Counter(listofkeywords)

# total keyword occurrences divided by the number of distinct keywords
print('Average number of appearances per keyword: ' + str(len(listofkeywords) / len(keywordcounts)))

# show the 1000 most frequent keywords with their counts
keywordcounts.most_common(1000)

Average number of appearances per keyword: 1.9862439188055696


[('graphene', 394),
 ('self-assembly', 358),
 ('nanoparticles', 206),
 ('scanning tunneling microscopy', 146),
 ('MoS2', 114),
 ('photoluminescence', 114),
 ('nanoparticle', 113),
 ('density functional theory', 112),
 ('carbon nanotubes', 103),
 ('drug delivery', 100),
 ('graphene oxide', 96),
 ('2D materials', 96),
 ('quantum dots', 94),
 ('gold nanoparticles', 91),
 ('chemical vapor deposition', 91),
 ('transition metal dichalcogenides', 85),
 ('carbon nanotube', 80),
 ('two-dimensional materials', 78),
 ('plasmonics', 77),
 ('nanomedicine', 75),
 ('nanocrystals', 72),
 ('atomic force microscopy', 72),
 ('photothermal therapy', 70),
 ('molecular dynamics', 67),
 ('molybdenum disulfide', 67),
 ('quantum dot', 66),
 ('triboelectric nanogenerator', 64),
 ('nanopore', 55),
 ('photodynamic therapy', 55),
 ('DNA nanotechnology', 55),
 ('doping', 54),
 ('perovskite', 51),
 ('field-effect transistor', 50),
 ('Raman spectroscopy', 49),
 ('nanowires', 47),
 ('DNA', 43),
 ('DNA origami', 42),
 

In [102]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorize the text data: one row per abstract, one TF-IDF column per term
vec = TfidfVectorizer()
X = vec.fit_transform(temp['Abstract'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases
if hasattr(vec, 'get_feature_names_out'):
    featurenames = vec.get_feature_names_out()
else:
    featurenames = vec.get_feature_names()

# convert to a dataframe (dense copy of the sparse matrix)
total_data = pd.DataFrame(X.toarray(), columns = featurenames)

In [143]:
# label each article: True when its keyword string contains 'graphene'
# (plain substring match, so e.g. 'graphene oxide' counts as well)
zz = temp['Keywords'].str.contains('graphene', regex = False)

In [104]:
from sklearn.ensemble import RandomForestClassifier

# classifier predicting the graphene label from the TF-IDF features;
# random_state is fixed so repeated runs build the same forest
graphene = RandomForestClassifier(random_state = 0)

# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array and is available in every pandas version
graphene.fit(total_data.values, zz)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)