Importing some basic libraries

In [1]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

Function to scrape data from Nature Communications

In [2]:
def scrapeNatureComm(subject,num_articles):
    """Collect article summaries from a Nature Communications subject listing.

    Parameters
    ----------
    subject : str
        Subject slug used in the listing URL, e.g. 'physical-sciences',
        'biological-sciences', 'earth-and-environmental-sciences' or
        'health-sciences'.
    num_articles : int
        Maximum number of summaries to return.

    Returns
    -------
    list of str
        The text of each summary <div>, in the order encountered, truncated to
        num_articles entries. Fewer entries are returned when the listing runs
        out (a page answers with an HTTP error or contains no summaries).
    """
    # exact class list that marks the <div> holding an article's text description
    summary_classes = ['mt4', 'serif', 'text13', 'tighten-line-height', 'text-gray',
                       'suppress-bottom-margin', 'hide-overflow', 'inline']

    def page_summaries(url):
        # download one listing page and return the summary texts found on it
        response = requests.get(url)
        if not response.ok:
            return []
        soup = BeautifulSoup(response.text, 'html.parser')
        # .get() instead of ['class']: many <div> tags carry no class attribute at all
        return [div.text for div in soup.find_all('div') if div.get('class') == summary_classes]

    base_url = 'https://www.nature.com/subjects/' + subject + '/ncomms'

    # the first page is served from the bare listing URL
    summary = page_summaries(base_url)

    # keep requesting further pages until enough summaries have been gathered
    page_number = 2
    while len(summary) < num_articles:
        url = base_url + '?searchType=journalSearch&sort=PubDate&page=' + str(page_number)
        new_summaries = page_summaries(url)

        # an empty page means there is nothing left to scrape; stop instead of looping forever
        if not new_summaries:
            break

        summary.extend(new_summaries)
        page_number += 1

    return summary[:num_articles]

Scrape physical science data

In [3]:
# 1000 most recent article summaries filed under 'physical-sciences' (performs live HTTP requests)
physical = scrapeNatureComm('physical-sciences',1000)

Scrape biological science data

In [4]:
# 1000 most recent article summaries filed under 'biological-sciences' (performs live HTTP requests)
# note: the de-duplication cells further down delete entries from this list in place
bio = scrapeNatureComm('biological-sciences',1000)

Scrape earth and environmental science data

In [5]:
# 1000 most recent article summaries filed under 'earth-and-environmental-sciences' (performs live HTTP requests)
# note: the de-duplication cells further down delete entries from this list in place
enviro = scrapeNatureComm('earth-and-environmental-sciences',1000)

Scrape health science data

In [6]:
# 1000 most recent article summaries filed under 'health-sciences' (performs live HTTP requests)
# note: the de-duplication cells further down delete entries from this list in place
health = scrapeNatureComm('health-sciences',1000)

Many articles have multiple classifications, so we need to remove the duplicates and record every classification each article belongs to.

In [7]:
# one label list per field, with one entry per physical article
n_physical = len(physical)

# every physical article is physical by definition
target_physical = [1] * n_physical

# the other labels start at 0 and are switched on when a duplicate is found
target_bio = [0] * n_physical
target_enviro = [0] * n_physical
target_health = [0] * n_physical

# for each physical article, look for the same summary in the other three lists;
# when found, flag that field for the article and drop the first matching copy
for idx, article in enumerate(physical):
    for others, flags in ((bio, target_bio), (enviro, target_enviro), (health, target_health)):
        if article in others:
            flags[idx] = 1
            others.remove(article)

In [8]:
# repeat it for another classification

# rows for the remaining bio articles are appended after the physical rows,
# so bio[i] sits at row len(physical) + i of every target list
# (previously hard-coded as 1000, which breaks if fewer physical articles were scraped)
bio_offset = len(physical)

target_physical += [0]*len(bio)
target_bio += [1]*len(bio)
target_enviro += [0]*len(bio)
target_health += [0]*len(bio)

for i in range(len(bio)):
    
    # flag bio articles that are also environmental and drop the duplicate copy
    for j in range(len(enviro)):
        if bio[i] == enviro[j]:
            target_enviro[bio_offset + i] = 1
            del enviro[j]
            break
            
    # flag bio articles that are also health and drop the duplicate copy
    for j in range(len(health)):
        if bio[i] == health[j]:
            target_health[bio_offset + i] = 1
            del health[j]
            break

In [9]:
# and finally

# rows for the remaining enviro articles follow the physical and bio rows,
# so enviro[i] sits at row len(physical) + len(bio) + i of every target list
# (the physical count was previously hard-coded as 1000)
enviro_offset = len(physical) + len(bio)

target_physical += [0]*len(enviro)
target_bio += [0]*len(enviro)
target_enviro += [1]*len(enviro)
target_health += [0]*len(enviro)

for i in range(len(enviro)):
            
    # flag enviro articles that are also health and drop the duplicate copy
    for j in range(len(health)):
        if enviro[i] == health[j]:
            target_health[enviro_offset + i] = 1
            del health[j]
            break
            
# whatever is left in health appears in no other list
target_physical += [0]*len(health)
target_bio += [0]*len(health)
target_enviro += [0]*len(health)
target_health += [1]*len(health)

Now that we've collected the data, we can process it.

In [10]:
# import packages

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# all remaining summaries, concatenated in the same order the target lists were built
combined = physical + bio + enviro + health

# vectorize the text data
vec = TfidfVectorizer()
X = vec.fit_transform(combined)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# convert to a dataframe (one row per summary, one column per vocabulary term)
total_data = pd.DataFrame(X.toarray(),columns = feature_names)

Now we'll fit a Gaussian naive Bayes classifier to the data. We'll consider each classification separately and look at how each model performs under cross-validation.

In [11]:
# importing a few relevant sklearn packages

# cross_val_score lives in sklearn.model_selection; the old sklearn.cross_validation
# module was removed in scikit-learn 0.20, so it is only used as a fallback
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import GaussianNB

# DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same dense array
features = total_data.values

# evaluating cross validation: one independent binary classifier per field, 5 folds each
print('Cross validation performance:')
for label, target in (('Physical sciences', target_physical),
                      ('Biological sciences', target_bio),
                      ('Earth and Enviromental sciences', target_enviro),
                      ('Health sciences', target_health)):
    print(label + ' model: ' + str(cross_val_score(GaussianNB(),features,target,cv = 5)))



Cross validation performance:
Physical sciences model: [ 1.          0.97641509  1.          0.99055118  1.        ]
Biological sciences model: [ 1.  1.  1.  1.  1.]
Earth and Enviromental sciences model: [ 0.96698113  1.          1.          1.          1.        ]
Health sciences model: [ 0.93710692  0.96855346  1.          0.98110236  0.9984252 ]


Now let's actually fit the models

In [12]:
# DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same dense array
features = total_data.values

# one binary Gaussian naive Bayes classifier per field, each trained on the full data set
phys_model = GaussianNB()
phys_model.fit(features,target_physical)

bio_model = GaussianNB()
bio_model.fit(features,target_bio)

enviro_model = GaussianNB()
enviro_model.fit(features,target_enviro)

health_model = GaussianNB()
health_model.fit(features,target_health)

# combining them into a list (order: physical, biological, earth/environmental, health)
models = [phys_model,bio_model,enviro_model,health_model]

Finally, I wrote a function that will evaluate a string of text and see which scientific fields it might relate to.

In [13]:
def PredictScientificField(text,models,vec):
    """Report which scientific fields a piece of text is predicted to relate to.

    Parameters
    ----------
    text : str
        The text to classify.
    models : sequence
        Four fitted binary classifiers, ordered physical, biological,
        earth/environmental, health. Each must provide predict() returning one
        label per input row, where 1 means "belongs to this field".
    vec : object
        The fitted vectorizer used to build the training data; its
        transform([text]).toarray() result is passed to every model.

    Returns
    -------
    str
        'Predicted Scientific Fields: ' followed by the matching field names
        separated by ', ', or by 'None' when no model predicts 1. The same
        string is also printed.
    """
    # display name for each model, in the same order as `models`
    field_names = ['Physical', 'Biological', 'Earth/Environmental', 'Health']

    # vectorize input string
    vector_text = vec.transform([text]).toarray()

    # ask every model about the single input row and keep the fields that say yes
    predicted = []
    for position, name in enumerate(field_names):
        if models[position].predict(vector_text)[0] == 1:
            predicted.append(name)

    # if no fields have been identified, it says none
    output = 'Predicted Scientific Fields: ' + (', '.join(predicted) if predicted else 'None')

    # print and return the output
    print(output)
    return output

Here's an example of the function at work

In [14]:
# demo on a phrase mixing rock, climate and cell terms; the function prints its result,
# and the trailing ';' keeps the notebook from also echoing the returned string
PredictScientificField('Rocks, climate change, cells ',models,vec);

Predicted Scientific Fields: Biological, Eath/Environmental
