In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Define database and collection
db = client.nhl_db
collection = db.articles

In [4]:
# URL of page to be scraped
url = 'https://www.nhl.com/'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [5]:
# Retrieve the parent divs for all articles
results = soup.find_all('li', class_='mixed-feed__item--article')

# Loop through results to retrieve article title, header, and timestamp of article
for result in results:
    title = result.find('h4', class_='mixed-feed__header').text

    lede = result.find('h5', class_='mixed-feed__subheader').text

    # The time and date of article publication
    date = result.find('time')['datetime']
    # Slice the datetime string for the date
    article_date = date[:10]
    # Slice the datetime string for the time
    time = date[11:16]
    # Determine whether article was published in AM or PM
    if (int(time[:2]) >= 13):
        meridiem = 'pm'
    else:
        meridiem = 'am'

    # Concatenate time string
    time = time + meridiem
    print('-----------------')
    print(title)
    print(lede)
    print(article_date)
    print(time)

    # Dictionary to be inserted into MongoDB
    post = {
        'title': title,
        'lede': lede,
        'date': article_date,
        'time published': time
    }

    # Insert dictionary into MongoDB as a document
    collection.insert_one(post)

-----------------
New Hurricanes owner commits to Raleigh market
Dundon eager to get started, wants to enhance fan experience
2018-01-12
19:37pm
-----------------
Cogliano signs three-year contract with Ducks
30-year-old forward could have become unrestricted free agent July 1
2018-01-12
15:11pm
-----------------
Penguins goalie Murray goes home because of family matter
DeSmith recalled from minors; Pittsburgh returns from five-day break Saturday
2018-01-12
16:05pm
-----------------
NHL.com picks lines for All-Star Game
Staff writers decide how talent should be deployed for midseason showcase
2018-01-11
12:25am
-----------------
New smell test emerges for goaltenders
Burning rubber becomes more common odor in harder-shot era
2018-01-12
00:00am
-----------------
NHL schedule: Bye weeks for 2017-18 season
Complete list of mandated five-day breaks
2018-01-12
00:00am
-----------------
Fantasy buzz: Impact of Hedman injury on Lightning
Sergachev, Stralman gain value with elite defenseman ou

In [6]:
# Display the MongoDB records created above
articles = db.articles.find()
for article in articles:
    print(article)

{'_id': ObjectId('5a5968b7b1180204cca6554e'), 'title': 'New Hurricanes owner commits to Raleigh market', 'lede': 'Dundon eager to get started, wants to enhance fan experience', 'date': '2018-01-12', 'time published': '19:37pm'}
{'_id': ObjectId('5a5968b7b1180204cca6554f'), 'title': 'Cogliano signs three-year contract with Ducks', 'lede': '30-year-old forward could have become unrestricted free agent July 1', 'date': '2018-01-12', 'time published': '15:11pm'}
{'_id': ObjectId('5a5968b7b1180204cca65550'), 'title': 'Penguins goalie Murray goes home because of family matter', 'lede': 'DeSmith recalled from minors; Pittsburgh returns from five-day break Saturday', 'date': '2018-01-12', 'time published': '16:05pm'}
{'_id': ObjectId('5a5968b7b1180204cca65551'), 'title': 'NHL.com picks lines for All-Star Game', 'lede': 'Staff writers decide how talent should be deployed for midseason showcase', 'date': '2018-01-11', 'time published': '12:25am'}
{'_id': ObjectId('5a5968b7b1180204cca65552'), 'ti