In [34]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo

In [35]:
# Create a PyMongo client for the MongoDB server at localhost:27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [24]:
# Pick the 'nhl_db' database and its 'articles' collection
# (subscript access is equivalent to attribute access in PyMongo)
db = client['nhl_db']
collection = db['articles']

In [25]:
# Address of the NHL news page that the notebook scrapes
url = "https://www.nhl.com/news"

In [26]:
# Retrieve page with the requests module.
# - timeout (seconds): without one, requests waits forever if the server
#   never answers, which would hang this cell indefinitely.
# - raise_for_status(): stop here on an HTTP 4xx/5xx response instead of
#   letting an error page be parsed below as if it were the news listing.
response = requests.get(url, timeout=10)
response.raise_for_status()

In [27]:
# Build the BeautifulSoup tree from the response body using 'html.parser'
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [28]:
# Collect every <div> that carries the 'article-item__top' class
results = soup.find_all('div', attrs={'class': 'article-item__top'})

In [29]:
# Number of matching article containers found on the page
len(results)

30

In [30]:
# Inspect the first match to see which tags and classes hold the article fields
results[0]

<div class="article-item__top">
<span class="article-item__primary-tag" data-type="program" data-value="31in31">31 in 31</span>
<span class="article-item__game-data"></span>
<h1 class="article-item__headline">Inside look at Detroit Red Wings</h1>
<h2 class="article-item__subheader">Hope roster shuffle leads to improvement while top prospects develop</h2>
<span class="article-item__contributor">
                                        by
                                            Nicholas J. Cotsonika
                                            <a href="https://www.twitter.com/@cotsonika" target="_blank">@cotsonika</a>
                                            
                                        / NHL.com Columnist
                                </span>
<div class="article-item__meta">
<i class="nhl-icon nhl-icon--clock-outline"></i>
<span class="article-item__date" data-date="2020-11-26T00:00:00-0500"></span>
<span class="article-item__share"></span>
<div class="social-share__

In [31]:
# Retrieve the parent divs for all articles
results = soup.find_all('div', class_='article-item__top')

# Loop over results to get article data
for result in results:
    # Look the tags up first: find() returns None when a tag is absent, so
    # chaining .text (or ['data-date']) directly onto it would raise and
    # abort the whole scrape because of a single incomplete article.
    header_tag = result.find('h1', class_='article-item__headline')
    subheader_tag = result.find('h2', class_='article-item__subheader')
    date_tag = result.find('span', class_='article-item__date')

    # The timestamp is stored in the 'data-date' attribute, not in the tag
    # text; .get() yields None instead of raising if the attribute is missing.
    # (Named 'published' rather than 'datetime' to avoid shadowing the
    # standard-library module name.)
    published = date_tag.get('data-date') if date_tag is not None else None

    # A headline and a date are required; skip containers that lack either
    if header_tag is None or not published:
        continue

    # Scrape the article header
    header = header_tag.text

    # Scrape the article subheader (treated as optional: empty when absent)
    subheader = subheader_tag.text if subheader_tag is not None else ''

    # Keep only the date part of the timestamp,
    # e.g. '2020-11-26T00:00:00-0500' -> '2020-11-26'
    date = published.split('T')[0]

    # Print article data
    print('-----------------')
    print(header)
    print(subheader)
    print(date)

    # Dictionary to be stored in MongoDB
    post = {
        'header': header,
        'subheader': subheader,
        'date': date
    }

    # Upsert instead of insert: re-running this cell updates the existing
    # document for the same headline and date rather than adding a duplicate.
    collection.update_one(
        {'header': header, 'date': date},
        {'$set': post},
        upsert=True
    )


-----------------
Inside look at Detroit Red Wings
Hope roster shuffle leads to improvement while top prospects develop
2020-11-26
-----------------
Canada World Junior selection camp paused by positive coronavirus tests
Two players with COVID-19; team, coaches enter 14-day quarantine
2020-11-25
-----------------
Boychuk of Islanders ending playing career because of eye injury
Defenseman needed 90 stitches to close cut to eyelid in March
2020-11-25
-----------------
Sergachev signs three-year, $14.4 million contract with Lightning
Restricted free agent defenseman scored 34 points last season, helped Tampa Bay win Cup
2020-11-25
-----------------
Top prospects for Detroit Red Wings
Defenseman Seider, forward Zadina could contribute this season
2020-11-26
-----------------
Reverse Retro alternate jerseys for all 31 teams unveiled by NHL, adidas
Will be worn multiple times this season; available for purchase starting Dec. 1
2020-11-16
-----------------
Mailbag: Kraken's chances for expans

In [33]:
# Print every document currently stored in the 'articles' collection
# (an empty filter matches all documents)
articles = db['articles'].find({})
for article in articles:
    print(article)

{'_id': ObjectId('5fb94dd7db0e4d869b314313'), 'header': 'Inside look at Carolina Hurricanes', 'subheader': 'Relying on maturing core, recent playoff exprience to make deep run this season', 'date': '2020-11-21'}
{'_id': ObjectId('5fb94dd7db0e4d869b314314'), 'header': 'Top goalie in NHL in 3 seasons debated', 'subheader': 'Shesterkin, Vasilevskiy, Hellebuyck, Hart among favorites of NHL.com writers', 'date': '2020-11-21'}
{'_id': ObjectId('5fb94dd7db0e4d869b314315'), 'header': 'Three questions facing Carolina Hurricanes', 'subheader': "Mrazek, Reimer potential in goal, Hamilton's expiring contract among concerns", 'date': '2020-11-21'}
{'_id': ObjectId('5fb94dd7db0e4d869b314316'), 'header': 'Top prospects for Carolina Hurricanes', 'subheader': 'Bean could make opening roster; Bokk to play in North America', 'date': '2020-11-21'}
{'_id': ObjectId('5fb94dd7db0e4d869b314317'), 'header': 'Thornton will spark Maple Leafs, GM of Switzerland team says: report', 'subheader': '41-year-old center