# Lunar Phase Web-scrape
This script scrapes the following source for a variety of daily lunar phase data and stores in them in a Pandas DataFrame.

Creation Date: 2/12/2020

Author: Drew McKinney

Scrape Source: https://lunaf.com/lunar-calendar/

Scraped Objects:
1. Date
2. Lunar Phase
3. Illumination Percent
3. Lunar Sign

In [28]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from splinter import Browser


### DEV TOOLS ###
start_date_less1 = '2018/12/31'
weeks =52


# URL of page to be scraped
url = f'https://lunaf.com/lunar-calendar/{start_date_less1}/#next-7-days-moon-phases'

# creating splinter browser and visint url
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)
print('Visiting URL')
browser.visit(url)

# creating empty dataframe
lunar_df = pd.DataFrame(columns = {
    'time',
    'phase',
    'illumination_percent',
    'sign'
})

print('''
--Starting Scrape--
''')

# creating scrape performance metrics
success = 0
fail = 0


# looping through pages for indicated amount of weeks
for week in np.arange(0, weeks):
    
    try: 
        # activating splinter browser
        html = browser.html

        # Create BeautifulSoup object; parse with 'html.parser'
        soup = BeautifulSoup(html, 'html.parser')

        # html parsing articles on page
        articles = soup.find_all('article')

        # looping through article list
        for article in articles:

            # locating links in articles
            links = article.find_all('a')

            # looping through list of links
            for index, item in enumerate(links):

                # extracting description in link tag
                desc = item['title']

                # parsing out all words in description (appears to use a template to create desc)
                desc_items = desc.split(' ')

                # creating phase item
                phase = desc_items[0] + ' ' + desc_items[1]

                # creating illumination item
                ill = desc_items[3][:-1]

                # creating sign item
                sign = desc_items[8]

                # parsing datetime item from time tag
                time = item.find_all('time')[index]['datetime']

                lunar_df = lunar_df.append({'time':time, 'phase':phase, 'illumination_percent':ill, 'sign':sign}, ignore_index=True)

        # outputting query status to terminal
        print(f'Completed Week:{week + 1} of {weeks}')
        success += 1

    except:
        print(f'Failed Week: {week + 1} of {weeks}')
        fail += 1
        
    # navigating to next page of next week
    browser.click_link_by_partial_text('7 days after')


print(f'''
--Scrape Completed--
Succeses: {success}
Fails: {fail}
''')

Visiting URL

--Starting Scrape--

Completed Week:1 of 52
Completed Week:2 of 52
Completed Week:3 of 52
Completed Week:4 of 52
Completed Week:5 of 52
Completed Week:6 of 52
Completed Week:7 of 52
Completed Week:8 of 52
Completed Week:9 of 52
Completed Week:10 of 52
Completed Week:11 of 52
Completed Week:12 of 52
Completed Week:13 of 52
Completed Week:14 of 52
Completed Week:15 of 52
Completed Week:16 of 52
Completed Week:17 of 52
Completed Week:18 of 52
Completed Week:19 of 52
Completed Week:20 of 52
Completed Week:21 of 52
Completed Week:22 of 52
Completed Week:23 of 52
Completed Week:24 of 52
Completed Week:25 of 52
Completed Week:26 of 52
Completed Week:27 of 52
Completed Week:28 of 52
Completed Week:29 of 52
Completed Week:30 of 52
Completed Week:31 of 52
Completed Week:32 of 52
Completed Week:33 of 52
Completed Week:34 of 52
Completed Week:35 of 52
Completed Week:36 of 52
Completed Week:37 of 52
Completed Week:38 of 52
Completed Week:39 of 52
Completed Week:40 of 52
Completed Week

In [30]:
lunar_df.sort_values('time')

Unnamed: 0,sign,illumination_percent,phase,time
0,Scorpio,19,Waning Crescent,2019-01-01
1,Sagittarius,12,Waning Crescent,2019-01-02
2,Sagittarius,6,Waning Crescent,2019-01-03
3,Sagittarius,2,Waning Crescent,2019-01-04
4,Capricorn,0,New Moon,2019-01-05
...,...,...,...,...
359,Capricorn,0,New Moon,2019-12-26T05:13:00+00:00
360,Capricorn,2,Waxing Crescent,2019-12-27
361,Aquarius,6,Waxing Crescent,2019-12-28
362,Aquarius,11,Waxing Crescent,2019-12-29


In [38]:
lunar_df.describe()

Unnamed: 0,sign,illumination_percent,phase,time
count,364,364,364,364
unique,12,98,8,364
top,Aquarius,100,Waxing Crescent,2019-07-08
freq,36,17,77,1


In [39]:
lunar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 4 columns):
sign                    364 non-null object
illumination_percent    364 non-null object
phase                   364 non-null object
time                    364 non-null object
dtypes: object(4)
memory usage: 11.5+ KB
