In [17]:
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests as req
from bs4 import BeautifulSoup

In [2]:
# Fetch the index page of the star catalogue
url_origin = 'https://www.glyphweb.com/esky/stars/'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = req.get(url=url_origin, timeout=30)
# Fail loudly on an HTTP error: parsing an error page would produce an empty
# `star_items` list and the problem would only surface much later.
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, features='lxml')
# All elements whose class is 'index-item'
star_items = soup.find_all(attrs={'class': 'index-item'})

In [68]:
def get_optimum_visibility_months(soup):
    """Return the initial and final month of a star's optimum visibility.

    Elements with class ``factboxCaption`` are paired positionally with the
    elements with class ``factboxValue``. The value paired with the caption
    ``'Optimum Visibility'`` is cleaned (parenthesised remarks removed) and
    split on ``'/'``.

    Parameters
    ----------
    soup : parsed star page
        Any object exposing ``find_all(attrs=...)`` whose results expose
        ``get_text()``.

    Returns
    -------
    list of str or None
        The month names, stripped of whitespace. A single month is returned
        twice (same initial and end month). ``None`` is returned when the
        page has no 'Optimum Visibility' entry.
    """
    captions = soup.find_all(attrs={'class': 'factboxCaption'})
    values = soup.find_all(attrs={'class': 'factboxValue'})

    for caption, value in zip(captions, values):
        # Tolerate stray whitespace/newlines around the caption text.
        if caption.get_text().strip() != 'Optimum Visibility':
            continue

        # Drop each parenthesised remark on its own. A greedy `\(.+\)` would
        # delete everything between the first '(' and the last ')', losing
        # any month that sits between two remarks.
        months = re.sub(pattern=r'\([^)]*\)', repl='', string=value.get_text())

        # Separate months into list elements without surrounding whitespace
        months = [month.strip() for month in months.split('/')]

        # If only one month, return it duplicated (same initial and end month)
        if len(months) == 1:
            months = months * 2

        return months

    # No 'Optimum Visibility' entry on this page
    return None
        
def get_star_names(soup):
    """Return every name listed for a star on its page.

    The names are read from the first element with class ``factboxValue``:
    parenthesised remarks are removed and the remaining text is split on
    commas.

    Parameters
    ----------
    soup : parsed star page
        Any object exposing ``find_all(attrs=...)`` whose results expose
        ``get_text(strip=True)``.

    Returns
    -------
    list of str
        The names, stripped of surrounding whitespace.

    Raises
    ------
    IndexError
        If the page contains no element with class ``factboxValue``.
    """
    # Names are the first section's info
    raw_names = soup.find_all(attrs={'class': 'factboxValue'})[0].get_text(strip=True)

    # Drop each parenthesised remark on its own. A greedy `\(.+\)` would
    # delete everything between the first '(' and the last ')', silently
    # discarding any names listed between two remarks.
    raw_names = re.sub(pattern=r'\([^)]*\)', repl='', string=raw_names)

    # Separate names into list elements without surrounding whitespace
    return [name.strip() for name in raw_names.split(',')]

In [70]:
def star_dataframe(star_items, sleep_time=5, base_url=None):
    """Scrape each linked star page and tabulate names and visibility months.

    For every item that contains a link, the star page is downloaded and
    parsed. Every name reported by ``get_star_names`` becomes one row with
    the months returned by ``get_optimum_visibility_months``. When the page
    lacks that structured information, a single row is stored under the
    item's own text with ``'Not visible'`` in both month columns.

    Parameters
    ----------
    star_items : sequence
        Index entries; each must expose ``get_text(strip=True)`` and
        ``find('a')``.
    sleep_time : int or float, default 5
        Seconds to wait after each request.
    base_url : str, optional
        Prefix prepended to each link's ``href``. Defaults to the
        notebook-level ``url_origin``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``url``, ``visible_ini``, ``visible_end``, with no
        repeated names.
    """
    if base_url is None:
        base_url = url_origin

    n_stars = len(star_items)
    stars_dict = {'name': [], 'url': [], 'visible_ini': [], 'visible_end': []}
    # Mirrors stars_dict['name'] so duplicate checks are O(1) instead of a
    # list scan for every item and every alias.
    seen_names = set()

    def add_row(name, url, visible_ini, visible_end):
        # Append one star record and remember its name.
        seen_names.add(name)
        stars_dict['name'].append(name)
        stars_dict['url'].append(url)
        stars_dict['visible_ini'].append(visible_ini)
        stars_dict['visible_end'].append(visible_end)

    for idx, item in enumerate(star_items, 1):
        # Skip if name is already in the list (e.g. stored earlier as an alias)
        check_name = item.get_text(strip=True)
        if check_name in seen_names:
            continue

        # Track progress
        print(f'Processing star {idx} of {n_stars}...' +
               f'(Estimated time {round((n_stars-idx)*sleep_time/60, 0)} minutes)  ', end='\r')

        # Items without a link have no page to scrape
        link = item.find('a')
        if link is None:
            continue

        # Star url for more info (multiple names, visibility...)
        url_star = base_url + link['href']

        # Request info; the timeout prevents one stalled connection from
        # freezing the whole run.
        res = req.get(url=url_star, timeout=30)
        time.sleep(sleep_time)  # Wait a bit after request

        # Parse the HTML content
        s = BeautifulSoup(res.text, features='lxml')

        try:  # Structured info (e.g. https://www.glyphweb.com/esky/stars/acamar.html)
            names = get_star_names(s)  # Get all names for this star
            visible_ini, visible_end = get_optimum_visibility_months(s)
        except (IndexError, TypeError, ValueError):
            # Missing structured info (e.g. https://www.glyphweb.com/esky/stars/acubens.html):
            #   IndexError - the page has no factbox values,
            #   TypeError  - no 'Optimum Visibility' entry (None cannot be unpacked),
            #   ValueError - the entry did not yield exactly two months.
            # Only these are caught so Ctrl-C and genuine bugs still surface.
            add_row(check_name, url_star, 'Not visible', 'Not visible')
            continue

        # Store info into dictionary without duplicates
        for name in names:
            if name not in seen_names:
                add_row(name, url_star, visible_ini, visible_end)

    return pd.DataFrame(stars_dict)

# Scrape every star page (slow: one request plus a pause per star)
stars_df = star_dataframe(star_items)
display(stars_df.head(5))

# Save stars info in a csv file.
# The target folder is created first: to_csv does not create missing
# directories, and failing here would come only after the long scrape.
output_path = Path('data') / 'stars_df.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
stars_df.to_csv(output_path)

Processing star 1306 of 1313...(Estimated time 1.0 minutes)   

Unnamed: 0,name,url,visible_ini,visible_end
0,Acamar,https://www.glyphweb.com/esky/stars/acamar.html,November,November
1,Aldulfin,https://www.glyphweb.com/esky/stars/aldulfin.html,August,August
2,Deneb Dulfim,https://www.glyphweb.com/esky/stars/aldulfin.html,August,August
3,Alrescha,https://www.glyphweb.com/esky/stars/alrisha.html,November,November
4,Alrisha,https://www.glyphweb.com/esky/stars/alrisha.html,November,November
