In [1]:
import re
import time
from pathlib import Path

import pandas as pd
import requests as req
from bs4 import BeautifulSoup

In [None]:
# Fetch the index page that links to every star page
url_origin = 'https://www.glyphweb.com/esky/stars/'
response = req.get(url=url_origin, timeout=30)  # Bounded wait instead of hanging forever
response.raise_for_status()  # Fail fast rather than parsing an HTTP error page

# Parse the HTML content
soup = BeautifulSoup(response.text, features='lxml')
star_items = soup.find_all(attrs={'class': 'index-item'})  # All tags with class='index-item'

In [9]:
def star_dataframe(star_items, delay=3, base_url=None):
    """Scrape the names and visibility months of every star in the index.

    For each index entry that contains a link, the linked star page is
    downloaded and its 'factboxValue' entries are read. Every name found on
    the page becomes one row (aliases share the same url and months). Pages
    without the structured factbox produce a single row with missing months.

    Parameters
    ----------
    star_items : sequence
        Index entries; each must provide ``get_text(strip=True)`` and
        ``find('a')`` (e.g. BeautifulSoup tags with class 'index-item').
    delay : float, default 3
        Seconds to wait after each request. Also used for the progress
        estimate.
    base_url : str, optional
        Prefix prepended to each link's href. Defaults to the module-level
        ``url_origin``.

    Returns
    -------
    pandas.DataFrame
        Columns 'name', 'url', 'visible_ini', 'visible_end', with no repeated
        names. The month columns hold ``pd.NA`` when the page lacks the
        structured info.

    Notes
    -----
    Errors raised by the HTTP request itself are not caught and abort the run.
    """
    if base_url is None:
        base_url = url_origin

    n_stars = len(star_items)
    stars_dict = {'name': [], 'url': [], 'visible_ini': [], 'visible_end': []}
    seen_names = set()  # Mirrors stars_dict['name'] for O(1) duplicate checks

    def add_star(name, url, visible_ini, visible_end):
        seen_names.add(name)
        stars_dict['name'].append(name)
        stars_dict['url'].append(url)
        stars_dict['visible_ini'].append(visible_ini)
        stars_dict['visible_end'].append(visible_end)

    for idx, item in enumerate(star_items, 1):
        # Progress
        print(f'Processing star {idx} of {n_stars}... (Estimated time {round((n_stars-idx)*delay/60, 0)} minutes)  ', end='\r')

        # Skip if name is already stored (e.g. it was an alias on an earlier page)
        check_name = item.get_text(strip=True)
        if check_name in seen_names:
            continue

        # Entries without a link have no page to scrape
        link = item.find('a')
        if not link:
            continue

        # Star url for more info (multiple names, visibility...)
        url_star = base_url + link['href']

        # Request info; the timeout keeps one stalled page from blocking the whole run
        res = req.get(url=url_star, timeout=30)
        time.sleep(delay)  # Wait a bit after request

        # Parse the HTML content
        s = BeautifulSoup(res.text, features='lxml')
        star_info = s.find_all(attrs={'class': 'factboxValue'})

        try:  # Structured info (e.g. https://www.glyphweb.com/esky/stars/acamar.html)
            names, visible_ini, visible_end = _parse_star_facts(star_info)
        except IndexError:  # Missing structured info (e.g. https://www.glyphweb.com/esky/stars/acubens.html)
            # Only the "too few factbox entries" case is treated as missing info;
            # any other error is unexpected and is allowed to surface.
            add_star(check_name, url_star, pd.NA, pd.NA)
            continue

        # Store info without duplicates
        for name in names:
            if name not in seen_names:
                add_star(name, url_star, visible_ini, visible_end)

    return pd.DataFrame(stars_dict)


def _parse_star_facts(star_info):
    """Extract (names, visible_ini, visible_end) from a page's factbox values.

    Raises IndexError when the page has fewer factbox entries than expected.
    """
    # Positions of the name and visibility entries on structured star pages
    names = star_info[0].get_text(strip=True)
    visible = star_info[11].get_text(strip=True)

    # Remove each parenthesised note separately. A greedy '.+' would also
    # delete any names sitting between two notes ("A (x), B (y)" -> "A").
    paren_note = r'\s+\([^)]*\)'
    names = re.sub(pattern=paren_note, repl='', string=names)
    visible = re.sub(pattern=paren_note, repl='', string=visible)

    # Visibility may span two months separated by ' / '
    months = visible.split(' / ')
    visible_ini = months[0]
    visible_end = months[1] if len(months) > 1 else visible_ini

    # Multiple names are separated by ', '
    return names.split(', '), visible_ini, visible_end

# Long-running: one HTTP request (plus a pause) per index entry
stars_df = star_dataframe(star_items=star_items)


Processing star 1313 of 1313... (Estimated time 0.0 minutes)   

In [None]:
# Preview the first rows of the scraped table
display(stars_df.head(5))

# Persist the result. to_csv does not create missing folders, so make sure
# 'data/' exists first; otherwise the save fails after the long scrape.
csv_path = Path('data/stars_df.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)
stars_df.to_csv(csv_path)  # The row index is written as the first (unnamed) column

Unnamed: 0,name,url,visible_ini,visible_end
0,Acamar,https://www.glyphweb.com/esky/stars/acamar.html,November,November
1,Aldulfin,https://www.glyphweb.com/esky/stars/aldulfin.html,August,August
2,Deneb Dulfim,https://www.glyphweb.com/esky/stars/aldulfin.html,August,August
3,Alrescha,https://www.glyphweb.com/esky/stars/alrisha.html,November,November
4,Alrisha,https://www.glyphweb.com/esky/stars/alrisha.html,November,November
