In [1]:
# Project 1
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# This frame uses get to obtain the HTML file and printed it so we can see what we were working with
res = requests.get("https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares.html")
soup = BeautifulSoup(res.content, "html.parser")
# print (soup.prettify)

In [3]:
# This frame finds the table and loads it into pandas for formatting

# Find the table 
table = soup.find('table')

# Load it into pandas making sure all formatted as string
data = pd.read_html(str(table))
data = data[0]
data.rename(columns={'Unnamed: 0': 'Rank', 'Unnamed: 1': 'X_class', 'Unnamed: 2': 'Date', 'Region': 'Region', 
                       'Start': 'Start', 'Maximum': 'Max', 'End':'End'}, inplace=True)
# Printed status code bc Kernel was being dumb with requests. Sometimes throws 403 and have to wait it out
print(res.status_code)
data
# End of part1

403


Unnamed: 0,Reference ID,IP Address,Date and Time
0,fa47ac0d1c9b7fcfa402060100d2f18b,24.0.251.35,02/19/2023 05:27 PM UTC


In [4]:
# Drop the extra movie column
data = data.drop(columns = 'Unnamed: 7')

# To format into datetime, combine the date along with each time column 
data['Start_DT'] = pd.to_datetime(data['Date'] + ' ' + data['Start'])
data['Max_DT'] = pd.to_datetime(data['Date'] + ' ' + data['Max'])
data['End_DT'] = pd.to_datetime(data['Date'] + ' ' + data['End'])

KeyError: "['Unnamed: 7'] not found in axis"

In [None]:
# drop the uneccessary non-datetime columns
data = data.drop(columns=['Date', 'Start', 'Max', 'End'])

# Reorganized columns to match project description output 
data = data[['Rank', 'X_class', 'Start_DT', 'Max_DT', 'End_DT', 'Region']]
# Replace all missing data with NaN
data = data.replace('-', 'NaN')
data
# End of part 2

In [None]:
# Start of part 3. Use the same process at part1 to start
res = requests.get("https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html")
soup = BeautifulSoup(res.content, "html.parser")
# print (soup.prettify)

In [None]:
# This link does NOT have a marked table. Must extract as text instead
nasa_tab = soup.find('pre')
txt = nasa_tab.get_text()
# txt

In [None]:
# Since each row of data appears on a new line, split this string by line
rows = txt.split('\n')

# Now remove the unnecessary beginning and ending lines
for i in range(12):
    rows.pop(0);
for i in range(2):
    rows.pop(len(rows) - 1)

# Create a table with the proper columns 
nasa_data = pd.DataFrame(columns=['Start_Date', 'Start_Time', 'End_Date', 'End_Time', 'Start_Freq', 
'End_Freq', 'Flare_Loc', 'Flare_Region','Flare_Class', 'CME_Date', 
'CME_Time', 'CME_Angle', 'CME_Width', 'CME_Speed', 'Plot'], index = range(0,len(rows)))

# Now load each line into the newly created dataframe
row_index = 0
for temp in rows:
    row = temp.split(' ')
    while '' in row:
        row.remove('')
    col_index = 0
    while col_index < 15:
        nasa_data.iat[row_index, col_index] = row[col_index]
        col_index += 1
    row_index += 1
nasa_data = nasa_data.drop(columns = 'Plot')
nasa_data
# End of part 3

In [None]:
# Start by recoding the dashes as NaN
nasa_data = nasa_data.replace('----', 'NaN')
nasa_data = nasa_data.replace('-----', 'NaN')
nasa_data = nasa_data.replace('????', 'NaN')

# Create new halo column and fill it with booleans checked against CME angles
nasa_data['Halo'] = nasa_data['CME_Angle'].map(lambda x: x == 'Halo')

# Replace Halos with NA in CME angle column
nasa_data = nasa_data.replace('Halo', "NA")

# Removing the > from the width columns and instead adding a col that determines lower bound
nasa_data['Lower_Bound'] = nasa_data['CME_Width'].map(lambda x: str(x).__contains__('>'))
nasa_data['CME_Width'] = nasa_data['CME_Width'].map(lambda x: x if str(x)[0] != '>' else str(x)[1:])

# Adjust rows to allow datetime formatting
nasa_data['Start_DT'] = pd.to_datetime(nasa_data['Start_Date'] + ' ' + nasa_data['Start_Time'])
# Prepend a year to the end dates 
for i in range(len(nasa_data.index)):
    temp = nasa_data.values[i][0].split('/')
    year = str(temp[0]) + '/'
    nasa_data.iat[i, 2] = year + nasa_data.iat[i, 2]
nasa_data

# Set the CME and End columns to datetime as well
nasa_data['End_Time'] = nasa_data['End_Time'].replace('24:00', '0:00')
nasa_data['End_DT'] = pd.to_datetime(nasa_data['End_Date'] + ' ' + nasa_data['End_Time'])
nasa_data
# for i in range(len(nasa_data.index)):
    # if nasa_data.iat[i,9] != 'NaN' and nasa_data.iat[i,10] != 'NaN':
       #nasa_data.iat[i, 19] = pd.to_datetime(nasa_data.iat[i,9] + ' ' + nasa_data.iat[i,10])

In [None]:
# Isolate rows with max intensity flares (only 50. Also technically already sorted at this point)
data = data.loc[data['X_class'].str.contains('X')]

# Remove X from classifications and sort as float
data['X_class'] = data['X_class'].str.lstrip('X')
data['X_class'] = data['X_class'].str.rstrip('+')
data['X_class'] = data.X_class.astype(float)
data = data.head(50).sort_values('X_class', ascending= False)
data