In [211]:
# Project 1
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [212]:
# This frame uses get to obtain the HTML file and printed it so we can see what we were working with
# Headers to make sure requests actually works every time
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
res = requests.get("https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares.html", headers = header)
print(res.status_code)
soup = BeautifulSoup(res.content, "html.parser")
# print (soup.prettify)

200


In [213]:
# This frame finds the table and loads it into pandas for formatting

# Find the table 
table = soup.find('table')

# Load it into pandas making sure all formatted as string
data = pd.read_html(str(table))
data = data[0]
data.rename(columns={'Unnamed: 0': 'Rank', 'Unnamed: 1': 'X_class', 'Unnamed: 2': 'Date', 'Region': 'Region', 
                       'Start': 'Start', 'Maximum': 'Max', 'End':'End'}, inplace=True)
# Printed status code bc Kernel was being dumb with requests. Sometimes throws 403 and have to wait it out
print(res.status_code)
data
# End of step 1

200


Unnamed: 0,Rank,X_class,Date,Region,Start,Max,End,Unnamed: 7
0,1,X28+,2003/11/04,486,19:29,19:53,20:06,MovieView archive
1,2,X20+,2001/04/02,9393,21:32,21:51,22:03,MovieView archive
2,3,X17.2+,2003/10/28,486,09:51,11:10,11:24,MovieView archive
3,4,X17+,2005/09/07,808,17:17,17:40,18:03,MovieView archive
4,5,X14.4,2001/04/15,9415,13:19,13:50,13:55,MovieView archive
5,6,X10,2003/10/29,486,20:37,20:49,21:01,MovieView archive
6,7,X9.4,1997/11/06,8100,11:49,11:55,12:01,MovieView archive
7,8,X9.3,2017/09/06,2673,11:53,12:02,12:10,MovieView archive
8,9,X9,2006/12/05,930,10:18,10:35,10:45,MovieView archive
9,10,X8.3,2003/11/02,486,17:03,17:25,17:39,MovieView archive


In [214]:
# Drop the extra movie column
data = data.drop(columns = 'Unnamed: 7')

# To format into datetime, combine the date along with each time column 
data['Start_DT'] = pd.to_datetime(data['Date'] + ' ' + data['Start'])
data['Max_DT'] = pd.to_datetime(data['Date'] + ' ' + data['Max'])
data['End_DT'] = pd.to_datetime(data['Date'] + ' ' + data['End'])

In [215]:
# drop the uneccessary non-datetime columns
data = data.drop(columns=['Date', 'Start', 'Max', 'End'])

# Reorganized columns to match project description output 
data = data[['Rank', 'X_class', 'Start_DT', 'Max_DT', 'End_DT', 'Region']]
# Replace all missing data with NaN
data = data.replace('-', 'NaN')
data
# End of step 2

Unnamed: 0,Rank,X_class,Start_DT,Max_DT,End_DT,Region
0,1,X28+,2003-11-04 19:29:00,2003-11-04 19:53:00,2003-11-04 20:06:00,486
1,2,X20+,2001-04-02 21:32:00,2001-04-02 21:51:00,2001-04-02 22:03:00,9393
2,3,X17.2+,2003-10-28 09:51:00,2003-10-28 11:10:00,2003-10-28 11:24:00,486
3,4,X17+,2005-09-07 17:17:00,2005-09-07 17:40:00,2005-09-07 18:03:00,808
4,5,X14.4,2001-04-15 13:19:00,2001-04-15 13:50:00,2001-04-15 13:55:00,9415
5,6,X10,2003-10-29 20:37:00,2003-10-29 20:49:00,2003-10-29 21:01:00,486
6,7,X9.4,1997-11-06 11:49:00,1997-11-06 11:55:00,1997-11-06 12:01:00,8100
7,8,X9.3,2017-09-06 11:53:00,2017-09-06 12:02:00,2017-09-06 12:10:00,2673
8,9,X9,2006-12-05 10:18:00,2006-12-05 10:35:00,2006-12-05 10:45:00,930
9,10,X8.3,2003-11-02 17:03:00,2003-11-02 17:25:00,2003-11-02 17:39:00,486


In [216]:
# Start of part 3. Use the same process at part1 to start
res = requests.get("https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html")
soup = BeautifulSoup(res.content, "html.parser")
# print (soup.prettify)

In [217]:
# This link does NOT have a marked table. Must extract as text instead
nasa_tab = soup.find('pre')
txt = nasa_tab.get_text()
# txt

In [218]:
# Since each row of data appears on a new line, split this string by line
rows = txt.split('\n')

# Now remove the unnecessary beginning and ending lines
for i in range(12):
    rows.pop(0);
for i in range(2):
    rows.pop(len(rows) - 1)

# Create a table with the proper columns 
nasa_data = pd.DataFrame(columns=['Start_Date', 'Start_Time', 'End_Date', 'End_Time', 'Start_Freq', 
'End_Freq', 'Flare_Loc', 'Flare_Region','Flare_Class', 'CME_Date', 
'CME_Time', 'CME_Angle', 'CME_Width', 'CME_Speed', 'Plot'], index = range(0,len(rows)))

# Now load each line into the newly created dataframe
row_index = 0
for temp in rows:
    row = temp.split(' ')
    while '' in row:
        row.remove('')
    col_index = 0
    while col_index < 15:
        nasa_data.iat[row_index, col_index] = row[col_index]
        col_index += 1
    row_index += 1
nasa_data
# End of step 3

Unnamed: 0,Start_Date,Start_Time,End_Date,End_Time,Start_Freq,End_Freq,Flare_Loc,Flare_Region,Flare_Class,CME_Date,CME_Time,CME_Angle,CME_Width,CME_Speed,Plot
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74,79,312,PHTX
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,Halo,360,878,PHTX
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,Halo,360,464,PHTX
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263,165,296,PHTX
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133,155,712,PHTX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,2017/09/17,11:45,09/17,12:35,16000,900,S08E170,-----,----,09/17,12:00,Halo,360,1385,PHTX
518,2017/10/18,05:48,10/18,12:40,16000,400,S06E123,-----,----,10/18,08:00,85,146,1001,PHTX
519,2019/05/03,23:52,05/04,00:16,13000,2300,N12E82,12740,C1.0,05/03,23:24,90,113,692,PHTX
520,2020/11/29,13:07,11/29,15:23,14000,850,S23E89,-----,M4.4,11/29,13:25,Halo,360,2077,----


In [219]:
# Start by recoding the dashes as NaN
nasa_data = nasa_data.replace('----', 'NaN')
nasa_data = nasa_data.replace('-----', 'NaN')
nasa_data = nasa_data.replace('????', 'NaN')
nasa_data = nasa_data.replace('--', 'NaN')
nasa_data = nasa_data.replace('--:--', 'NaN')
nasa_data = nasa_data.replace('--/--', 'NaN')

# Create new halo column and fill it with booleans checked against CME angles
nasa_data['Halo'] = nasa_data['CME_Angle'].map(lambda x: x == 'Halo')

# Replace Halos with NA in CME angle column
nasa_data = nasa_data.replace('Halo', "NA")

# Removing the > from the width columns and instead adding a col that determines lower bound
nasa_data['Lower_Bound'] = nasa_data['CME_Width'].map(lambda x: str(x).__contains__('>'))
nasa_data['CME_Width'] = nasa_data['CME_Width'].map(lambda x: x if str(x)[0] != '>' else str(x)[1:])

# Adjust rows to allow datetime formatting
nasa_data['Start_DT'] = pd.to_datetime(nasa_data['Start_Date'] + ' ' + nasa_data['Start_Time'])

# Prepend a year to the end dates and CME dates
for i in range(len(nasa_data.index)):
    temp = nasa_data.values[i][0].split('/')
    year = str(temp[0]) + '/'
    nasa_data.iat[i, 2] = year + nasa_data.iat[i, 2]
    nasa_data.iat[i,9] = year + nasa_data.iat[i,9]

# Set the end columns to datetime
nasa_data['End_Time'] = nasa_data['End_Time'].replace('24:00', '0:00')
nasa_data['End_DT'] = pd.to_datetime(nasa_data['End_Date'] + ' ' + nasa_data['End_Time'])

# Set the cme columns to datetime
nasa_data['CME_DT'] = ''
for i in range(len(nasa_data.index)):
    if nasa_data.iat[i,9] != 'NaN' and nasa_data.iat[i,10] != 'NaN':
       nasa_data.iat[i, 19] = pd.to_datetime(nasa_data.iat[i,9] + ' ' + nasa_data.iat[i,10])

# Drop and rearrange columns to match example 
nasa_data = nasa_data.drop(columns = ['Start_Date', 'Start_Time', 'End_Date', 'End_Time', 'CME_Date', 'CME_Time'])
nasa_data = nasa_data.rename(columns = {'Flare_Class': 'Importance', 'CME_Angle': 'Angle', 
                            'CME_Speed': 'Speed', 'CME_Width': 'Width'})
nasa_data = nasa_data[['Start_DT', 'End_DT', 'Start_Freq', 'End_Freq', 'Flare_Loc', 'Flare_Region', 
                       'Importance', 'CME_DT', 'Angle', 'Width', 'Speed', 'Plot', 'Halo', 'Lower_Bound']]
nasa_data
# End of step 4
# End of part 1

Unnamed: 0,Start_DT,End_DT,Start_Freq,End_Freq,Flare_Loc,Flare_Region,Importance,CME_DT,Angle,Width,Speed,Plot,Halo,Lower_Bound
0,1997-04-01 14:00:00,1997-04-01 14:15:00,8000,4000,S25E16,8026,M1.3,1997-04-01 15:18:00,74,79,312,PHTX,False,False
1,1997-04-07 14:30:00,1997-04-07 17:30:00,11000,1000,S28E19,8027,C6.8,1997-04-07 14:27:00,,360,878,PHTX,True,False
2,1997-05-12 05:15:00,1997-05-14 16:00:00,12000,80,N21W08,8038,C1.3,1997-05-12 05:30:00,,360,464,PHTX,True,False
3,1997-05-21 20:20:00,1997-05-21 22:00:00,5000,500,N05W12,8040,M1.3,1997-05-21 21:00:00,263,165,296,PHTX,False,False
4,1997-09-23 21:53:00,1997-09-23 22:16:00,6000,2000,S29E25,8088,C1.4,1997-09-23 22:02:00,133,155,712,PHTX,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,2017-09-17 11:45:00,2017-09-17 12:35:00,16000,900,S08E170,,,2017-09-17 12:00:00,,360,1385,PHTX,True,False
518,2017-10-18 05:48:00,2017-10-18 12:40:00,16000,400,S06E123,,,2017-10-18 08:00:00,85,146,1001,PHTX,False,False
519,2019-05-03 23:52:00,2019-05-04 00:16:00,13000,2300,N12E82,12740,C1.0,2019-05-03 23:24:00,90,113,692,PHTX,False,False
520,2020-11-29 13:07:00,2020-11-29 15:23:00,14000,850,S23E89,,M4.4,2020-11-29 13:25:00,,360,2077,,True,False


In [220]:
# Isolate rows with max intensity flares 
# (only 50. Also technically already sorted at this point)
data = data.loc[data['X_class'].str.contains('X')]

# Remove X from classifications and sort as float
data['X_class'] = data['X_class'].str.lstrip('X')
data['X_class'] = data['X_class'].str.rstrip('+')
data['X_class'] = data.X_class.astype(float)
data = data.head(50).sort_values('X_class', ascending= False)

# Add the X back on for later comparision in part 2
data['X_class'] = data.X_class.astype(str)
data['X_class'] = 'X' + data['X_class']

data

Unnamed: 0,Rank,X_class,Start_DT,Max_DT,End_DT,Region
0,1,X28.0,2003-11-04 19:29:00,2003-11-04 19:53:00,2003-11-04 20:06:00,486
1,2,X20.0,2001-04-02 21:32:00,2001-04-02 21:51:00,2001-04-02 22:03:00,9393
2,3,X17.2,2003-10-28 09:51:00,2003-10-28 11:10:00,2003-10-28 11:24:00,486
3,4,X17.0,2005-09-07 17:17:00,2005-09-07 17:40:00,2005-09-07 18:03:00,808
4,5,X14.4,2001-04-15 13:19:00,2001-04-15 13:50:00,2001-04-15 13:55:00,9415
5,6,X10.0,2003-10-29 20:37:00,2003-10-29 20:49:00,2003-10-29 21:01:00,486
6,7,X9.4,1997-11-06 11:49:00,1997-11-06 11:55:00,1997-11-06 12:01:00,8100
7,8,X9.3,2017-09-06 11:53:00,2017-09-06 12:02:00,2017-09-06 12:10:00,2673
8,9,X9.0,2006-12-05 10:18:00,2006-12-05 10:35:00,2006-12-05 10:45:00,930
9,10,X8.3,2003-11-02 17:03:00,2003-11-02 17:25:00,2003-11-02 17:39:00,486


In [221]:
# Now do the same for the NASA data 
n_top = nasa_data.loc[nasa_data['Importance'].str.contains('X')].copy(deep= True)

# Remove X from classifications and sort as float
n_top['Importance'] = n_top['Importance'].str.lstrip('X')
n_top['Importance'] = n_top['Importance'].str.rstrip('+')
n_top['Importance'] = n_top.Importance.astype(float)
n_top = n_top.head(50).sort_values('Importance', ascending= False)

# Add the X back on, will be useful in part 2
n_top['Importance'] = n_top.Importance.astype(str)
n_top['Importance'] = 'X' + n_top['Importance']
n_top

Unnamed: 0,Start_DT,End_DT,Start_Freq,End_Freq,Flare_Loc,Flare_Region,Importance,CME_DT,Angle,Width,Speed,Plot,Halo,Lower_Bound
117,2001-04-02 22:05:00,2001-04-03 02:30:00,14000,250,N19W72,9393,X20.0,2001-04-02 22:06:00,261.0,244,2505,PHTX,False,False
233,2003-10-28 11:10:00,2003-10-29 00:00:00,14000,40,S16E08,10486,X17.0,2003-10-28 11:30:00,,360,2459,PHTX,True,False
126,2001-04-15 14:05:00,2001-04-16 13:00:00,14000,40,S20W85,9415,X14.0,2001-04-15 14:06:00,245.0,167,1199,PHTX,False,False
234,2003-10-29 20:55:00,2003-10-29 00:00:00,11000,500,S15W02,10486,X10.0,2003-10-29 20:54:00,,360,2029,PHTX,True,False
8,1997-11-06 12:20:00,1997-11-07 08:30:00,14000,100,S18W63,8100,X9.4,1997-11-06 12:10:00,,360,1556,PHTX,True,False
237,2003-11-02 17:30:00,2003-11-03 01:00:00,12000,250,S14W56,10486,X8.3,2003-11-02 17:30:00,,360,2598,PHTX,True,False
82,2000-07-14 10:30:00,2000-07-15 14:30:00,14000,80,N22W07,9077,X5.7,2000-07-14 10:54:00,,360,1674,PHTX,True,False
121,2001-04-06 19:35:00,2001-04-07 01:50:00,14000,230,S21E31,9415,X5.6,2001-04-06 19:30:00,,360,1270,PHTX,True,False
135,2001-08-25 16:50:00,2001-08-25 23:00:00,8000,170,S17E34,9591,X5.3,2001-08-25 16:50:00,,360,1433,PHTX,True,False
193,2002-07-23 00:50:00,2002-07-23 04:00:00,11000,400,S13E72,10039,X4.8,2002-07-23 00:42:00,,360,2285,PHTX,True,False


In [222]:
# The data is replicable. Many of the highest intensity flares are shown in both tables
# along with similar regions even though the time is a little off

# End of step 1

In [223]:
# For step 2, I define a matching row as one that has the same intensity (x-class)
# and the same region. Also wanted to use time but the minutes are all off 

# Create new column for rows that match
nasa_data['SW_Rank'] = 'NaN'

# Create fxn 
def matcher(index):
    for temp, row in data.iterrows():
        # First check the classifications
        if nasa_data.iat[index, 6] == row[1]:
            # Now check the region
            if float((nasa_data.iat[index,5])[-4:]) == float(row[5]):
                # If match, then return the rank from the sw data
                return row['Rank']
    return 'NaN'

# Apply the matcher to each row in the nasa table and adjust the new column 
for i,row2 in nasa_data.iterrows():
    nasa_data.iat[i,14] = matcher(i)

nasa_data

Unnamed: 0,Start_DT,End_DT,Start_Freq,End_Freq,Flare_Loc,Flare_Region,Importance,CME_DT,Angle,Width,Speed,Plot,Halo,Lower_Bound,SW_Rank
0,1997-04-01 14:00:00,1997-04-01 14:15:00,8000,4000,S25E16,8026,M1.3,1997-04-01 15:18:00,74,79,312,PHTX,False,False,
1,1997-04-07 14:30:00,1997-04-07 17:30:00,11000,1000,S28E19,8027,C6.8,1997-04-07 14:27:00,,360,878,PHTX,True,False,
2,1997-05-12 05:15:00,1997-05-14 16:00:00,12000,80,N21W08,8038,C1.3,1997-05-12 05:30:00,,360,464,PHTX,True,False,
3,1997-05-21 20:20:00,1997-05-21 22:00:00,5000,500,N05W12,8040,M1.3,1997-05-21 21:00:00,263,165,296,PHTX,False,False,
4,1997-09-23 21:53:00,1997-09-23 22:16:00,6000,2000,S29E25,8088,C1.4,1997-09-23 22:02:00,133,155,712,PHTX,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,2017-09-17 11:45:00,2017-09-17 12:35:00,16000,900,S08E170,,,2017-09-17 12:00:00,,360,1385,PHTX,True,False,
518,2017-10-18 05:48:00,2017-10-18 12:40:00,16000,400,S06E123,,,2017-10-18 08:00:00,85,146,1001,PHTX,False,False,
519,2019-05-03 23:52:00,2019-05-04 00:16:00,13000,2300,N12E82,12740,C1.0,2019-05-03 23:24:00,90,113,692,PHTX,False,False,
520,2020-11-29 13:07:00,2020-11-29 15:23:00,14000,850,S23E89,,M4.4,2020-11-29 13:25:00,,360,2077,,True,False,


In [225]:
# Check to make sure the last cell actually made modifications
checker = nasa_data.loc[~(nasa_data['SW_Rank'] == 'NaN')]
checker

# End of question 3

Unnamed: 0,Start_DT,End_DT,Start_Freq,End_Freq,Flare_Loc,Flare_Region,Importance,CME_DT,Angle,Width,Speed,Plot,Halo,Lower_Bound,SW_Rank
8,1997-11-06 12:20:00,1997-11-07 08:30:00,14000,100,S18W63,8100,X9.4,1997-11-06 12:10:00,,360.0,1556.0,PHTX,True,False,7
19,1998-05-06 08:25:00,1998-05-06 08:35:00,14000,5000,S11W65,8210,X2.7,1998-05-06 08:29:00,309.0,190.0,1099.0,PHTX,False,False,48
82,2000-07-14 10:30:00,2000-07-15 14:30:00,14000,80,N22W07,9077,X5.7,2000-07-14 10:54:00,,360.0,1674.0,PHTX,True,False,17
104,2000-11-26 17:00:00,2000-11-26 17:15:00,14000,7000,N18W38,9236,X4.0,2000-11-26 17:06:00,,360.0,980.0,PHTX,True,False,26
121,2001-04-06 19:35:00,2001-04-07 01:50:00,14000,230,S21E31,9415,X5.6,2001-04-06 19:30:00,,360.0,1270.0,PHTX,True,False,18
135,2001-08-25 16:50:00,2001-08-25 23:00:00,8000,170,S17E34,9591,X5.3,2001-08-25 16:50:00,,360.0,1433.0,PHTX,True,False,22
142,2001-09-24 10:45:00,2001-09-25 20:00:00,7000,30,S16E23,9632,X2.6,2001-09-24 10:30:00,,360.0,2402.0,PHTX,True,False,50
192,2002-07-20 21:30:00,2002-07-20 22:20:00,10000,2000,S13E90,10039,X3.3,2002-07-20 22:06:00,,360.0,1941.0,PHTX,True,False,37
193,2002-07-23 00:50:00,2002-07-23 04:00:00,11000,400,S13E72,10039,X4.8,2002-07-23 00:42:00,,360.0,2285.0,PHTX,True,False,25
201,2002-08-24 01:45:00,2002-08-24 03:25:00,5000,400,S02W81,10069,X3.1,2002-08-24 01:27:00,,360.0,1913.0,PHTX,True,False,41
