# Dataset Metadata and General Description 

In [None]:
'''
Flare - Numeric - An ID number, ymmddnn, e.g., 2042101 is the first flare found for 21-Apr-2002. 
        These numbers are not time ordered.
Start.date - DateTime - The date when the flare occurred
Start.time - DateTime - Flare start time
Peak - DateTime - Flare peak time
End - String - Flare end time
Dur[s] - Numeric - Duration of flare in seconds
Peak[c/s] - Numeric - Peak count rate in corrected counts, peak counts/second
Total Counts - Numeric -  Total of counts in corrected counts, counts in energy range
Energy [keV] - String - The highest energy band in which the flare was observed.
X pos [asec] -  Numeric -  Flare position in arcsec from sun center
Y pos [asec] - Numeric -  Flare position in arcsec from sun center
Radial [asec] - Numeric - Radial distance in arcsec from sun center
active.region.ar - String - No Description
flag.1 - String - No Description
flag.2 - String - No Description
flag.3 - String - No Description
flag.4 -  String - No Description
flag.5 -  String - No Description

###################### Flags - Quality Codes ######################
    Flare Flag Codes:
    List item
        a0 - In attenuator state 0 (None) sometime during flare
        a1 - In attenuator state 1 (Thin) sometime during flare
        a2 - In attenuator state 2 (Thick) sometime during flare
        a3 - In attenuator state 3 (Both) sometime during flare
        An - Attenuator state (0=None, 1=Thin, 2=Thick, 3=Both) at peak of flare
        DF - Front segment counts were decimated sometime during flare
        DR - Rear segment counts were decimated sometime during flare
        ED - Spacecraft eclipse (night) sometime during flare
        EE - Flare ended in spacecraft eclipse (night)
        ES - Flare started in spacecraft eclipse (night)
        FE - Flare ongoing at end of file
        FR - In Fast Rate Mode
        FS - Flare ongoing at start of file
        GD - Data gap during flare
        GE - Flare ended in data gap
        GS - Flare started in data gap
        MR - Spacecraft in high-latitude zone during flare
        NS - Non-solar event
        PE - Particle event: Particles are present
        PS - Possible Solar Flare; in front detectors, but no position
        Pn - Position Quality: P0 = Position is NOT valid, P1 = Position is valid
        Qn - Data Quality: Q0 = Highest Quality, Q11 = Lowest Quality
        SD - Spacecraft was in SAA sometime during flare
        SE - Flare ended when spacecraft was in SAA
        SS - Flare started when spacecraft was in SAA

'''

In [1]:
from __future__ import division, print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Enlarge the axis-label and tick-label fonts for every figure drawn below.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [2]:
# Load the flare list from the CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv('data/hessi.solar.flare.2002to2016.csv')
df.head(5)

Unnamed: 0,flare,start.date,start.time,peak,end,duration.s,peak.c/s,total.counts,energy.kev,x.pos.asec,y.pos.asec,radial,active.region.ar,flag.1,flag.2,flag.3,flag.4,flag.5
0,2021213,2002-02-12,21:29:56,21:33:38,21:41:48,712,136,167304,12-25,592,-358,692,0,A1,P1,,,
1,2021228,2002-02-12,21:44:08,21:45:06,21:48:56,288,7,9504,6-12,604,-341,694,9811,A1,P1,PE,Q1,
2,2021332,2002-02-13,00:53:24,00:54:54,00:57:00,216,15,11448,6-12,-310,375,487,9825,A1,P1,,,
3,2021308,2002-02-13,04:22:52,04:23:50,04:26:56,244,20,17400,12-25,-277,378,469,9822,A1,P1,,,
4,2021310,2002-02-13,07:03:52,07:05:14,07:07:48,236,336,313392,25-50,-272,390,476,9825,A1,GS,P1,PE,Q2


In [3]:
# Column dtypes and non-null counts; the date/time columns are read as plain
# object (string) columns, not datetimes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113942 entries, 0 to 113941
Data columns (total 18 columns):
flare               113942 non-null int64
start.date          113942 non-null object
start.time          113942 non-null object
peak                113942 non-null object
end                 113942 non-null object
duration.s          113942 non-null int64
peak.c/s            113942 non-null int64
total.counts        113942 non-null int64
energy.kev          113942 non-null object
x.pos.asec          113942 non-null int64
y.pos.asec          113942 non-null int64
radial              113942 non-null int64
active.region.ar    113942 non-null int64
flag.1              113942 non-null object
flag.2              113942 non-null object
flag.3              94035 non-null object
flag.4              93740 non-null object
flag.5              52991 non-null object
dtypes: int64(8), object(10)
memory usage: 15.6+ MB


In [4]:
# Summary statistics for the numeric columns.
# NOTE(review): in the output, x.pos.asec / y.pos.asec bottom out near -10000
# and radial peaks at 14154, far outside the quartile range — possibly sentinel
# or invalid positions; confirm before using these columns.
df.describe()

Unnamed: 0,flare,duration.s,peak.c/s,total.counts,x.pos.asec,y.pos.asec,radial,active.region.ar
count,113942.0,113942.0,113942.0,113942.0,113942.0,113942.0,113942.0,113942.0
mean,10864590.0,493.051728,212.321532,377201.3,-9.752181,-43.129812,689.528892,979.922926
std,9832864.0,433.389295,833.776871,3071203.0,757.017401,404.547666,513.203089,1350.484625
min,2021213.0,8.0,0.0,8.0,-10012.0,-10005.0,0.0,0.0
25%,4110734.0,212.0,28.0,22920.0,-704.0,-249.0,469.0,0.0
50%,11120220.0,364.0,53.0,58574.0,0.0,-75.0,759.0,687.0
75%,14030130.0,628.0,144.0,179442.0,706.0,198.0,946.0,1543.0
max,141216100.0,4444.0,113156.0,435550100.0,1002.0,1012.0,14154.0,9999.0


In [5]:
# Per-column flag: does the column contain at least one missing value?
df.isnull().any()

flare               False
start.date          False
start.time          False
peak                False
end                 False
duration.s          False
peak.c/s            False
total.counts        False
energy.kev          False
x.pos.asec          False
y.pos.asec          False
radial              False
active.region.ar    False
flag.1              False
flag.2              False
flag.3               True
flag.4               True
flag.5               True
dtype: bool

In [6]:
# Number of missing values per column.
df.isnull().sum()

flare                   0
start.date              0
start.time              0
peak                    0
end                     0
duration.s              0
peak.c/s                0
total.counts            0
energy.kev              0
x.pos.asec              0
y.pos.asec              0
radial                  0
active.region.ar        0
flag.1                  0
flag.2                  0
flag.3              19907
flag.4              20202
flag.5              60951
dtype: int64

In [None]:
def heatmap(df,figsize=(25,25),annot_size = 8,cmap=sns.cubehelix_palette(start = 0.2,rot = 0.3,dark = 0.15,light = 0.85,as_cmap = True)):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns take part in the correlation; every
    other column (strings, dates stored as objects, ...) is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated.
    figsize : tuple
        Figure size handed to ``plt.subplots``.
    annot_size : int
        Font size of the coefficient written inside each cell.
    cmap : matplotlib colormap
        Colormap for the cells. The default cubehelix palette is built once,
        when the function is defined.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Select the numeric columns explicitly: pandas >= 2.0 no longer drops
    # object columns silently inside DataFrame.corr() and raises instead.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr = numeric_df.corr()

    _, ax = plt.subplots(1, 1, figsize=figsize)
    sns.heatmap(
        corr,
        cbar=True,
        cbar_kws={'shrink': 0.9},
        annot=True,
        annot_kws={'fontsize': annot_size},
        cmap=cmap,
        ax=ax,  # draw on the axes created above instead of relying on the implicit current axes
    )
    plt.show()

heatmap(df)

In [11]:
# Read the same CSV a second time into an independent frame; the later in-place
# rename targets test_df, so df keeps its original column names.
test_df = pd.read_csv('data/hessi.solar.flare.2002to2016.csv')

In [13]:
# Rename four columns of test_df in place.
# NOTE(review): 'energy.kev' and 'active.region.ar' are both mapped to 'factor',
# so two columns end up sharing one label and test_df['factor'] selects both.
# 'numeric' and 'factor' also read like type labels rather than column names —
# confirm the intended target names.
test_df.rename(columns={'start.date':'Date', 'energy.kev':'factor', 'total.counts':'numeric', 'active.region.ar':'factor'}, inplace=True)

In [15]:
# Preview the first three rows of the renamed frame.
test_df.head(3)

Unnamed: 0,flare,Date,start.time,peak,end,duration.s,peak.c/s,numeric,factor,x.pos.asec,y.pos.asec,radial,factor.1,flag.1,flag.2,flag.3,flag.4,flag.5
0,2021213,2002-02-12,21:29:56,21:33:38,21:41:48,712,136,167304,12-25,592,-358,692,0,A1,P1,,,
1,2021228,2002-02-12,21:44:08,21:45:06,21:48:56,288,7,9504,6-12,604,-341,694,9811,A1,P1,PE,Q1,
2,2021332,2002-02-13,00:53:24,00:54:54,00:57:00,216,15,11448,6-12,-310,375,487,9825,A1,P1,,,


In [16]:
# Column labels of the original frame (df, not the renamed test_df).
df.columns

Index(['flare', 'start.date', 'start.time', 'peak', 'end', 'duration.s',
       'peak.c/s', 'total.counts', 'energy.kev', 'x.pos.asec', 'y.pos.asec',
       'radial', 'active.region.ar', 'flag.1', 'flag.2', 'flag.3', 'flag.4',
       'flag.5'],
      dtype='object')