## Process and display data from ARROW

1. Get filenames and read in data - Done
2. Aggregate baseline data - Done
3. Normalise data using baseline obs - Done
4. Convert frequency to velocity - Done
5. Save data + headers to file - Done
6. Aggregate data - Done
7. Display data
8. Think of at least a dozen synonyms for data so there aren't duplicate variables - Done



## Imports


In [34]:
import os #filesystem stuff
import fnmatch
from operator import itemgetter #to help with complex list sort
#from astropy import units as u
import numpy as np
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.models.tools import HoverTool
#from astropy import constants as const

## Functions

#### Reads the data

In [35]:
def read_ARROW_data(path, filename):
    """Reads in and partially processes an  ARROW spectrum
    
    The spectrum file contains a number of header lines indicated by `#' or blanks. 
    This function splits these from the main data and returns both 
            
    Parameters
    ----------
    filename : str
    path : str
       filepath
    
    Returns
    -------
    dat : class: pandas.DataFrame
        Spectrum data
    Header lines : list of str
        List of header lines
    """
     
    # Read lines till first line not starting with #, or whitespace.
    # Store these as a list
    #path_filename=path+'\\'+filename
    path_filename=os.path.join(path, filename)
    angle=filename.split('_')[0]
    header_list = []
    number_header_lines = 0
    dat = None
    with open(path_filename) as f:
        line = f.readline()
        while line[0] == '#' or line[0] == ',' or line[0].isspace():
            header_list.append(line)
            number_header_lines += 1
            line = f.readline()
        dat = pd.read_csv(path_filename, header = number_header_lines, skipinitialspace = True, index_col='frequency')

    return (angle, dat, header_list,filename)

####  Processes the data to take the average of sets of obervations

In [36]:
def aggregate_d(data_list,ang): 
    """ Takes in a list of dataframes and an angle and contatenates the results from a 
    number of observation and returns a df with the averaged result
        
    Parameters
    ----------
    data_list : list, str
    ang: str
    
    Returns
    -------
    dat : class: pandas.DataFrame
        Averaged spectrum data
    """
    x=[data for angle, data, header,filename in data_list if angle.startswith(ang)]
    df=pd.concat((x),axis=1)
    df.columns=[item + '_'+  str(n+1) for n,item in enumerate(df.columns)]
    df['mean intensity']=df.mean(axis=1)
    return df

#### Subtract the baseline data and save and display normalised data

In [37]:
def normalise_data(data, baseline):
    """ Takes in a list of angle, dataFrame and a pandas series of the baseline observation and
    returns a new data frame with the baseline and normalised_ data added as columns. 
            
    Parameters
    ----------
    data : tuple, str, pandas.DataFrame
    baseline: pandas.Series
    
    Returns
    -------
    new_df : class: pandas.DataFrame
    """
        
    if data[0] !='BG':
        new_df=data[1]
        new_df['baseline']=baseline
        new_df['norm_data']=new_df['intensity']-new_df['baseline']
        return new_df

#### Convert frequency to velocity

In [38]:
def freq_to_vel(freq, f0=1420.4e6):
    ''' Takes a frequency value (or Pandas Dataframe column or Series) and returns
    a velocity value (or new Dataframe column of values). f0 is the rest
    frequency and defaults to 1420.4 MHz'''

    C = 299792458.0  #m/s
    #or use C=const.c 
    v=-(freq*C)/f0
    return v/1000  #(km/s)     

#### Saves data list items back to file

In [39]:
def save_data(data_list):
    
    '''Takes the data list and writes the header and dataframe out to a csv file making a directory if necessory
        
        Parameters
        ----------
        datalist : list [list, pandas.DataFrame]

        Returns
        -------
        void  
    '''
    try: #Easier to ask forgiveness than seek permission
        os.mkdir('.\\proc_data')
    except:
        pass
    save_path = '.\\proc_data\\'
    for _,df,header,fn in data_list:
        new_fn= fn[:-4] +"_proc.csv"
        completeName = os.path.join(save_path, new_fn)
        with open(completeName, mode='w') as data_file:
            data_file.write("".join(header))
        df.to_csv(completeName, mode='a')

#### Loops over a list of angles and calls the aggregating list function for each group of obs
There's currently two functions doing much the same thing for the background data and the main data as they need slighty different treatment - not 100% satisfactory but hey ho.


In [40]:
def agg(obs_list):
    #This is for the background obs
    angle_set={angle[0][0:2] if angle[0].startswith('BG') else angle[0] for angle in obs_list} #using set to remove dupes

    #sorts back into list - sets are unordered.  Function in a function - tut. 
    return sorted([[ang,aggregate_d(obs_list, ang)] for ang in angle_set],key=itemgetter(0)) 
    

#### Pretty much the same just calls a different function to do the processing

In [41]:
def aggregate2(data_list):
    angle_set={angle[0] for angle in data_list}
    #sorts back into list - sets are unordered
    return sorted([[angle,process_data(angle,data_list)]for angle in angle_set],key=itemgetter(0))

#### Takes the main datalist and an angle and returns an aggregated  dataframe for each angle

In [42]:
def process_data(ang,datalist):
    ''' Stuff here I suppose
    '''
    dataframe= datalist[0][1]
    x=[data for angle, data,_,_ in data_list if angle.startswith(ang)]
    
    # smashes everythign together into a big df
    for frame in x:
        dataframe = pd.merge(dataframe,frame, on=['baseline','frequency','velocity'])     
  
    #filters for intensity and normalised data
    cols= [col for col in dataframe.columns if not(col.startswith('int'))]
    n_cols=[col for col in dataframe.columns if not(col.startswith('norm'))]
    
    #chops up the big df and puts it together with the means of intensity and normalised intensity 
    mean_int=dataframe.drop(cols, axis =1).mean(axis=1)
    int_norm=dataframe.drop(n_cols, axis =1).mean(axis=1)
    new_df=pd.DataFrame({'mean_int':mean_int, 'mean_norm':int_norm})
    new_df['velocity']=dataframe['velocity']
    new_df['baseline']=dataframe['baseline']
    return new_df

## Main

#### Get the filenames from the specified directory and sends a list of them off to be read.


In [43]:
basepath = '.\\data\\'

#gets a list of csv file
files=[entry.name for entry in os.scandir(basepath) if fnmatch.fnmatch(entry.name, '*.csv')]

#processes the background data files
background_list=[read_ARROW_data(basepath, file) for file in files if fnmatch.fnmatch(file, 'BG*')] 

#processes the target data files
data_list=[read_ARROW_data(basepath, file) for file in files if not(fnmatch.fnmatch(file, 'BG*'))]        

####  Calls the function to aggregate the background data

In [81]:
agg_bg=agg(background_list)

#### Iterates over the sorted data list and calls the normalise_data function on each item

In [46]:
baseline=agg_bg[0][1]['mean intensity'] #plucks the mean background values from the df

norm_data_list=[(data[0],normalise_data(data,baseline),data[2],data[3])for data in data_list]

#### Converts frequency to velocity

In [48]:
for item in norm_data_list:
    item[1]['velocity']=freq_to_vel(item[1].index)

#### Saves data (duh)

In [49]:
save_data(norm_data_list)

## Display the data as graphs

#### Calls the function to aggregate the data into one df.

In [80]:
display_data = aggregate2(norm_data_list)

#### Sets up the variables for the graphs

In [52]:
angle,df40 = display_data[0]
angle,df50 = display_data[1]
angle,df60 = display_data[2]
angle,df70 = display_data[3]
angle,df80 = display_data[4]

#### Outputs all the data on one graph with hiding enabled

In [72]:
output_notebook()

p1 = figure(title = "Spectral data from Galactic observations with baseline subtracted", 
            plot_width=800, plot_height=400,
          x_axis_label='Velocity (km / s)', 
          y_axis_label='Intensity')
p1.line(df40['velocity'],df40['mean_int']-df40['baseline'], legend_label='l=40')
p1.line(df50['velocity'],df50['mean_int']-df40['baseline'], color='red', legend_label='l=50')
p1.line(df60['velocity'],df60['mean_int']-df40['baseline'], color='green', legend_label='l=60')
p1.line(df70['velocity'],df70['mean_int']-df40['baseline'], legend_label='l=70', color='purple')
p1.line(df80['velocity'],df80['mean_int']-df40['baseline'], legend_label='l=80', color='orange')
p1.add_tools(HoverTool(mode='vline'))
p1.legend.location = "top_left"
p1.legend.click_policy="hide"
show(p1)

#### Outputs a multi-graph grid

In [78]:
from bokeh.layouts import gridplot
from bokeh.models import Range1d

# Let's just set up an 'x' value once here.
xvals = df40['velocity']

s1 = figure(plot_width=400, plot_height=300, title='l = 40',
            x_axis_label='Velocity (kms^-1)', 
            y_axis_label='Intensity')
s1.line(xvals,df40['mean_int'], color='red')
s2 = figure(plot_width=400, plot_height=300, title='l = 50',
            x_axis_label='Velocity (kms^-1)', 
            y_axis_label='Intensity')
s2.line(xvals,df50['mean_int'], color='red')
s3 = figure(plot_width=400, plot_height=300, title='l = 60',
            x_axis_label='Velocity (kms^-1)', 
            y_axis_label='Intensity')
s3.line(xvals,df60['mean_int'], color='red')
s4 = figure(plot_width=400, plot_height=300, title='l = 70',
            x_axis_label='Velocity (kms^-1)', 
            y_axis_label='Intensity')
s4.line(xvals,df70['mean_int'],color='red')
s5 = figure(plot_width=400, plot_height=300, title='l = 80',
            x_axis_label='Velocity (kms^-1)', 
            y_axis_label='Intensity')
s5.line(xvals,df80['mean_int'],color='red')
s6 = figure(plot_width=400, plot_height=300, title='Baseline - (NEP)',
            x_axis_label='Velocity (kms^-1)', 
            y_axis_label='Intensity')
s6.line(xvals, df70['baseline'], color='purple')
s6.add_tools(HoverTool(mode='vline'))
6.y_range = Range1d(2,3)  # Matches the scale of the baseline graph to the other graphhs.

grid = gridplot([[s1,s2],[s3,s4], [s5,s6]])
show(grid)



#### Odds and sods

In [57]:
test=pd.DataFrame(columns=['this column', 'that column'])

In [58]:
list=[{'Latitude':angle, 'Max velocity': round(df.loc[df['mean_norm'].idxmax()][2],3)} for angle,df in display_data]
#df50.loc[df80['mean_norm'].idxmax()][2]

In [59]:
max_vel=pd.DataFrame(list)

In [60]:
max_vel.set_index('Latitude')

Unnamed: 0_level_0,Max velocity
Latitude,Unnamed: 1_level_1
40,35.881
50,7.387
60,7.387
70,1.055
80,-3.166


In [61]:
range(12)

range(0, 12)

In [62]:
for i in range(1):
    print(i)

0


In [63]:
A=['a','b','c']
B=['is for apple', 'is for bytecode', "is for C++"]
C=zip(A,B)

In [64]:
print(C)

<zip object at 0x00000215884A93C0>


In [65]:
# for item, item2 in C:
#     print(f'{item} {item2}')

In [66]:
D=enumerate(C)
for item,(item1,item2) in D:
    print(f'{item+1}. {item1} {item2}')

1. a is for apple
2. b is for bytecode
3. c is for C++
