## Process and display data from ARROW

1. Get filenames and read in data - Done
2. Aggregate baseline data - Done
3. Normalise data using baseline obs - Done
4. Convert frequency to velocity - Done
5. Save data + headers to file - Done
6. Aggregate data - Done
7. Display data
8. Think of at least a dozen synonyms for data so there aren't duplicate variables - Done



#### Various imports


In [35]:
import os #filesystem stuff
from operator import itemgetter #to help with complex list sort
import numpy as np
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.models.tools import HoverTool

#### Some sort of function that uses the file name to read in the data and returns a list of tuples containing `[(angle, data, header),...]`  

In [20]:
def read_ARROW_data(path, filename):
    """Reads in and partially processes an  ARROW spectrum
    
    The spectrum file contains a number of header lines indicated by `#' or blanks. 
    This function splits these from the main data and returns both 
            
    Parameters
    ----------
    filename : str
    path : str
       filepath
    
    Returns
    -------
    dat : class: pandas.DataFrame
        Spectrum data
    Header lines : list of str
        List of header lines
    """
     
    # Read lines till first line not starting with #, or whitespace.
    # Store these as a list
    path_filename=path+'\\'+filename
    angle=filename.split('_')[0]
    header_list = []
    number_header_lines = 0
    dat = None
    with open(path_filename) as f:
        line = f.readline()
        while line[0] == '#' or line[0] == ',' or line[0].isspace():
            header_list.append(line)
            number_header_lines += 1
            line = f.readline()
        dat = pd.read_csv(path_filename, header = number_header_lines, skipinitialspace = True, index_col='frequency')

    return (angle, dat, header_list,filename)

####  Processes the data to take the average of sets of obervations

In [21]:
def aggregate_d(data_list,ang): 
    """ Takes in a list of dataframes and an angle and contatenates the results from a 
    number of observation and returns a df with the averaged result
        
    Parameters
    ----------
    data_list : list, str
    ang: str
    
    Returns
    -------
    dat : class: pandas.DataFrame
        Averaged spectrum data
    """
    x=[data for angle, data, header,filename in data_list if angle.startswith(ang)]
    df=pd.concat((x),axis=1)
    df.columns=[item + '_'+  str(n+1) for n,item in enumerate(df.columns)]
    df['mean intensity']=df.mean(axis=1)
    return df

#### Subtract the baseline data and save and display normalised data

In [22]:
def normalise_data(data, baseline):
    """ Takes in a list of angle, dataFrame and a pandas series of the baseline observation and
    returns a new data frame with the baseline and normalised_ data added as columns. 
            
    Parameters
    ----------
    data : tuple, str, pandas.DataFrame
    baseline: pandas.Series
    
    Returns
    -------
    new_df : class: pandas.DataFrame
    """
        
    if data[0] !='BG':
        new_df=data[1]
        new_df['baseline']=baseline
        new_df['norm_data']=new_df['intensity']-new_df['baseline']
        return new_df

#### Convert frequency to velocity

In [23]:
def freq_to_vel(freq, f0=1420.4e6):
    ''' Takes a frequency value (or Pandas Dataframe column or Series) and returns
    a velocity value (or new Dataframe column of values). f0 is the rest
    frequency and defaults to 1420.4 MHz'''
    
    # We need a value for 'c' - thespeed of light. 
    # You can either define it explicitly here or alternatively, use the 
    # Astropy constants
    c = 299792458.0  #m/s
    v=-(freq*c)/f0    
    return v/1000  #(km/s)     

#### Saves data list items back to file

In [24]:
def save_data(data_list):
    
    '''Takes the data list and writes the header and dataframe out to a csv file making a directory if necessory
        
        Parameters
        ----------
        datalist : list [list, pandas.DataFrame]

        Returns
        -------
        void  
    '''
    try: #Easier to ask forgiveness than seek permission
        os.mkdir('D:\\OU2\\OneDrive - The Open University\\OU 2021\\SXPS277\\python\\ARROW\\proc_data')
    except:
        pass
    save_path = 'D:\\OU2\\OneDrive - The Open University\\OU 2021\\SXPS277\\python\\ARROW\\proc_data'
    for _,df,header,fn in data_list:
        new_fn= fn[:-4] +"_proc.csv"
        completeName = os.path.join(save_path, new_fn)
        with open(completeName, mode='w') as data_file:
            data_file.write("".join(header))
        df.to_csv(completeName, mode='a')

#### Loops over a list of angles and calls the aggregating list function for each group of obs
There's currently two functions doing much the same thing for the background data and the main data as tey need slighty different treatment - not 100% staidfoctory but hey ho.

In [25]:
def agg(obs_list):
    #This is for the background obs
    angle_set={angle[0][0:2] if angle[0].startswith('BG') else angle[0] for angle in obs_list} #using set to remove dupes

    #sorts back into list - sets are unordered.  Function in a function - tut. 
    return sorted([[ang,aggregate_d(obs_list, ang)] for ang in angle_set],key=itemgetter(0)) 
    

#### Pretty much thesame just calls a different function to do the processing

In [26]:
def aggregate2(data_list):
    angle_set={angle[0] for angle in data_list}
    #sorts back into list - sets are unordered
    return sorted([[angle,process_data(angle,data_list)]for angle in angle_set],key=itemgetter(0))

#### Takes the main datalist and an angle and returns an aggregated  dataframe for each angle

In [27]:
def process_data(ang,datalist):
    ''' Stuff here I suppose
    '''
    dataframe= datalist[0][1]
    x=[data for angle, data,_,_ in data_list if angle.startswith(ang)]
    
    # smashes everythign together into a big df
    for frame in x:
        dataframe = pd.merge(dataframe,frame, on=['baseline','frequency','velocity'])     
  
    #filters for intensity and normalised data
    cols= [col for col in dataframe.columns if not(col.startswith('int'))]
    n_cols=[col for col in dataframe.columns if not(col.startswith('norm'))]
    
    #chops up the big df and puts it together with the means of intensity and normalised intensity 
    mean_int=dataframe.drop(cols, axis =1).mean(axis=1)
    int_norm=dataframe.drop(n_cols, axis =1).mean(axis=1)
    new_df=pd.DataFrame({'mean_int':mean_int, 'mean_norm':int_norm})
    new_df['velocity']=dataframe['velocity']
    new_df['baseline']=dataframe['baseline']
    return new_df

#### Get the filenames from the specified directory

In [28]:
basepath = 'D:\\OU2\\OneDrive - The Open University\\OU 2021\\SXPS277\\python\\ARROW\\sample_data'
with os.scandir(basepath) as entries:
    files=[entry.name for entry in entries if entry.name.find('.csv') !=-1] 
    background_list=[read_ARROW_data(basepath, file) for file in files if file.startswith("BG")]
    data_list=[read_ARROW_data(basepath, file) for file in files if not(file.startswith("BG"))]


####  Calls the function to aggregate the background data

In [29]:
agg_bg=agg(background_list)


#### Iterates over the sorted data list and calls the normalise_data function on each item

In [30]:
baseline=agg_bg[0][1]['mean intensity'] #plucks the mean background values from the df

norm_data_list=[(data[0],normalise_data(data,baseline),data[2],data[3])for data in data_list]


#### Converts frequency to velocity

In [31]:
for item in norm_data_list:
    item[1]['velocity']=freq_to_vel(item[1].index)

#### Saves data (duh)

In [32]:
save_data(norm_data_list)

#### To do

1. Aggregate data - done
2. Do the display stuff

Strategy for aggregaton is to make one big df from each set of obs, then subsetting to get teh means of intensity and teh normalised intensity and then bulding a new df from that lot. 

#### Display the data as tables and as graphs

In [33]:
# To do

Other stuff 

In [38]:
display_data = aggregate2(norm_data_list)

In [71]:
angle,df = display_data[0]

In [72]:
angle

'030'

In [77]:
df

Unnamed: 0_level_0,mean_int,mean_norm,velocity,baseline
frequency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-800000,1.895,0.0040,168.849596,1.8910
-795000,1.885,0.0185,167.794286,1.8665
-790000,1.855,0.0045,166.738976,1.8505
-785000,1.863,0.0295,165.683666,1.8335
-780000,1.853,0.0285,164.628356,1.8245
...,...,...,...,...
1180000,2.461,0.0235,-249.053154,2.4375
1185000,2.476,0.0345,-250.108464,2.4415
1190000,2.473,0.0380,-251.163774,2.4350
1195000,2.490,0.0390,-252.219084,2.4510


In [78]:
output_notebook() # This just makes sure the ouput appears here, in the notebook
p = figure(title = f"Frequency vs Intesity at {angle[1:3]} degrees latitude", 
          x_axis_label = 'velocity (km \ s)', 
          y_axis_label = 'Intensity')
# Now complete the next line given the information given just above
p.line(
    df["velocity"],
    df["mean_norm"])
show(p)