In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas/numpy that may point at code
# needing an update -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Functions for dataset alignment and interpolation

In [2]:
def add_15min_values(data, valid_dates):
    """Put a time series onto a 15-minute grid using linear interpolation.

    The observations in ``data`` are linearly interpolated at 1-minute
    resolution and sampled at minutes 0, 15, 30 and 45 of every hour.  At
    timestamps where an original observation exists, the observation is kept
    instead of the interpolated value.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations indexed by a ``DatetimeIndex``.  The frame is left
        untouched (a copy is annotated instead).
    valid_dates : collection of datetime.date
        Only rows whose calendar date is contained in this collection are
        kept.

    Returns
    -------
    pandas.DataFrame
        The rows that fall on the 15-minute grid, with the input columns plus:

        * ``predicted`` -- ``False`` for original observations, ``True`` for
          interpolated rows;
        * ``datetime_col`` -- copy of the index;
        * ``next`` -- timestamp of the next original observation (forward
          filled onto interpolated rows and onto the last observation);
        * ``diff`` -- minutes between the surrounding original observations
          (forward filled the same way);
        * ``dist_to_closest_known`` -- minutes to the nearest original
          observation; always 0 for original observations.
    """
    quarter_hour_minutes = [0, 15, 30, 45]

    # Annotate a copy so the caller's frame does not grow a 'predicted' column.
    known = data.copy()
    known['predicted'] = False

    interpolated = data.resample('1min').interpolate(method='linear')
    interpolated = interpolated[
        interpolated.index.minute.isin(quarter_hour_minutes)].copy()
    interpolated['predicted'] = True

    # Original observations take priority over interpolated values.
    combined = pd.concat(
        [known, interpolated[~interpolated.index.isin(known.index)]]
    ).sort_index()
    combined['datetime_col'] = combined.index

    on_valid_date = np.array(
        [stamp.date() in valid_dates for stamp in combined.index], dtype=bool)
    combined = combined[on_valid_date].copy()

    # For every original observation: when is the next one, and how far away?
    is_known = combined['predicted'].eq(False)
    combined['next'] = combined.loc[is_known, 'datetime_col'].shift(-1)
    # total_seconds() keeps the days component; `.seconds` would wrap every
    # 24 hours and under-report gaps longer than a day.
    combined['diff'] = (
        (combined['next'] - combined['datetime_col']).dt.total_seconds() / 60)

    # Propagate the bracket information onto the interpolated rows.
    combined['next'] = combined['next'].ffill()
    combined['diff'] = combined['diff'].ffill()

    minutes_to_next = (
        (combined['next'] - combined['datetime_col']).dt.total_seconds() / 60)
    previous_known = (
        combined['next'] - pd.to_timedelta(combined['diff'], unit='m'))
    minutes_from_previous = (
        (combined['datetime_col'] - previous_known).dt.total_seconds() / 60)
    combined['dist_to_closest_known'] = np.minimum(
        minutes_to_next, minutes_from_previous)
    # An original observation is its own closest known value; setting this
    # explicitly also covers a lone observation, which has no 'next'.
    combined.loc[is_known, 'dist_to_closest_known'] = 0.0

    return combined[combined.index.minute.isin(quarter_hour_minutes)]

def get_err(data, error_factor, err_slope=0.02, err_intercept=0.4):
    """Add an ``err`` column derived from one of the helper columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column named by ``error_factor``.  The ``err``
        column is written into this frame in place.
    error_factor : str
        * ``'dist_to_closest_known'`` -- ``err`` is
          ``err_slope * distance + err_intercept`` where the distance is
          non-zero and 0 where the distance is 0.
        * ``'diff'`` -- ``err`` is a plain copy of the ``diff`` column
          (placeholder, see TODO below).
    err_slope, err_intercept : float, optional
        Coefficients of the linear error model used for
        ``'dist_to_closest_known'``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now carrying ``err``.

    Raises
    ------
    ValueError
        If ``error_factor`` is not one of the supported names.  Previously
        the frame was returned without an ``err`` column, which only
        surfaced later as a ``KeyError`` in the caller.
    """
    if error_factor == 'dist_to_closest_known':
        distance = data['dist_to_closest_known']
        linear_err = err_slope * distance + err_intercept
        # Zero distance means the value itself is known, so no error is
        # assigned to it.
        data['err'] = linear_err.where(distance != 0, 0.0)
    elif error_factor == 'diff':
        # TODO: find how the error depends on the time between neighbouring
        # known values
        data['err'] = data['diff'].copy()
    else:
        raise ValueError(
            "unknown error_factor %r; expected 'dist_to_closest_known' "
            "or 'diff'" % (error_factor,))

    return data

def get_corrected_timeseries(data, error_factor):
    """Build the 15-minute series and its error estimate for one parameter.

    The calendar dates present in ``data`` are recorded before rows with
    missing values are discarded, then ``add_15min_values`` and ``get_err``
    are applied to the remaining rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by timestamps; its first column is the parameter.
    error_factor : str
        Forwarded to ``get_err``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``<param>`` and ``<param>_err``; an empty frame when
        no rows survive ``dropna``.
    """
    observed_dates = set(data.index.date)

    known = data.dropna()
    if len(known) == 0:
        return pd.DataFrame()

    value_col = known.columns[0]
    err_col = value_col + '_err'

    on_grid = add_15min_values(known, observed_dates)
    with_err = get_err(on_grid, error_factor)
    renamed = with_err.rename(columns={'err': err_col})

    return renamed[[value_col, err_col]]
    

def data_correction(filename, params=('foF2',), error_factor='dist_to_closest_known'):
    """Load a tab-separated file and return 15-minute series with errors.

    The file must provide the columns ``date``, ``h`` and ``m`` (combined
    into one timestamp) plus one column per requested parameter.  Each
    parameter is processed by ``get_corrected_timeseries`` and the results
    are joined on timestamp, keeping only timestamps present for every
    parameter that produced data.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``pandas.read_csv``.
    params : str or sequence of str, optional
        Parameter column name(s).  A single name may be given as a string.
    error_factor : str, optional
        Forwarded to ``get_corrected_timeseries``.

    Returns
    -------
    pandas.DataFrame
        Columns ``<param>`` and ``<param>_err`` for every parameter that
        had data; an empty frame when none had.
    """
    # Immutable default + local list: avoids a shared mutable default and
    # lets callers pass a tuple or a single name.
    params = [params] if isinstance(params, str) else list(params)

    raw = pd.read_csv(filename, sep='\t', parse_dates=['date'], engine='python')
    # Vectorised timestamp construction instead of a Python-level row loop.
    raw['datetime'] = (raw['date']
                       + pd.to_timedelta(raw['h'], unit='h')
                       + pd.to_timedelta(raw['m'], unit='m'))
    series = raw[['datetime'] + params].set_index('datetime').sort_index()

    # ``None`` marks "nothing accumulated yet".  Testing for an empty frame
    # instead would mistake an empty intermediate join for the initial state
    # and throw away the columns merged so far.
    res = None
    for p in params:
        p_corrected = get_corrected_timeseries(series[[p]], error_factor)
        if p_corrected.shape[0] == 0:
            # No usable data for this parameter: leave it out of the result.
            continue
        if res is None:
            res = p_corrected
        else:
            res = res.merge(p_corrected,
                            left_index=True, right_index=True)

    return pd.DataFrame() if res is None else res

# Example usage

In [3]:
# Run the pipeline for two parameters of one file and preview the result.
t_corr = data_correction(
    '../NCEI_dataset/ionosondes_data/NO369.csv',
    params=['foF2', 'TEC'],
)
t_corr.head(15)

Unnamed: 0_level_0,foF2,foF2_err,TEC,TEC_err
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-01-20 18:00:00,2.643,0.0,1.686,0.0
2011-01-20 18:15:00,2.67575,0.7,1.629875,0.7
2011-01-20 18:30:00,2.7085,1.0,1.57375,1.0
2011-01-20 18:45:00,2.74125,1.3,1.517625,1.3
2011-01-20 19:00:00,2.774,1.6,1.4615,1.6
2011-01-20 19:15:00,2.80675,1.3,1.405375,1.3
2011-01-20 19:30:00,2.8395,1.0,1.34925,1.0
2011-01-20 19:45:00,2.87225,0.7,1.293125,0.7
2011-01-20 20:00:00,2.905,0.0,1.237,0.0
2011-01-20 20:15:00,2.66,0.0,1.154,0.0
