# Data visualisation

## Loading relevant modules

Loads the required modules and defines the relevant (relative) paths. Feel free to change the paths.

In [1]:
from My_tools import DataFileLoader as DFL
# from My_tools import StudyEstimators as SE

import pandas as pd
import numpy as np
import pickle as pk

# imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,IterativeImputer,KNNImputer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV

# visualising
import matplotlib.pyplot as plt 

# modeling
import statsmodels as sm 
import sklearn

In [2]:
# All paths are relative to the notebook location.
ROOT = "../../"

# --- result folders ---
RESULT_PATH = f"{ROOT}results/"
PLOT_PATH = f"{RESULT_PATH}plots/"
TABLE_PATH = f"{RESULT_PATH}tables/"
OTHER_PATH = f"{RESULT_PATH}other/"
AUGME_PATH = f"{RESULT_PATH}data/"
METADATA_PRELOAD_DATA_PATH = f"{OTHER_PATH}bin_data/"

# --- input data: station information ---
DATA_PATH = f"{ROOT}data/"
DATA_INFO = f"{DATA_PATH}info/"
DATA_INFO_NIBIO_FILE = f"{DATA_INFO}lmt.nibio.csv"
DATA_INFO_FROST_FILE = f"{DATA_INFO}Frost_stations.csv"
DATA_INFO_NIBIO2FROST_FILE = f"{DATA_INFO}StationIDInfo.csv"
# NOTE(review): the file name below contains literal single quotes inside the
# string - confirm that this matches the name on disk.
DATA_FILE_SOIL_STATIONS = f"{DATA_INFO}'Stasjonsliste jordtemperatur modellering.xlsx'"

# --- input data: raw collections (file name pattern noted per folder) ---
DATA_COLLECTION = f"{DATA_PATH}raw_data/"
# pattern -> 'Veret paa Aas 2013- 2017/Veret paa Aas {YYYY}.pdf'
DATA_COLLECTION_STAT = f"{DATA_COLLECTION}Veret paa Aas 2013- 2017/"
# pattern -> Time{YYYY}.xlsx
DATA_COLLECTION_TIME = f"{DATA_COLLECTION}Time 2013- 2023/"
# pattern -> weather_data_hour_stID{id}_y{year}.csv
DATA_COLLECTION_NIBIO = f"{DATA_COLLECTION}nibio/"
# pattern -> StationTo_{id}_FROM_{FrostID}.csv
DATA_COLLECTION_MET = f"{DATA_COLLECTION}MET/"

# ID definitions
# Station information table read from the NIBIO info file; the first row is the
# header and the "ID" column becomes the index.
station_names = pd.read_csv(DATA_INFO_NIBIO_FILE, index_col="ID", header=0)

# Region name -> station IDs (as strings) grouped under that region.
# The key order is used by the plots to lay out the region labels.
nibio_id = {
    "Innlandet": ["11", "17", "26", "27"],
    "Trøndelag": ["15", "57", "34", "39"],
    "Østfold": ["37", "41", "52", "118"],
    # Remove "50" to check whether the results improve
    "Vestfold": ["30", "38", "42", "50"],
}

## Fetching and plotting data

In [None]:
# Build `imputed_nibio_data`: either rebuild it from the raw CSV files
# (force_load = True) or read the previously dumped binary file (default).
force_load = False
if force_load:
    # Loader over the NIBIO folder; the regex has two capture groups (1-3 digits
    # after "stID" and 4 digits after "_y").
    # presumably these become the station/year keys - verify against DataFileLoader
    nibio_data_ungroup = DFL.DataFileLoader(DATA_COLLECTION_NIBIO,r"weather_data_hour_stID(\d{1,3})_y(\d{4}).csv",_iter_key = True)
    nibio_data_ungroup.load_data(names = ["Time","TM","RR","TJM10","TJM20"])
    # Group the loaded entries using the region -> station IDs mapping.
    nibio_data = nibio_data_ungroup.group_layer(nibio_id)

    # Same procedure for the files whose names contain "raw".
    nibio_data_raw_ungroup = DFL.DataFileLoader(DATA_COLLECTION_NIBIO,r"weather_data_raw_hour_stID(\d{1,3})_y(\d{4}).csv",_iter_key = True)
    nibio_data_raw_ungroup.load_data(names = ["Time","TM","RR","TJM10","TJM20"])
    nibio_data_raw = nibio_data_raw_ungroup.group_layer(nibio_id)

    # NOTE(review): this loader points at the MET folder but reuses the NIBIO
    # "raw" file-name regex, while the comment on DATA_COLLECTION_MET gives the
    # pattern StationTo_{id}_FROM_{FrostID}.csv. The object is never loaded or
    # used before it is deleted below - confirm whether it is still needed.
    frost_raw_ungroup = DFL.DataFileLoader(DATA_COLLECTION_MET,r"weather_data_raw_hour_stID(\d{1,3})_y(\d{4}).csv",_iter_key = True)

    def dataframe_merge_func(x,y):
        """
            Merge function handed to `combine`: mutates and returns x.

            In y, column 2 is set to NA wherever column 1 is present and <= 0;
            column 2 of y is then written into the first y.shape[0] rows of
            column 2 of x.
            (presumably column 1 is "TM" and column 2 is "RR", following the
            `names` list passed to load_data - TODO confirm)
        """
        y.iloc[y.iloc[:,1].notna() & (y.iloc[:,1] <= 0),2] = pd.NA
        x.iloc[0:y.shape[0],2] = y.iloc[0:y.shape[0],2]
        return x

    imputed_nibio_data = nibio_data.combine(nibio_data_raw,merge_func = dataframe_merge_func)
    # Store the combined result so later runs can take the else-branch.
    imputed_nibio_data.dump(METADATA_PRELOAD_DATA_PATH + "weatherdata.bin")

    # Drop the intermediate loaders; only imputed_nibio_data is kept.
    del nibio_data, nibio_data_raw, frost_raw_ungroup, nibio_data_raw_ungroup, nibio_data_ungroup
else: 
    # Read the result dumped by an earlier force_load run.
    imputed_nibio_data = DFL.DataFileLoader().load(METADATA_PRELOAD_DATA_PATH + "weatherdata.bin")

In [3]:
# Table of NaN-run counts (semicolon separated).
terskel_data = pd.read_csv(TABLE_PATH + "na_run_count_simp.csv", delimiter=";")
# The threshold is the integer after ">" in the first column name that contains ">".
threshold_columns = (column for column in terskel_data.columns if ">" in column)
terskel = int(next(threshold_columns).split(">")[-1])

### Plotting


In [4]:
def avg_dataframe(*data_set):
    """
        Average all frames of all stations into one frame.

        data_set -> (key, [pd.DataFrame, ...]) pairs; every frame needs a
        "Time" column of datetime-like values with a `replace(year=...)` method.

        The "Time" column of every input frame is rewritten IN PLACE to the
        year 2000 so that frames from different years line up on the same index.

        The average is built as a running mean, avg += (frame - avg) / count.
        NaNs in the first frame count as 0; in later frames a NaN is treated
        as 0 while the deviation is formed.

        Returns the averaged frame with "Time" as an ordinary column.
        Raises IndexError if no frame is supplied.
    """
    # Flatten so that every year of every station takes part in the average
    # (previously all but the first frame of the first station were skipped).
    frames = [frame for _, station in data_set for frame in station]

    for frame in frames:
        # align all frames that span several years on one common year
        frame["Time"] = frame["Time"].apply(lambda x: x.replace(year = 2000))

    average_data = frames[0].fillna(0).set_index('Time') # initiates with the first frame
    for n, frame in enumerate(frames[1:], start = 1):
        deviation = frame.set_index('Time').subtract(average_data, fill_value=0)
        average_data = average_data.add(deviation.div(n + 1))

    return average_data.reset_index()

def differense(*data_set):
    """
        Subtract the all-station average from every frame.

        data_set -> (key, [pd.DataFrame, ...]) pairs, as accepted by avg_dataframe.
        The "Time" column of every input frame is rewritten in place to year 2000.

        Returns [(key, [frame - average, ...]), ...] with "Time" as a column.
    """
    reference = avg_dataframe(*data_set).set_index("Time")

    # make sure every frame shares the year of the reference before subtracting
    for _, frames in data_set:
        for frame in frames:
            frame["Time"] = frame["Time"].apply(lambda x: x.replace(year = 2000))

    new_data_set = []
    for key, frames in data_set:
        deviations = []
        for frame in frames:
            deviations.append(frame.set_index("Time").subtract(reference).reset_index())
        new_data_set.append((key, deviations))
    return new_data_set

In [5]:
def heatmap_plot(diff_data,image_path,title = "heatmap of data",clear_plot = False):
    """
        Save a heatmap of the TJM20 column of every frame in diff_data.

        diff_data  -> dict {station key: pd.DataFrame}; every frame needs the
                      columns "Time" and "TJM20". One heatmap column is drawn
                      per dict entry, one row per frame row (frames are assumed
                      to have equal length).
        image_path -> file the figure is saved to.
        title      -> axes title.
        clear_plot -> if True the current figure is cleared after saving.

        The keys of the module-level `nibio_id` are written below the station
        labels; the x-axis is split into equally wide bands, one per region.
    """
    station_keys = list(diff_data.keys())
    frames = list(diff_data.values())
    region_names = list(nibio_id.keys())

    fig, ax = plt.subplots()

    grid = np.array([frame.TJM20.to_numpy() for frame in frames]).transpose()
    image = ax.matshow(grid, aspect='auto', cmap = "seismic")

    # One x tick per station, labelled with its key
    ax.set_xticks(np.arange(grid.shape[1]), labels=station_keys)

    # Ten evenly spread date labels taken from the first frame.
    # pd.Timestamp() makes this work for both Timestamp objects and the
    # np.datetime64 values that to_numpy() yields for tz-naive columns.
    first_times = frames[0].Time.to_numpy()
    time_indexes = np.round(np.linspace(0, len(first_times) - 1, 10)).astype(int)
    date_labels = np.array([pd.Timestamp(d).strftime("%d-%m") for d in first_times])
    ax.set_yticks(time_indexes, labels=date_labels[time_indexes])

    half_band = len(station_keys)/(2* len(region_names))
    range_keys = np.linspace(0,len(station_keys),num = len(region_names)+1)

    # label the classes (one label centred in each band):
    sec = ax.secondary_xaxis(location=0)
    sec.set_xticks(range_keys[:-1] + half_band - 0.5, labels=['{}'.format(region) for region in region_names])
    sec.tick_params('x', length=0)

    # lines between the classes, at the bottom and at the top:
    for side in (0, 'top'):
        divider = ax.secondary_xaxis(location=side)
        divider.set_xticks(range_keys - 0.5, labels=[])
        divider.tick_params('x', length=10, width=1.5)

    fig.colorbar(image, label='Gradient')
    ax.set_title(title)
    ax.set_xlabel("\nStations")
    ax.set_ylabel("Date")
    plt.savefig(image_path)
    if clear_plot:
        plt.clf()

In [6]:
def polar_plot(diff_data,image_path,title = "polar plot of data",clear_plot = False): 
    """
        Save a polar plot with one closed-year curve per entry of diff_data.

        diff_data  -> dict {station key: pd.DataFrame}; every frame needs the
                      columns "Time" and "TJM20" (frames are assumed to have
                      equal length).
        image_path -> file the figure is saved to.
        title      -> axes title.
        clear_plot -> if True the current figure is cleared after saving.

        Row k of a frame with N rows is drawn at the angle 2*pi*k/N.
    """
    fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
    list_diff_data = list(diff_data.items())
    matrix = np.array([data.TJM20.to_numpy() for _,data in list_diff_data]).transpose()
    ax.set_prop_cycle('color',[plt.cm.Blues(i) for i in np.linspace(0, 1, len(list_diff_data))])

    for col_ind, (station_key, _) in enumerate(list_diff_data):
        vector = matrix[:, col_ind]
        # spread the rows evenly over the full circle
        angles = np.linspace(0, 2 * np.pi, len(vector), endpoint=False)
        # label with the station key only (not the whole (key, frame) pair)
        ax.plot(angles, vector, label=f"station {station_key}")

    ax.grid(True)

    n_ticks = 12
    tick_angles = np.linspace(0, 2*np.pi, n_ticks, endpoint=False)
    first_times = list_diff_data[0][1].Time.to_numpy()
    # The k-th tick sits at angle 2*pi*k/12, which is where row k*N/12 is drawn,
    # so that row supplies the date label.
    tick_rows = (np.arange(n_ticks) * len(first_times) / n_ticks).astype(int)
    # pd.Timestamp() accepts Timestamp objects as well as np.datetime64 values
    date_labels = np.array([pd.Timestamp(d).strftime("%d-%m") for d in first_times])
    ax.set_xticks(tick_angles, labels=date_labels[tick_rows])

    ax.set_title(title, va='bottom')
    plt.savefig(image_path)
    if clear_plot:
        plt.clf()

In [7]:
def naive_plot(diff_data,image_path,title = "naive plot of data",clear_plot = False):
    """
        Save a line plot with one TJM20 curve per entry of diff_data.

        diff_data  -> dict {station key: pd.DataFrame}; every frame needs the
                      columns "Time" and "TJM20" (frames are assumed to have
                      equal length).
        image_path -> file the figure is saved to.
        title      -> axes title.
        clear_plot -> if True the current figure is cleared after saving.

        The x-axis is the row number; twelve evenly spread rows are labelled
        with the day-month of the first frame's "Time" column.
    """
    fig, ax = plt.subplots()
    list_diff_data = list(diff_data.items())
    matrix = np.array([data.TJM20.to_numpy() for _,data in list_diff_data]).transpose()
    ax.set_prop_cycle('color',[plt.cm.Blues(i) for i in np.linspace(0, 1, len(list_diff_data))])

    for col_ind, (station_key, _) in enumerate(list_diff_data):
        vector = matrix[:, col_ind]
        ax.plot(range(len(vector)), vector, label=f"station {station_key}")

    ax.grid(True)

    first_times = list_diff_data[0][1].Time.to_numpy()
    time_indexes = np.round(np.linspace(0, len(first_times)-1, 12)).astype(int)
    # pd.Timestamp() accepts Timestamp objects as well as the np.datetime64
    # values that to_numpy() yields for tz-naive columns
    date_labels = np.array([pd.Timestamp(d).strftime("%d-%m") for d in first_times])
    ax.set_xticks(time_indexes, labels=date_labels[time_indexes])

    ax.set_title(title, va='bottom')
    plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
             rotation_mode="anchor")

    fig.set_size_inches(11.69, 8.27)
    plt.tight_layout()
    plt.savefig(image_path)
    if clear_plot:
        plt.clf()

In [9]:
from scipy.stats import zscore

#def outlier_check_single_jump_center(x):
#    real = x[1] # get target
#    test = (x[-1] + x[0])/2 # linear interpolation of point
#    test_diff = np.abs(x[-1] - x[0])/2
#    diff = np.abs(real - test)
#    if diff > 1.5*test_diff:
#        return True
#    return False

#def outlier_check_RR(x):
#    roll_data_right_2w = x.rolling(2*7*24,center = True)#
#
#    no_rain_2w = roll_data_right_2w.sum().to_numpy() < 1#mm
#    
#    greater_than_20 = x.to_numpy() > 20
#    less_than_0 = x.to_numpy() < 0#
#
#    return ((no_rain_2w + greater_than_20 + less_than_0) > 0).fillna(value = False)

def outlier_check(data,threshold,feature): # TODO: needs to be rewritten
    """
        Flag suspicious positions in a series.

        data      -> pd.Series of values (callers pass a series without NaNs).
        threshold -> limit for the absolute z-score of the detrended series.
        feature   -> name of the feature; currently not used by the checks.

        The series is detrended with a centred 24-step rolling mean. A position
        is flagged when at least one of these holds:
            - |z-score| of the detrended value exceeds `threshold`
            - the 12-step rolling variance of the detrended series is below 0.001
            - that variance rises by more than 50 from one step to the next
            - that variance falls by more than 50 from one step to the next
        Positions where a check cannot be evaluated (NaN) are not flagged by it.

        Returns a boolean pd.Series with the index of `data`.
    """
    zero_variance_limit = 0.001
    variance_jump_limit = 50

    detrended_data = data - data.rolling(24,center = True).mean()
    # simple outlier detection on the detrended values
    big_jump_check = np.abs(zscore(detrended_data.to_numpy(), nan_policy='omit')) > threshold

    change_wind = detrended_data.rolling(12).var() # variance over a 12 step window
    variance_step = change_wind.diff()

    zero_change_check = change_wind.abs() < zero_variance_limit # (almost) no change at all
    growth_change_check = variance_step > variance_jump_limit # sudden increase in variability
    shrink_change_check = variance_step < -variance_jump_limit # sudden decrease in variability

    outlier_index = (zero_change_check
                     | growth_change_check
                     | shrink_change_check
                     | big_jump_check)
    return outlier_index.fillna(value = False)

def naive_plot_na_values(diff_data,image_path: str,title: str = "naive plot of data",clear_plot: bool = False, feature: str= "TJM20", _recurs = False,threshold = 4, indicator = None):
    """
        Plot one feature of one station, one subplot per year, and mark
        missing values (red) and positions flagged by outlier_check (yellow).
        A last subplot overlays all years.

        Data is:
            - {"station": [y0,y1,y2,...]}  (one pd.DataFrame per year with a
              "Time" column and the `feature` column)
        years in the same plot, station in different.

        If diff_data holds several stations the function calls itself once per
        station and saves each figure as "<head>_k<station>_f<feature>.<tail>",
        where head/tail are the parts of image_path around its last ".".

        Parameters:
            - image_path: file the figure is saved to
            - title: title written above the first subplot
            - clear_plot: clear the figure before it is closed
            - feature: column to plot
            - _recurs: internal flag, set on the per-station calls
            - threshold: z-score limit handed to outlier_check
            - indicator: not used by the plot; only passed on to the
              per-station calls

        Reads the module-level `terskel_data` and `terskel` for the NaN counts
        shown on the right-hand axis.
    """
    head, _,tail = image_path.rpartition(".")
    if len(diff_data)>1:
        for key in diff_data:
            new_title = "{}_k{}_f{}".format(title,key,feature)
            # pass threshold/indicator on so the caller's settings are not lost
            naive_plot_na_values({key:diff_data[key]},image_path=image_path,title = new_title,clear_plot = clear_plot,feature = feature, _recurs = True,threshold = threshold, indicator = indicator)
        return

    # after this point {"Station":[y0,y1,y2,...]}
    sta_id, years = list(diff_data.items())[0]
    n_years = len(years)

    fig, ax = plt.subplots(nrows=n_years+1) # ax = [...]

    first_times = years[0].Time
    # the middle row is an element that is guaranteed to lie inside the year
    first_year = first_times.iloc[len(first_times) // 2].year

    for i,(ax_i, colour_id) in enumerate(zip(ax[:-1],np.linspace(0, 1, n_years))):
        vec_data = years[i].loc[:, feature]
        vector = vec_data.to_numpy().ravel() # convert to vector
        n_hours = len(vector)
        hours = range(n_hours)

        na_pos = vec_data.isna().to_numpy()

        # outlier_check is run on the back-filled series
        vector_inter = vec_data.bfill()
        outlier_mask = np.asarray(outlier_check(vector_inter, threshold,feature), dtype=bool)

        # vertical extent of the highlighted bands
        y_low = np.min(vector_inter)
        y_high = np.max(vector_inter)

        # yellow band where outlier_check flags a position
        ax_i.fill_between(hours,np.where(outlier_mask, y_high, np.nan),np.where(outlier_mask, y_low, np.nan),color="yellow",linewidth=0)

        ax_i.set_ylabel("y{}".format(first_year + i))
        hour_ticks = list(range(0,n_hours,500))
        ax_i.set_xticks(hour_ticks,hour_ticks)
        ax_i.scatter(hours,vector,s = 0.07,marker = "*",linewidth=0)
        ax[-1].scatter(hours,vector,s = 0.07,marker = "*",linewidth=0,color = plt.cm.plasma(colour_id),label = first_year + i)

        # red band where the value is missing
        ax_i.fill_between(hours,np.where(na_pos, y_high, np.nan),np.where(na_pos, y_low, np.nan),color = "red",linewidth=0)

        # singular NaNs: a band between two single points has no width, so
        # draw a one-step-wide box around each isolated NaN instead
        sing_nan_index = np.where(~na_pos[:-2] & na_pos[1:-1] & ~na_pos[2:])[0] + 1
        x_coord = []
        y_min_coord = []
        y_max_coord = []
        for x in sing_nan_index:
            x_coord.extend([x-0.5,x+0.5,np.nan])
            y_min_coord.extend([y_low,y_low,np.nan])
            y_max_coord.extend([y_high,y_high,np.nan])
        ax_i.fill_between(x_coord,y_min_coord,y_max_coord,color = "red",linewidth=0)

        # NaN statistics of this station/year from the threshold table
        row_filter = (terskel_data["station"] == int(sta_id)) & (terskel_data["year"] == (first_year + i))
        count_columns = [feature,"|{}|≤{}".format(feature,terskel),"|{}|>{}".format(feature,terskel)]
        feature_na_data = terskel_data.loc[row_filter,count_columns].to_numpy().ravel()

        y_min,y_max = ax_i.get_ylim()
        sec = ax_i.secondary_yaxis(location="right")
        sec.set_yticks(np.linspace(y_min,y_max,num=5)[1:-1],labels=["tot:{}h".format(feature_na_data[0]),"{}≥t:{}".format(terskel,feature_na_data[1]),"{}<t:{}".format(terskel,feature_na_data[2])])

    # twelve evenly spread date labels on the overlay subplot
    time_indexes = np.round(np.linspace(0,len(first_times),12, endpoint=False)).astype(int)
    # pd.Timestamp() accepts Timestamp objects as well as np.datetime64 values
    date_labels = np.array([pd.Timestamp(d).strftime("%d-%m") for d in first_times.to_numpy()])
    ax[-1].set_xticks(time_indexes, labels=date_labels[time_indexes])
    ax[-1].set_ylabel("overlap")

    ax[0].set_title(title, va='bottom')
    plt.setp(ax[-1].get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    fig.set_size_inches(11.69, 8.27)
    plt.tight_layout()
    if _recurs:
        plt.savefig("{}_k{}_f{}.{}".format(head,sta_id,feature,tail))
    else:
        plt.savefig(image_path)
    if clear_plot:
        plt.clf()
    plt.close()

In [8]:
def naive_plot_cumsum_na_values(diff_data,image_path: str,title: str = "naive plot of data",clear_plot: bool = False, feature: str= "RR", _recurs = False,threshold = 4):
    """
        Plot the cumulative sum of one feature of one station, one subplot per
        year, and mark missing values in red. A last subplot overlays all years.

        Data is:
            - {"station": [y0,y1,y2,...]}  (one pd.DataFrame per year with a
              "Time" column and the `feature` column)
        years in the same plot, station in different.

        If diff_data holds several stations the function calls itself once per
        station and saves each figure as "<head>_k<station>_f<feature>.<tail>",
        where head/tail are the parts of image_path around its last ".".

        Parameters:
            - image_path: file the figure is saved to
            - title: title written above the first subplot
            - clear_plot: clear the figure before it is closed
            - feature: column to accumulate and plot
            - _recurs: internal flag, set on the per-station calls
            - threshold: not used by the plot; only passed on to the
              per-station calls

        Reads the module-level `terskel_data` and `terskel` for the NaN counts
        shown on the right-hand axis.
    """
    head, _,tail = image_path.rpartition(".")
    if len(diff_data)>1:
        for key in diff_data:
            new_title = "{}_k{}_f{}".format(title,key,feature)
            naive_plot_cumsum_na_values({key:diff_data[key]},image_path=image_path,title = new_title,clear_plot = clear_plot,feature = feature, _recurs = True,threshold = threshold)
        return

    # after this point {"Station":[y0,y1,y2,...]}
    sta_id, years = list(diff_data.items())[0]
    n_years = len(years)

    fig, ax = plt.subplots(nrows=n_years+1) # ax = [...]

    first_times = years[0].Time
    # the middle row is an element that is guaranteed to lie inside the year
    first_year = first_times.iloc[len(first_times) // 2].year

    for i,(ax_i, colour_id) in enumerate(zip(ax[:-1],np.linspace(0, 1, n_years))):
        vec_data = years[i].loc[:, feature].cumsum()
        vector = vec_data.to_numpy().ravel() # convert to vector
        n_hours = len(vector)
        hours = range(n_hours)

        na_pos = vec_data.isna().to_numpy()

        # vertical extent of the highlighted bands, taken from the back-filled sum
        vector_inter = vec_data.bfill()
        y_low = np.min(vector_inter)
        y_high = np.max(vector_inter)

        ax_i.set_ylabel("y{}".format(first_year + i))
        ax_i.fill_between(hours,vector)
        ax[-1].scatter(hours,vector,s = 0.07,marker = "*",linewidth=0,color = plt.cm.plasma(colour_id),label = first_year + i)

        # red band where the value is missing; the limits are kept in y_low /
        # y_high so the single-NaN boxes below do not read NaN limits
        ax_i.fill_between(hours,np.where(na_pos, y_high, np.nan),np.where(na_pos, y_low, np.nan),linewidth=0,color = "red")

        # singular NaNs: draw a one-step-wide box around each isolated NaN
        sing_nan_index = np.where(~na_pos[:-2] & na_pos[1:-1] & ~na_pos[2:])[0] + 1
        x_coord = []
        y_min_coord = []
        y_max_coord = []
        for x in sing_nan_index:
            x_coord.extend([x-0.5,x+0.5,np.nan])
            y_min_coord.extend([y_low,y_low,np.nan])
            y_max_coord.extend([y_high,y_high,np.nan])
        ax_i.fill_between(x_coord,y_min_coord,y_max_coord,color = "red",linewidth=0)

        # NaN statistics of this station/year from the threshold table
        row_filter = (terskel_data["station"] == int(sta_id)) & (terskel_data["year"] == (first_year + i))
        count_columns = [feature,"|{}|≤{}".format(feature,terskel),"|{}|>{}".format(feature,terskel)]
        feature_na_data = terskel_data.loc[row_filter,count_columns].to_numpy().ravel()

        y_min,y_max = ax_i.get_ylim()
        sec = ax_i.secondary_yaxis(location="right")
        sec.set_yticks(np.linspace(y_min,y_max,num=5)[1:-1],labels=["tot:{}h".format(feature_na_data[0]),"{}≥t:{}".format(terskel,feature_na_data[1]),"{}<t:{}".format(terskel,feature_na_data[2])])

    # twelve evenly spread date labels on the overlay subplot
    time_indexes = np.round(np.linspace(0,len(first_times),12, endpoint=False)).astype(int)
    # pd.Timestamp() accepts Timestamp objects as well as np.datetime64 values
    date_labels = np.array([pd.Timestamp(d).strftime("%d-%m") for d in first_times.to_numpy()])
    ax[-1].set_xticks(time_indexes, labels=date_labels[time_indexes])
    ax[-1].set_ylabel("overlap")

    ax[0].set_title(title, va='bottom')
    plt.setp(ax[-1].get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    fig.set_size_inches(11.69, 8.27)
    plt.tight_layout()
    if _recurs:
        plt.savefig("{}_k{}_f{}.{}".format(head,sta_id,feature,tail))
    else:
        plt.savefig(image_path)
    if clear_plot:
        plt.clf()
    plt.close()

In [10]:
# Drop the top grouping layer, then flatten to (key, value) pairs.
ungrouped_nibio_data = imputed_nibio_data.shave_top_layer()
all_data = ungrouped_nibio_data.flatten(return_key = True) # [(key, value)]
#print(all_data)
#diff_data = differense(*all_data)
#diff_data = dict(diff_data)
#diff_data = imputed_nibio_data[:]

In [11]:
# One figure set per temperature feature, then the cumulative plot
# (which uses its default feature).
untreated_plot_path = PLOT_PATH + "Plot_test_naive_nan.pdf"
for f in ("TJM20", "TJM10", "TM"):
    naive_plot_na_values(dict(all_data), untreated_plot_path, title = "Untreated data", feature = f)
naive_plot_cumsum_na_values(dict(all_data), untreated_plot_path, title = "Untreated data")

## Imputing test

## Choose best method

In [4]:

# Reload the hourly files and the "raw" hourly files and combine them into
# `imputed_nibio_data_cleaned`.
# NOTE(review): the loaders are rooted at RESULT_PATH here, whereas the earlier
# loading cell reads from DATA_COLLECTION_NIBIO - confirm this is intended.
nibio_data_ungroup = DFL.DataFileLoader(RESULT_PATH,r"weather_data_hour_stID(\d{1,3})_y(\d{4}).csv",_iter_key = True)
nibio_data_ungroup.load_data(names = ["Time","TM","RR","TJM10","TJM20"])
# Group the loaded entries using the region -> station IDs mapping.
nibio_data = nibio_data_ungroup.group_layer(nibio_id)

nibio_data_raw_ungroup = DFL.DataFileLoader(RESULT_PATH,r"weather_data_raw_hour_stID(\d{1,3})_y(\d{4}).csv",_iter_key = True)
nibio_data_raw_ungroup.load_data(names = ["Time","TM","RR","TJM10","TJM20"])
nibio_data_raw = nibio_data_raw_ungroup.group_layer(nibio_id)

def dataframe_merge_func(x,y):
        """
            Merge function handed to `combine`: mutates and returns x.

            In y, column 2 is set to NA wherever column 1 is present and <= 0;
            column 2 of y is then written into the first y.shape[0] rows of
            column 2 of x.
            (presumably column 1 is "TM" and column 2 is "RR", following the
            `names` list passed to load_data - TODO confirm)
        """
        y.iloc[y.iloc[:,1].notna() & (y.iloc[:,1] <= 0),2] = pd.NA
        x.iloc[0:y.shape[0],2] = y.iloc[0:y.shape[0],2]
        return x

imputed_nibio_data_cleaned = nibio_data.combine(nibio_data_raw,merge_func = dataframe_merge_func)

## Outlier detection

## Removes outliers

In [5]:
# Drop the top grouping layer, then flatten to (key, value) pairs.
ungrouped_cleaned_data = imputed_nibio_data_cleaned.shave_top_layer()
all_data = ungrouped_cleaned_data.flatten(return_key = True) # [(key, value)]

In [6]:
# Manually selected intervals that are blanked out (set to NaN) further down.
# Layout: station id -> feature -> year -> list of [start, stop] pairs, which
# are used as `.loc[start:stop, feature]` on the frame of that station and year.
remove_pronto = {
    "15": {
        "TM": {
            2015: [[2293,2301]],
            2021: [[1230,1232]]
        },
        "TJM10": {
            2015: [[2293,2301]],
            2021: [[1230,1232]]
        },
        "TJM20": {
            2015: [[2293,2301]],
            2021: [[1230,1232]]
        }
    },
    "17": {
        "TJM10": {
            2014: [[0,5879]], # Removes the whole year
            2015: [[0,5879]], # Removes the whole year
            2016: [[0,2500]], # Removes half of the year
            2020: [[5201,5879]],
            2021: [[0,5879]]
        }, 
        "TJM20": {
            2014: [[0,5879]], # Removes the whole year
            2015: [[0,5879]], # Removes the whole year
            2016: [[0,2500]], # Removes half of the year
            2020: [[5201,5879]],
            2021: [[0,5879]]
        },
        "TM": {
            2014: [[0,5879]], # Removes the whole year
            2015: [[0,5879]], # Removes the whole year
            2016: [[0,2500]], # Removes half of the year
            2020: [[5201,5879]],
            2021: [[0,5879]] # Removes the whole year
        }
    },
    "26": {
        "TJM10": {
            2017: [[5503,5823],[4622,4692]]
        }
    },
    "30": {
        "TJM10": {
            2016: [[200,1187]]
        },
        "TJM20": {
            2016: [[200,1187]]
        },
        "TM": {
            2016: [[200,1187]]
        }
    },
    "34": {
        "TM": {
            2021: [[2635,2649]],
            2022: [[2410,2433],[2768,2786]]
        },
        "TJM10": {
            2021: [[2635,2649]],
            2022: [[2410,2433],[2768,2786]]
        },
        "TJM20": {
            2021: [[2635,2649]],
            2022: [[2410,2433],[2768,2786]]
        }
    },
    "41": {
        "TJM20":{
            2014: [[1008,1453]],
            2016: [[3084,3228]],
            2019: [[3230,4018]],
            2020: [[5342,5350],[5646,5798]]
        },
        "TJM10":{
            2014: [[962,1453]],
            2016: [[3084,3228]],
            2019: [[3230,4018]],
            2020: [[5342,5350],[5646,5798]]}
    }, 
    "42": {
        "TM":{
            2019:[[2989,3138]]
        },
        "TJM20":{
            2019:[[2989,3138]]
        },    
        "TJM10":{
            2019:[[2989,3138]]
        }
    },
    "50": {
        "TJM20":{
            2014:[[2085,2110],[3939,3965],[5785,5879]],
            2017:[[3699,4334]]
        },
        "TJM10":{
            2014:[[2085,2110],[3939,3965],[5785,5879]]
        },
        "TM":{
            2014:[[2085,2110],[3939,3965],[5785,5879]]
        },
    },
    "52": {
        "TJM20":{
            2017:[[3786,3856]]
        },    
        "TJM10":{
            2017:[[3786,3856]]
        },    
        "TM":{
            2017:[[3786,3856]]
        }
    }
}

all_data_dict = dict(all_data)
clone_all_data_dict = all_data_dict.copy()

# Step 1: fill short gaps by forward interpolation, column by column
# (at most 3 consecutive values for "TM", 5 for every other column).
for station, frames in all_data_dict.items():
    for frame in frames:
        for feat in frame.columns:
            gap_limit = 3 if feat == "TM" else 5
            column = frame.loc[:, feat].infer_objects(copy=False)
            frame.loc[:, feat] = column.interpolate(limit=gap_limit, limit_direction="forward") # only from the leading end of a gap

# Step 2: blank out the manually selected intervals; the frame of a year sits
# at list position (year - 2014).
for station, per_feature in remove_pronto.items():
    for feat, per_year in per_feature.items():
        for yr, intervals in per_year.items():
            for start, stop in intervals:
                all_data_dict[station][yr - 2014].loc[start:stop, feat] = np.nan

# Step 3: in the clone, replace each station's list of frames by a dict keyed
# on the year as a string ("2014", "2015", ...).
for station, frames in all_data_dict.items():
    clone_all_data_dict[station] = {str(2014 + offset): frame for offset, frame in enumerate(frames)}

In [12]:
# Same plots as for the untreated data, now for the treated frames.
treated_plot_path = PLOT_PATH + "Plot_test_naive_nan_treated.pdf"
for f in ("TJM20", "TJM10", "TM"):
    naive_plot_na_values(all_data_dict, treated_plot_path, title = "treated data", feature = f)
naive_plot_cumsum_na_values(all_data_dict, treated_plot_path, title = "treated data")

NameError: name 'naive_plot_na_values' is not defined

## Sending treated data

In [7]:
# Wrap the treated {station: {year: frame}} dict in a loader, group it by
# region and dump it to the preload folder.
treated_loader = DFL.DataFileLoader().load_dict(clone_all_data_dict)
imputed_data = treated_loader.group_layer(nibio_id)

imputed_data.dump(METADATA_PRELOAD_DATA_PATH + "weatherdata_cleaned.bin")

In [11]:
# Write every treated station/year frame to its own CSV file.
for station, yearly_frames in clone_all_data_dict.items():
    for yt, year in yearly_frames.items():
        csv_name = "weather_data_hour_stID{}_y{}.csv".format(station,yt)
        year.to_csv(AUGME_PATH + csv_name, sep=",", encoding='utf-8')