In [76]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import pickle
import copy
import frogress

missing_attributes = np.asarray(['wind_speed', 'power', 'rotor_speed', 'generator_speed', 'temp_environment', 'temp_hydraulic_oil', 'temp_gear_bearing', 'blade_angle_avg'])


# SET TO FALSE
debug = False
iterations = 3

In [72]:
# laod in data
data_miss=pd.read_csv("../raw_data/hiddenset.csv")

# load xgboosted trees
xgbs = {}
for at in missing_attributes:
    with open(f'final_missing_{at}.pkl', 'rb') as f:
        xgbs[at] = pickle.load(f)

In [73]:
# build spatial correlator
df_miss_ = copy.deepcopy(data_miss)
times = pd.to_datetime(df_miss_["measured_at"]).view(int)/ 10**9 / 60. # in minutes
time_zero_point = np.min(times)
times = times - time_zero_point
df_miss_['date'] = times
times = np.unique(np.asarray(times, dtype=int))
print(f"Got {len(times)} different times")

# calculate the averages of the attributes for each park and time    

# initialize
averages = {}
for park_id in [1, 2]:
    print(f"Building partial interpolator for park {park_id}")
    idx = np.where(df_miss_["park_id"] == park_id)[0]
    df_park = df_miss_.iloc[idx]
    averages[park_id] = np.zeros((len(times), len(missing_attributes)))
    for idt, time in frogress.bar(enumerate(times)):
        idx_time = np.where(df_park['date'] == time)[0]
        if len(idx_time) == 0:
            averages[park_id][idt] = np.full((len(missing_attributes,)), np.nan)
        else:
            averages[park_id][idt] = np.nanmean(df_park[missing_attributes].iloc[idx_time], axis=0)
        if debug:
            if idt > 1000:
                break


Got 35143 different times
Building partial interpolator for park 1
[.....#....] | Progress: 1579 | Time: 9.3s | ETA: --

  averages[park_id][idt] = np.nanmean(df_park[missing_attributes].iloc[idx_time], axis=0)


[##########] | Progress: 35143 | Time: 3min24s | ETA: --Building partial interpolator for park 2
[##########] | Progress: 35143 | Time: 6min59s | ETA: --

In [74]:
with open(f'averages.pkl', 'wb+') as f:
    pickle.dump(averages, f)

In [77]:
# fill in first guesses
data = copy.deepcopy(data_miss)
print(f"Got {data.shape[0]} rows")
for ix, x in frogress.bar(data.iterrows()):
    x = x.to_frame().T
    x = x.astype({'turbine_id' : int, 'power' : float, 'temp_environment' : float, 'temp_hydraulic_oil' : float, 'temp_gear_bearing': float, 'cosphi': float, 'blade_angle_avg': float, 'hydraulic_pressure': float, 'park_id': int, 'rotor_speed': float, 'generator_speed' : float, 'nacelle_direction':float, 'wind_speed': float, 'wind_direction': float})

    # get missing attributes
    check_nans = x.isna()
    check_nans = np.asarray(check_nans).reshape(-1)
    missing = list(x.keys()[check_nans])
    if len(missing) == 0:
        continue
    time = pd.to_datetime(x["measured_at"]).view(int) / 10**9 / 60. # in minutes
    time = time - time_zero_point
    park = x['park_id']

    avg = averages[int(park)][np.where(np.asarray(times) == int(time))[0][0], :]
    # replace with initial guess as averages of windpark
    for at in missing:
        x[at] = avg[np.where(missing_attributes == at)[0]]

    # preprocessing
    temp=pd.to_datetime(x.measured_at)
    x["week"]=temp.dt.isocalendar().week.astype(int)
    x["month"]=temp.dt.month
    x["hourofday"]=temp.dt.hour
    x["isnight"]=(x.hourofday >= 18) | (x.hourofday <=5)
    x["isnoon"]=(x.hourofday >= 7) & (x.hourofday<=14)
    # x["Error"]=x.error_category != "NO_ERROR"
    x["speed"]=(x.rotor_speed+x.generator_speed)
    x["direction"]=(x.nacelle_direction+x.wind_direction)

    # run xgbs
    for iter in range(iterations):
        for at in missing:
            x[at]= xgbs[at].predict(x.drop(columns=[at, 'measured_at', "index", 'nacelle_direction', 'wind_direction', 'rotor_speed', 'generator_speed']))

    for at in missing:
        data[at][ix] = x[at]

Got 454744 rows
[.......#..] | Progress: 42 | Time: 0.8s | ETA: --

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[at][ix] = x[at]


[........#.] | Progress: 4739 | Time: 2min48s | ETA: --

KeyboardInterrupt: 

In [99]:
data.to_csv('missing_solution.csv')