In [None]:
import pandas as pd
import peakutils
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import datetime

In [None]:
def outlier(df, margin):
    """Replace isolated single-sample spikes and dips with NaN.

    A sample counts as an outlier when it differs from BOTH its predecessor
    and its successor, in the same direction, by more than ``margin`` times
    the standard deviation of the data (per column for a DataFrame).

    Args:
        df: pandas Series or DataFrame of numeric readings. It is left
            unmodified.
        margin: Threshold, expressed as a multiple of the standard deviation.

    Returns:
        A new object shaped like ``df`` in which every outlier is NaN.
    """
    threshold = df.std() * margin
    previous = df.shift(1)
    following = df.shift(-1)  # not called ``next`` so the builtin is not shadowed

    # The first and last samples have a NaN neighbour; comparisons against
    # NaN evaluate to False, so the edges are never flagged.
    spikes = ((df - previous) > threshold) & ((df - following) > threshold)
    dips = ((previous - df) > threshold) & ((following - df) > threshold)

    # mask() returns a copy, so the caller's data is not altered.
    return df.mask(spikes | dips)


In [None]:
def check_and_drop_day(data, gap_size):
    """Drop every calendar day on which a sampling gap ends.

    Args:
        data: DataFrame indexed by a ``DatetimeIndex``. Rows are compared
            with their immediate predecessor, so the index is expected to be
            in chronological order. The frame is left unmodified.
        gap_size: Largest tolerated spacing between consecutive rows, in
            seconds.

    Returns:
        A new DataFrame without any row that falls on a calendar day
        containing a row that follows a gap longer than ``gap_size``.
    """
    # Seconds elapsed since the previous row; the first row has no
    # predecessor and is treated as having no gap.
    time_diff = data.index.to_series().diff().dt.total_seconds().fillna(0)

    # Compare midnight-normalised timestamps: matching the raw timestamps
    # against dates would only hit rows stamped exactly 00:00:00 and leave
    # the rest of the affected day in place.
    row_days = data.index.normalize()
    days_to_drop = row_days[(time_diff > gap_size).to_numpy()].unique()

    # The gap information is kept in a local Series instead of a temporary
    # column, so nothing has to be added to (or dropped from) the frame and
    # both flat and MultiIndex columns are supported.
    return data.loc[~row_days.isin(days_to_drop)].copy()

In [None]:
def clean_house_data(appliance_data, gap_size, house_ids, outlier_margin=1.5):
    """Extract, gap-filter and de-spike the data of each requested house.

    For every id in ``house_ids`` the house frame is obtained with
    ``onehouse`` (defined elsewhere), days with sampling gaps are removed
    with ``check_and_drop_day`` and isolated spikes are blanked with
    ``outlier``. Per-column NaN counts are printed before and after cleaning.

    Args:
        appliance_data: Combined data set handed to ``onehouse``.
        gap_size: Largest tolerated gap between consecutive rows, in seconds.
        house_ids: Iterable of house identifiers to process.
        outlier_margin: Spike threshold passed to ``outlier`` as a multiple
            of the standard deviation (defaults to the former fixed 1.5).

    Returns:
        List of cleaned DataFrames, in the order of ``house_ids``.

    Side effects:
        Publishes a copy of each cleaned frame as the module-level global
        ``house_<id>``. Prefer the returned list in new code.
    """
    house_ids = list(house_ids)
    house_dfs = [onehouse(appliance_data, house_id) for house_id in house_ids]

    for position, house_id in enumerate(house_ids):
        house_df = house_dfs[position]

        # Label with the real house id rather than the list position, so the
        # log stays correct for id lists other than 1..n.
        print(f"House {house_id} before cleaning:")
        print(house_df.isna().sum())

        house_df = check_and_drop_day(house_df, gap_size)
        house_df = outlier(house_df, outlier_margin)
        house_dfs[position] = house_df

        print(f"House {house_id} after cleaning:")
        print(house_df.isna().sum())

    # Kept for backward compatibility with cells that read house_1, house_2, ...
    for house_id, house_df in zip(house_ids, house_dfs):
        globals()[f"house_{house_id}"] = house_df.copy()

    return house_dfs

# Largest tolerated gap between consecutive samples, in seconds
gap_size = 60

# IDs of the houses to clean
house_ids = [1, 2, 3, 4, 5, 6]

# Clean every house; clean_house_data prints the per-column NaN counts
# before and after cleaning while it runs.
cleaned_house_dfs = clean_house_data(appliance_data, gap_size, house_ids)

# Total NaN count per cleaned house. Only the cleaned frames are available
# here, so a "before cleaning" figure cannot be computed at this point and
# is no longer printed (it used to repeat the post-cleaning number).
for house_id, house_df in zip(house_ids, cleaned_house_dfs):
    print(f"House {house_id} after cleaning:")
    print(house_df.isna().sum().sum())


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Samples per FFT window: 5 * 60
# NOTE(review): this equals 5 minutes only if mains_data holds one sample
# per second — confirm the sampling rate.
window_len = 5 * 60
n_windows = len(mains_data) // window_len

# Magnitude of the first half of the FFT bins for each complete window;
# trailing samples that do not fill a whole window are ignored.
fft_data = []
for i in range(n_windows):
    window = mains_data[i * window_len:(i + 1) * window_len]
    fft = np.fft.fft(window)
    half = len(fft) // 2
    fft_data.append(np.abs(fft[:half]))

# Stack the per-window spectra into a 2D array (windows x frequency bins)
fft_data = np.array(fft_data)
num_freq_bins = fft_data.shape[1]

# Add a leading axis of length 1 -> (1, windows, frequency bins)
# NOTE(review): this presents the whole recording to Keras as ONE sample;
# confirm that `labels` really holds a single label for it.
fft_data = np.expand_dims(fft_data, axis=0)

# Small 1D CNN ending in a single sigmoid unit
model = tf.keras.Sequential()
model.add(layers.Conv1D(32, kernel_size=3, activation='relu', input_shape=fft_data.shape[1:]))
model.add(layers.MaxPooling1D(pool_size=2))
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid'))

# Compile and train
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(fft_data, labels, epochs=10, batch_size=1)


In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def prepare_lstm_input(float_series, label_series, sequence_length, train_ratio=0.8):
    """Build sliding-window LSTM inputs and a chronological train/test split.

    Window ``i`` holds the scaled feature rows ``i .. i + sequence_length - 1``
    and is paired with the label of its last row.

    Args:
        float_series: Array-like of features, rows = time steps. A 1D input
            is treated as a single feature column.
        label_series: Labels, one row per time step; 1D or 2D (when 2D, the
            last column is used).
        sequence_length: Number of consecutive time steps per window.
        train_ratio: Fraction of the windows, counted from the start, that
            goes into the training set.

    Returns:
        ``(x_train, y_train, x_test, y_test, scaler)`` where the ``x`` arrays
        have shape (windows, sequence_length, features), the ``y`` arrays
        have shape (windows, 1) and ``scaler`` is the fitted ``MinMaxScaler``.
    """
    features = np.asarray(float_series, dtype=float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)

    # Labels are kept apart from the features: writing scaled floats back
    # into a combined array silently truncated them whenever that array had
    # an integer dtype, and stacking failed for 1D labels.
    labels = np.asarray(label_series, dtype=float).reshape(len(features), -1)[:, -1]

    num_samples = max(len(features) - sequence_length + 1, 0)
    split_index = int(train_ratio * num_samples)

    # Fit the scaler only on rows covered by training windows so that the
    # test range does not leak into the scaling parameters. Test values may
    # therefore fall slightly outside [0, 1]. With no training window at
    # all, fall back to fitting on every row.
    scaler = MinMaxScaler()
    if split_index > 0:
        scaler.fit(features[:split_index + sequence_length - 1])
    else:
        scaler.fit(features)
    scaled = scaler.transform(features)

    if num_samples:
        sequences = np.stack([scaled[start:start + sequence_length]
                              for start in range(num_samples)])
    else:
        # Fewer rows than one window: return correctly shaped empty arrays.
        sequences = np.empty((0, sequence_length, features.shape[1]))

    # Label of the last row of each window
    last_row = sequence_length - 1
    targets = labels[last_row:last_row + num_samples].reshape(-1, 1)

    x_train, y_train = sequences[:split_index], targets[:split_index]
    x_test, y_test = sequences[split_index:], targets[split_index:]

    return x_train, y_train, x_test, y_test, scaler


In [None]:
# Settings for building the train / validation / test sets
seq_length = 15      # time steps per input window
i = 1                # target column, 1 in current form
# NOTE(review): the exact meaning of the two fractions is defined by
# split_data, which is not shown here — confirm how val_size is applied.
train_size = 0.75
val_size = 0.75

# Join y and x on their index, then let split_data window and split the result
merged = pd.merge(y, x, left_index=True, right_index=True)
(
    x_train, y_train,
    x_val, y_val,
    x_test, y_test,
    test_dt, train_dt, val_dt,
    scaler,
) = split_data(merged, i, seq_length, train_size, val_size)

In [None]:
# Define the LSTM model architecture.
# Built from the `tf` / `layers` names that are imported earlier in the
# notebook; the bare names Sequential, LSTM, Dropout, Dense and EarlyStopping
# are not imported in any visible cell and fail on a fresh kernel.
model = tf.keras.Sequential([
    layers.LSTM(32, input_shape=(seq_length, x_train.shape[2]), activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),  # sigmoid output for binary classification
])

# Binary cross-entropy is the loss for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy')

# Stop once the validation loss has not improved for 10 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, mode='min')

# Train the model
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=64,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping],
)


In [None]:
# Generate predictions on the test set
y_pred = model.predict(x_test)

# Flatten y_test to 1D by taking its first column. Guarded so that
# re-running this cell does not fail on an already flattened array.
if np.ndim(y_test) > 1:
    y_test = y_test[:, 0]

# Column-vector versions of both arrays, shape (n, 1)
y_test_2d = np.reshape(y_test, (-1, 1))
y_pred_2d = np.reshape(y_pred, (-1, 1))

# Inverse transform
# NOTE(review): `scaler` comes from split_data (not shown). Confirm it was
# fit on this single target column, and that un-scaling the output of a
# sigmoid classifier is intended.
y_test_trans = scaler.inverse_transform(y_test_2d)
y_pred_trans = scaler.inverse_transform(y_pred_2d)

# Mean squared error on matching (n, 1) shapes. Subtracting the 1D y_test
# from a 2D (n, 1) y_pred broadcasts to an (n, n) matrix and averages over
# every prediction/label pair instead of the n matching ones.
mse = np.mean(np.square(y_pred_2d - y_test_2d))

print("Mean squared error:", mse)

# Plot predicted values against true values
plt.scatter(y_test_2d, y_pred_2d)
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.show()

In [None]:
def model_loss(history):
    """Plot the per-epoch training loss and, when recorded, validation loss.

    Args:
        history: Object exposing a ``history`` dict with a ``'loss'``
            sequence and optionally a ``'val_loss'`` sequence, such as the
            value returned by Keras ``model.fit``.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(history.history['loss'], label='Train Loss')

    # 'val_loss' is measured on the validation data given to fit(), not on a
    # held-out test set, and is absent when no validation data was supplied.
    if 'val_loss' in history.history:
        ax.plot(history.history['val_loss'], label='Validation Loss')

    ax.set_title('model loss')
    ax.set_ylabel('loss')
    ax.set_xlabel('epochs')
    ax.legend(loc='upper right')
    plt.show()



In [None]:
# The LSTM model is compiled with loss='binary_crossentropy' and no extra
# metrics, so evaluate() returns that loss as a single scalar. It is neither
# an RMSE nor an MAE and is reported under its real name.
train_score = model.evaluate(x_train, y_train, verbose=0)
print('Train binary cross-entropy loss: %.4f' % train_score)

test_score = model.evaluate(x_test, y_test, verbose=0)
print('Test binary cross-entropy loss: %.4f' % test_score)

model_loss(history)

In [None]:
def prediction_plot(y_test, test_predict, dt, ylabel='Ads Daily Spend', n_ticks=5):
    """Plot actual and predicted values against time.

    Args:
        y_test: Actual values, one per entry of ``dt``.
        test_predict: Predicted values, one per entry of ``dt``.
        dt: Timestamps (anything ``pd.to_datetime`` accepts).
        ylabel: Y-axis label. The default keeps the previous fixed text;
            pass a label that describes the plotted quantity.
        n_ticks: Maximum number of evenly spaced date ticks on the x-axis.
    """
    # Always work with a DatetimeIndex so positional indexing and
    # .strftime behave the same for lists, arrays, Series and indexes.
    dt = pd.DatetimeIndex(pd.to_datetime(dt))

    fig, ax = plt.subplots()
    ax.plot(dt, y_test[:], marker='.', label="actual")
    ax.plot(dt, test_predict[:], 'r', label="prediction")
    ax.set_ylabel(ylabel, size=15)
    ax.set_xlabel('Date', size=15)

    # Evenly spaced tick positions. Skipped for empty input; de-duplicated
    # when there are fewer points than requested ticks.
    if len(dt) > 0 and n_ticks > 0:
        tick_locs = np.unique(np.linspace(0, len(dt) - 1, n_ticks, dtype=int))
        ax.set_xticks(dt[tick_locs])
        ax.set_xticklabels(dt[tick_locs].strftime('%Y-%m-%d'), rotation=45)

    ax.legend(fontsize=15)
    plt.show()


In [None]:
# Actual vs. predicted values over the test timestamps
prediction_plot(y_test=y_test, test_predict=y_pred, dt=test_dt)