In [15]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the two CSV files

home_data = pd.read_csv('./Data/Home_Data/combined_sensor_dataset_latest.csv')
# df2 = pd.read_csv('./Data/Prof_Office_Data/office_actuator_records.csv')

#original_df = df1.copy() #Keep a copy to compare
## Toggle the values of Light Sensor.
home_data.loc[home_data['sensor_name'] == 'Light Sensor', 'state'] = 1 - home_data['state']

# Change the continuous values to 1. We only need to check if the sensor is active or not
home_data.loc[home_data['sensor_name'].isin(['Kitchen Humidity', 'Kitchen Temperature', 'Bedroom Humidity', 'Bedroom Temperature', 'Washroom Temperature', 'Washroom Humidity']), 'state'] = 1

# Convert the 'datetime' column to a datetime data type if necessary
home_data['datetime'] = pd.to_datetime(home_data['seconds'], unit='s')
# combined_office_data['datetime'] = pd.to_datetime(combined_office_data['date'] + ' ' + combined_office_data['time'], format='%d/%m/%y %H:%M')

# Sort by the 'datetime' column
home_data = home_data.sort_values(by='datetime')

# Display the sorted, combined DataFrame
print(home_data.head)
unique_sensor = home_data.sensor_name.unique()
print(unique_sensor)

  home_data = pd.read_csv('./Data/Home_Data/combined_sensor_dataset_latest.csv')


<bound method NDFrame.head of              date   time       seconds  state          sensor_name  \
0        08/05/23  15:26  1.691267e+09    1.0     Kitchen Humidity   
1        08/05/23  15:26  1.691267e+09    1.0  Kitchen Temperature   
2        08/05/23  15:27  1.691267e+09    1.0     Kitchen Humidity   
3        08/05/23  15:30  1.691267e+09    1.0     Kitchen Humidity   
4        08/05/23  15:31  1.691268e+09    1.0     Kitchen Humidity   
...           ...    ...           ...    ...                  ...   
6289486  06/20/24  07:56  1.718888e+09    0.0  Motion Outside Room   
6289487  06/20/24  07:56  1.718888e+09    1.0  Motion Outside Room   
6289488  06/20/24  07:56  1.718888e+09    0.0  Motion Outside Room   
6289489  06/20/24  07:56  1.718888e+09    1.0  Motion Outside Room   
6289490  06/20/24  07:56  1.718888e+09    0.0  Motion Outside Room   

              thing_name      thing_ip thing_ip0                      datetime  
0              KitchenPi  192.168.4.36       NaN

In [16]:
from sklearn.ensemble import IsolationForest

def anomaly_detection_and_removal(df, timeWindow = 'h'):
    df = df.set_index('datetime') #make the date column as index

    # combined_office_data_filtered = combined_office_data.between_time('07:00', '19:00')
    # print(combined_office_data.size, combined_office_data_filtered.size)
    df_hourly = df.state.resample(timeWindow).sum() #resample on hourly basis and get summary of sensor values on 2 hours.

    # df_hourly = df_hourly.between_time('09:00', '18:00')
    df_hourly = df_hourly.reset_index()
    
    values = df_hourly[['state']]  # Selecting the 'values' column as input for the model

    # Initialize and fit the IsolationForest model
    model = IsolationForest(contamination='auto', random_state=42)
    df_hourly['anomaly'] = model.fit_predict(values)

    # Isolation Forest outputs:
    # -1 for anomalies
    #  1 for normal data points

    # Filter the anomalies
    # anomalies = lab_hourly[lab_hourly['anomaly'] == -1]
    regular = df_hourly[df_hourly['anomaly'] == 1]
    

    regular_date = regular.set_index('datetime')
    regular_idx = regular_date.index
    # print(regular_idx)

    # Filtering out the hourly anomalous data from the original data
    filtered_df = df[df.index.floor(timeWindow).isin(regular_idx)].reset_index()
    return filtered_df

In [17]:
#FNE and TD implementations
from math import ceil
from sklearn.cluster import SpectralClustering
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score
from neighbor_group import *
from sklearn.manifold import SpectralEmbedding

def frequent_next_event(df, adjacency_matrix):
    prev_row = df.iloc[0]
    for _,cur_row in df.iterrows():
        if (cur_row.sensor_name != prev_row.sensor_name):
            t = max(ceil(cur_row.seconds - prev_row.seconds), 1.0)
            # t = max(ceil((cur_row.DateTime - prev_row.DateTime).total_seconds()),1.0)
            adjacency_matrix[prev_row.sensor_name][cur_row.sensor_name] += 1/t
        prev_row = cur_row
    
    return adjacency_matrix

def time_delta(df, adjacency_matrix, th=2):
    cur_group = []
    prev_row = df.iloc[0]
    for _, cur_row in df.iterrows():
        if ((cur_row.seconds - prev_row.seconds) >= th):
            cur_group = []
        if not(cur_row.sensor_name in cur_group):
            cur_group.append(cur_row.sensor_name)
        for item in cur_group:
            adjacency_matrix[item][cur_row.sensor_name] += 1
        prev_row = cur_row
    
    return adjacency_matrix


def get_sensor_groups(adjacency_matrix):
    # Fixed cluster number
    n = 2
    max_ch = 0
    stop_iter = 10
    cluster_score_dict = {}
    patience = 0

    spectral_clustering = SpectralClustering(3, affinity="precomputed")
    cluster = spectral_clustering.fit_predict(adjacency_matrix)
    embedding = SpectralEmbedding(n_components=2, affinity='precomputed')
        
    features = embedding.fit_transform(adjacency_matrix)

    ch_score = calinski_harabasz_score(features, cluster)
    silhouette_avg = silhouette_score(features, cluster)
    db_score = davies_bouldin_score(features, cluster)

    cluster_score_dict['cluster_number'] = 3
    cluster_score_dict['ch-score'] = ch_score
    cluster_score_dict['silhoutte-score'] = silhouette_avg
    cluster_score_dict['db-score'] = db_score
    cluster_score_dict['cluster'] = cluster
    print(cluster_score_dict)

    cluster_dict = get_cluster_sensor_list(cluster_score_dict['cluster'], adjacency_matrix)
    adjacency_matrix_list, unique_sensors = get_adjacency_matrix_list(cluster_dict, adjacency_matrix)

    # Get groups based on fixed nearest nodes
    sensor_group = {}
    sensor_number_each_group = 3  # Select the sensor numbers that will used to pick top nearest nodes
    i = 1 
    for index, matrix in enumerate(adjacency_matrix_list):
        groups = get_groups(matrix, sensor_number_each_group, unique_sensors[index])
        for group in groups:
            item_list = []
            for item in group:
                item_list.append(item)
            sensor_group['G'+ str(i)] = item_list
            i += 1

    # sensor_group_df = pd.DataFrame(sensor_group)
    print(sensor_group)
    return cluster_score_dict

In [18]:
unique_sensor = home_data.sensor_name.unique()

fnq_adjacency_matrix = pd.DataFrame(0.0, columns=unique_sensor, index=unique_sensor)
td_adjacency_matrix = pd.DataFrame(0.0, columns=unique_sensor, index=unique_sensor)

print('Frequent next event starts')
print('---------------------------')
fnq_adjacency_matrix = frequent_next_event(home_data, fnq_adjacency_matrix)
print('---------------------------')
print('Frequent next event ends')

print('Time Delta starts')
print('---------------------------')
# td_adjacency_matrix = time_delta(home_data, td_adjacency_matrix)
print('---------------------------')
print('Time Delta ends')

fne_scores = get_sensor_groups(adjacency_matrix=fnq_adjacency_matrix)

print("~~~ Time Delta results ~~~")

# td_scores = get_sensor_groups(td_adjacency_matrix)

# FNE
#{'cluster_number': 2, 'ch-score': 22.25262816159003, 'silhoutte-score': 0.6379307129426132, 'db-score': 0.4079323384378, 'cluster': array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 1])}

#TD
#{'cluster_number': 2, 'ch-score': 23.354525308582886, 'silhoutte-score': 0.5507230369751293, 'db-score': 0.5885350572305312, 'cluster': array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 0])}

#FNE
#{'cluster_number': 3, 'ch-score': 62.2086134386679, 'silhoutte-score': 0.7269994505079088, 'db-score': 0.29789624746318877, 'cluster': array([0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    #    0, 1])}

#TD
# {'cluster_number': 3, 'ch-score': 50.57308549964048, 'silhoutte-score': 0.5893114768248414, 'db-score': 0.5028462040428648, 'cluster': array([0, 0, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
#        1, 2])}

Frequent next event starts
---------------------------


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adjacency_matrix[prev_row.sensor_name][cur_row.sensor_name] += 1/t


---------------------------
Frequent next event ends
Time Delta starts
---------------------------
---------------------------
Time Delta ends
{'cluster_number': 3, 'ch-score': 62.20861343866787, 'silhoutte-score': 0.7269994505079085, 'db-score': 0.29789624746318927, 'cluster': array([0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1])}
['Kitchen Humidity', 'Kitchen Temperature', 'Entrance Motion', 'Entrance Door', 'Motion Outside Room', 'Motion Inside Room(East Corner)', 'Room Door', 'Closet Door', 'Closet Light', 'Desk Right Sonar', 'Desk Right Motion', 'Desk Left Motion', 'Desk Left Sonar', 'Desk Left Light', 'Kitchen Light', 'Kitchen Motion', 'Washroom Door', 'Washroom Motion']
['Bedroom Humidity', 'Motion Inside Room(West Corner)', 'Bedroom Temperature', 'Bedroom Light']
['Washroom Temperature', 'Washroom Humidity']
number of unique group  13
('Entrance Door', 'Entrance Motion', 'Kitchen Motion')
('Desk Left Motion', 'Desk Left Sonar', 'Desk Right Sonar

  adjacency = check_symmetric(adjacency)
  adjacency = check_symmetric(adjacency)


In [20]:
unique_sensor = home_data.sensor_name.unique()

fnq_adjacency_matrix_filtered = pd.DataFrame(0.0, columns=unique_sensor, index=unique_sensor)
td_adjacency_matrix_filtered = pd.DataFrame(0.0, columns=unique_sensor, index=unique_sensor)

# filtered data after detecting and removing anomaly based on 1 hour data partition
fd = anomaly_detection_and_removal(home_data, '4h')
print('Frequent next event starts')
print('---------------------------')
fnq_adjacency_matrix_filtered = frequent_next_event(fd, fnq_adjacency_matrix_filtered)
print('---------------------------')
print('Frequent next event ends')

print('Time Delta starts')
print('---------------------------')
# td_adjacency_matrix_filtered = time_delta(fd, td_adjacency_matrix_filtered)
print('---------------------------')
print('Time Delta ends')

fne_scores_filtered = get_sensor_groups(adjacency_matrix=fnq_adjacency_matrix_filtered)

print("~~~ Time Delta results ~~~")

# td_scores_filtered = get_sensor_groups(td_adjacency_matrix_filtered)

Frequent next event starts
---------------------------


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adjacency_matrix[prev_row.sensor_name][cur_row.sensor_name] += 1/t


---------------------------
Frequent next event ends
Time Delta starts
---------------------------
---------------------------
Time Delta ends
{'cluster_number': 3, 'ch-score': 43.295352171348284, 'silhoutte-score': 0.6044818314977246, 'db-score': 0.4837161590397774, 'cluster': array([2, 2, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1])}
['Kitchen Humidity', 'Kitchen Temperature', 'Washroom Temperature', 'Washroom Humidity']
['Bedroom Humidity', 'Motion Inside Room(West Corner)', 'Bedroom Temperature', 'Closet Light', 'Bedroom Light']
['Entrance Motion', 'Entrance Door', 'Motion Outside Room', 'Motion Inside Room(East Corner)', 'Room Door', 'Closet Door', 'Desk Right Sonar', 'Desk Right Motion', 'Desk Left Motion', 'Desk Left Sonar', 'Desk Left Light', 'Kitchen Light', 'Kitchen Motion', 'Washroom Door', 'Washroom Motion']
number of unique group  2
('Kitchen Humidity', 'Kitchen Temperature', 'Washroom Humidity')
('Kitchen Humidity', 'Washroom Humidity', 'Washr

  adjacency = check_symmetric(adjacency)
  adjacency = check_symmetric(adjacency)
