In [23]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the two CSV files
# df1 = pd.read_csv('./Data/Prof_Office_Data/office_group_records_07_02.csv')

df1 = pd.read_csv('./Data/Prof_Office_Data/office_group_records_all.csv')
# df2 = pd.read_csv('./Data/Prof_Office_Data/office_actuator_records.csv')

#original_df = df1.copy() #Keep a copy to compare
## Toggle the values of Light Sensor.
df1.loc[df1['sensor_name'] == 'Light Sensor', 'state'] = 1 - df1['state']

# # Find rows where 'state' has changed
# changed_rows = df1[df1['state'] != original_df['state']]

# # Display only the changed rows
# print(changed_rows)

#remove the motion sensor data
# filtered_df1 = df1[df1['sensor_name'] != 'Motion Sensor']
# print(filtered_df1)


# Combine the two DataFrames (concatenating rows)
# combined_office_data = pd.concat([df1, df2], ignore_index=True)
combined_office_data = df1

# Convert the 'datetime' column to a datetime data type if necessary
combined_office_data['datetime'] = pd.to_datetime(combined_office_data['seconds'], unit='s')
# combined_office_data['datetime'] = pd.to_datetime(combined_office_data['date'] + ' ' + combined_office_data['time'], format='%d/%m/%y %H:%M')

# Sort by the 'datetime' column
combined_office_data = combined_office_data.sort_values(by='datetime')

# Display the sorted, combined DataFrame
# print(combined_office_data.head)
unique_items = combined_office_data['sensor_name'].unique()
print(unique_items)
print(combined_office_data.shape)

['Office Door Sensor' 'Office Door Motion Sensor' 'Desk Motion Sensor'
 'Office Light Sensor']
(121535, 8)


In [2]:
from sklearn.ensemble import IsolationForest

def anomaly_detection_and_removal(df, timeWindow = 'h'):
    df = df.set_index('datetime') #make the date column as index

    # combined_office_data_filtered = combined_office_data.between_time('07:00', '19:00')
    # print(combined_office_data.size, combined_office_data_filtered.size)
    df_hourly = df.state.resample(timeWindow).sum() #resample on hourly basis and get summary of sensor values on 2 hours.

    df_hourly = df_hourly.between_time('08:00', '18:00')
    df_hourly = df_hourly.reset_index()
    
    values = df_hourly[['state']]  # Selecting the 'values' column as input for the model

    # Initialize and fit the IsolationForest model
    model = IsolationForest(contamination='auto', random_state=42)
    df_hourly['anomaly'] = model.fit_predict(values)

    # Isolation Forest outputs:
    # -1 for anomalies
    #  1 for normal data points

    # Filter the anomalies
    # anomalies = lab_hourly[lab_hourly['anomaly'] == -1]
    regular = df_hourly[df_hourly['anomaly'] == 1]
    

    regular_date = regular.set_index('datetime')
    regular_idx = regular_date.index
    # print(regular_idx)

    # Filtering out the hourly anomalous data from the original data
    filtered_df = df[df.index.floor(timeWindow).isin(regular_idx)].reset_index()
    return filtered_df

In [12]:
from math import ceil
from sklearn.cluster import SpectralClustering
from sklearn.manifold import SpectralEmbedding
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score


def get_adjMat_FNE(data):
    u = data.sensor_name.unique()
    adjMatrix = pd.DataFrame(0.0, columns=u, index=u)
    prevRow = data.iloc[0]
    for _,curRow in data.iterrows():
        if (curRow.sensor_name != prevRow.sensor_name):
            t = max(ceil(curRow.seconds - prevRow.seconds), 1.0)
            # t = max(ceil((curRow.DateTime - prevRow.DateTime).total_seconds()),1.0)
            adjMatrix[prevRow.sensor_name][curRow.sensor_name] += 1/t
        prevRow = curRow
    
    return adjMatrix

def get_adjMat_TD(data):
    u = data.sensor_name.unique()
    adjMatrix = pd.DataFrame(0.0, columns=u, index=u)
    curGroup = []
    th = 2
    prevRow = data.iloc[0]
    for _, curRow in data.iterrows():
        if ((curRow.seconds - prevRow.seconds) >= th):
            curGroup = []
        if not(curRow.sensor_name in curGroup):
            curGroup.append(curRow.sensor_name)
        for item in curGroup:
            adjMatrix[item][curRow.sensor_name] += 1
        prevRow = curRow
    
    return adjMatrix

def normalize(df_adj):
    r=0
    norm = df_adj.copy()
    for index, row in norm.iterrows():
        mx = 0
        mn = 0
        c = 0
        for value in row:
            mx = max(mx, value)
            mn = min(mn, value)
            if not mx-mn == 0:
                norm.iloc[r,c] = (value-mn)/(mx-mn)
            c += 1
        r += 1
    return norm

def get_clusters_and_scores(adjMat, n_clusters):
    #normalize matrix
    adjMat = normalize(adjMat)
    sp = SpectralClustering(n_clusters,affinity="precomputed")
    clus = sp.fit_predict(adjMat)
    display(pd.DataFrame(clus,index=adjMat.index))

    embedding = SpectralEmbedding(n_components=2, affinity='precomputed')    
    features = embedding.fit_transform(adjMat)

    cluster_score_dict = {}
    ch_score = calinski_harabasz_score(features, clus)
    silhouette_avg = silhouette_score(features, clus)
    db_score = davies_bouldin_score(features, clus)

    cluster_score_dict['cluster_number'] = n_clusters
    cluster_score_dict['ch-score'] = ch_score
    cluster_score_dict['silhoutte-score'] = silhouette_avg
    cluster_score_dict['db-score'] = db_score
    cluster_score_dict['cluster'] = clus
    print(cluster_score_dict)

In [14]:
adjMat_FNE = get_adjMat_FNE(combined_office_data)
get_clusters_and_scores(adjMat_FNE,2)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adjMatrix[prevRow.sensor_name][curRow.sensor_name] += 1/t
  adjacency = check_symmetric(adjacency)


Unnamed: 0,0
Office Door Sensor,0
Office Door Motion Sensor,0
Desk Motion Sensor,0
Office Light Sensor,1


{'cluster_number': 2, 'ch-score': 2.518521120058155, 'silhoutte-score': 0.2280208548149661, 'db-score': 0.430312430250712, 'cluster': array([0, 0, 0, 1])}


  adjacency = check_symmetric(adjacency)


In [None]:
fd = anomaly_detection_and_removal(combined_office_data, '4h')
print(fd.shape)
adjMat_FNE_Filtered = get_adjMat_FNE(fd)
get_clusters_and_scores(adjMat_FNE_Filtered,2)

(919, 8)


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adjMatrix[prevRow.sensor_name][curRow.sensor_name] += 1/t
  adjacency = check_symmetric(adjacency)


Unnamed: 0,0
Office Door Sensor,0
Office Door Motion Sensor,0
Office Light Sensor,1


{'cluster_number': 2, 'ch-score': 1.7204743874212625, 'silhoutte-score': 0.12849319496915165, 'db-score': 0.4401647571661937, 'cluster': array([0, 0, 1])}


  adjacency = check_symmetric(adjacency)
  _, diffusion_map = eigsh(


: 