In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline Clustering to Classification on Error Data

In [2]:
# read data
PATH = "../../my_data/identification-dataset/my_custom_data/error-identification-dataset.csv"
df = pd.read_csv(PATH)

In [3]:
# drop uncomplete rows
df.dropna(inplace=True)

# save the labels
y_old = df['material']

# drop categorical columns
df.drop('encoded_material', axis=1, inplace=True)
df.drop('material', axis=1, inplace=True)

df.describe()

Unnamed: 0,tdoa12,tdoa21,snr_an1,power_dif_an1,snr_an2,power_dif_an2,an1_rx_snr,an1_rx_powerdif,an1_tof,an2_rx_snr,an2_rx_powerdif,an2_tof,error
count,59871.0,59871.0,59871.0,59871.0,59871.0,59871.0,59871.0,59871.0,59871.0,59871.0,59871.0,59871.0,59871.0
mean,-0.26155,0.243546,108.807438,13.347087,132.862162,12.379578,203.198468,11.113611,5.044038,194.427478,10.951227,5.043107,-0.094614
std,0.210522,2.164604,50.282204,5.176766,19.046488,1.294191,31.160414,1.289327,0.022116,30.615709,1.200196,0.022047,0.210513
min,-9.580582,-373.351807,6.208333,-17.027435,9.0,-6.97921,4.407227,-31.256706,4.95751,0.118671,-35.275032,4.952817,-9.41346
25%,-0.314348,0.15952,78.25,10.623871,120.85714,12.025036,181.545837,10.322666,5.027886,174.125,10.293266,5.027886,-0.146944
50%,-0.211129,0.211129,119.0,11.271515,132.392853,12.568062,202.666672,10.793015,5.041961,194.583328,10.737564,5.041961,-0.044637
75%,-0.15952,0.314348,141.142853,14.279049,145.541672,12.895836,227.149994,11.47039,5.060728,217.5,11.251774,5.056036,0.007467
max,0.286198,4.077143,326.200012,31.497238,214.875,32.832855,330.0,15.155426,5.14518,314.875,36.156631,5.14518,0.45303


In [4]:
# Perform K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(df)

KMeans(n_clusters=5, random_state=42)

In [5]:
# Assign cluster labels to the data
cluster_labels = kmeans.labels_

In [6]:
# Use these cluster labels as target variable for classification
y = cluster_labels

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

In [8]:
# Train a classification model
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=1000)

In [9]:
# Make predictions
y_pred = classifier.predict(X_test)

In [10]:
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the classifier: {accuracy:.2f}')

Accuracy of the classifier: 0.97


## Map the old labels to the new ones

In [11]:
# Create a dictionary to map old labels to cluster labels
label_mapping = {}
for old_label, new_label in zip(y_old, y):
    if old_label not in label_mapping:
        label_mapping[old_label] = [new_label]
    else:
        label_mapping[old_label].append(new_label)

# Print the mapping
for old_label, new_labels in label_mapping.items():
    print(f"Old label {old_label} maps to cluster labels {new_labels}")

Old label cardboard maps to cluster labels [4, 0, 1, 1, 4, 4, 2, 4, 2, 4, 4, 4, 1, 4, 4, 0, 0, 2, 2, 0, 4, 1, 4, 1, 4, 1, 1, 1, 0, 4, 2, 0, 4, 1, 1, 0, 4, 4, 2, 2, 2, 0, 0, 0, 0, 4, 4, 0, 4, 1, 2, 4, 0, 0, 2, 4, 1, 0, 0, 4, 2, 4, 4, 4, 0, 0, 0, 1, 1, 0, 0, 4, 4, 4, 4, 2, 0, 4, 4, 1, 4, 0, 0, 0, 4, 1, 4, 1, 4, 0, 4, 1, 1, 4, 1, 0, 2, 1, 1, 2, 4, 0, 4, 2, 1, 4, 4, 0, 4, 0, 0, 0, 0, 4, 1, 0, 0, 1, 4, 0, 1, 4, 1, 2, 0, 0, 4, 4, 0, 4, 0, 4, 0, 1, 1, 2, 4, 0, 4, 4, 1, 1, 4, 1, 1, 1, 0, 0, 0, 0, 4, 1, 0, 0, 4, 4, 0, 0, 0, 0, 2, 2, 4, 1, 0, 4, 4, 1, 0, 4, 0, 0, 4, 1, 4, 2, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 4, 4, 1, 2, 0, 4, 0, 0, 0, 0, 2, 0, 0, 4, 0, 2, 0, 2, 0, 4, 4, 4, 0, 4, 4, 1, 1, 1, 1, 0, 4, 0, 2, 3, 2, 1, 0, 4, 0, 1, 4, 1, 4, 0, 0, 2, 1, 4, 4, 0, 0, 0, 4, 0, 1, 2, 2, 0, 4, 0, 1, 0, 4, 4, 0, 0, 4, 4, 0, 0, 0, 4, 4, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 4, 1, 1, 1, 2, 0, 0, 0, 1, 1, 1, 0, 4, 0, 0, 2, 4, 1, 4, 0, 0, 0, 1, 0, 4, 1, 0, 4, 4, 4, 4, 1, 0, 1, 1, 1, 0, 0, 0, 3, 2, 0, 2, 0, 2,

In [13]:
# Create a dictionary to map cluster labels to old labels
label_mapping = {}
for new_label, old_label in zip(y, y_old):
    if new_label not in label_mapping:
        label_mapping[new_label] = [old_label]
    else:
        label_mapping[new_label].append(old_label)

# Convert the values to sets to remove duplicates
for key, value in label_mapping.items():
    label_mapping[key] = list(set(value))

# Print the mapping
for new_label, old_labels in label_mapping.items():
    print(f"Old label {new_label} maps to cluster labels {old_labels}")

Old label 4 maps to cluster labels ['cardboard', 'foam', 'wooden-shelf', 'plastic', 'wooden-cabinet']
Old label 0 maps to cluster labels ['cardboard', 'foam', 'wooden-shelf', 'plastic', 'wooden-cabinet']
Old label 1 maps to cluster labels ['cardboard', 'metal', 'foam', 'wooden-shelf', 'plastic', 'wooden-cabinet']
Old label 2 maps to cluster labels ['cardboard', 'foam', 'wooden-shelf', 'plastic', 'wooden-cabinet']
Old label 3 maps to cluster labels ['cardboard', 'metal', 'foam', 'wooden-shelf', 'plastic', 'wooden-cabinet']
