In [16]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch as t

from sklearn.manifold import TSNE
import plotly.graph_objects as go


In [3]:
# Read the raw train and test splits from the local data directory.
train_df, test_df = (pd.read_csv(path) for path in ("data/train.csv", "data/test.csv"))

Preprocess train dataset

In [4]:
# Metadata columns that are excluded from the feature set.
not_sensors = [
    'physical_part_type',
    'message_timestamp',
    'weekday',
    'shift',
    'physical_part_id',
]
# Every other column of the raw training frame, in its original order.
# Note: this list still contains the "label" column, since it is not in not_sensors.
sensors = list(filter(lambda name: name not in not_sensors, train_df.columns.tolist()))

In [5]:
# Remove columns that contain no data at all, then remove the metadata columns.
# errors='ignore' keeps this cell re-runnable (the columns are already gone on a
# second run) and tolerates a metadata column that the all-NaN filter removed.
train_df = (
    train_df
    .dropna(axis=1, how='all')
    .drop(columns=not_sensors, errors='ignore')
)

# Impute the remaining missing values with the mean of their column.
train_df = train_df.fillna(train_df.mean())

In [6]:
# Sanity check: after imputation no cell of the frame may be missing.
assert train_df.notna().all().all()

In [10]:
# Split the preprocessed frame by class.
pos_label = train_df[train_df["label"] == 1]
neg_label = train_df[train_df["label"] == 0]

# Size of the smaller class; both classes are reduced to this many rows.
min_count = min(len(pos_label), len(neg_label))

# Keep only min_count rows per class for visualization. The label-0 class is
# down-sampled only when it is the larger one, so the frame is balanced
# regardless of which class is the majority.
pos_label_samples = pos_label.sample(min_count, random_state=42)
neg_label_samples = (
    neg_label
    if len(neg_label) == min_count
    else neg_label.sample(min_count, random_state=42)
)

# Label-1 rows come first: the plotting cells rely on the first min_count rows
# of the embedding belonging to label 1.
balanced_train_df = pd.concat([pos_label_samples, neg_label_samples])

balanced_train_df.shape[0], balanced_train_df.index

(4094,
 Index([35892, 34229, 26743, 28719, 23717, 15888, 23387, 16729, 27642, 18617,
        ...
        40155, 40159, 40171, 40189, 40203, 40209, 40241, 40269, 40291, 40304],
       dtype='int64', length=4094))

Fit the t-SNE model on the balanced training data for 2D visualization

In [19]:
n_components = 2
# Feature matrix: every column of the balanced frame except the target.
features = balanced_train_df.drop(columns=["label"]).to_numpy()

# Embed the features into 2 dimensions with t-SNE (fixed seed).
tsne = TSNE(random_state=42, n_components=n_components)
train_embedding = tsne.fit_transform(features)

Plot the resulting class representation

In [21]:
# The balanced frame was built with the label-1 rows first, so the first
# min_count embedded points are label 1 and the rest are label 0.
encoded_pos_features, encoded_neg_features = (
    train_embedding[:min_count],
    train_embedding[min_count:],
)

fig = go.Figure()

# One scatter trace per class, each with its own legend entry and colour.
fig.add_traces([
    go.Scatter(
        x=points[:, 0],
        y=points[:, 1],
        mode='markers',
        name=legend_name,
        marker=dict(color=colour),
    )
    for legend_name, colour, points in (
        ('OK', 'blue', encoded_pos_features),
        ('NOK', 'red', encoded_neg_features),
    )
])

# Titles, axis labels and legend heading.
fig.update_layout(
    template='plotly',
    title='2D Scatter Plot with Two Groups',
    legend_title='Groups',
    xaxis_title='Feature 1',
    yaxis_title='Feature 2',
)

fig.show()

Fit the t-SNE model on the same training data for 3D visualization

In [22]:
n_components = 3
# Embed the same feature matrix into 3 dimensions with t-SNE (fixed seed).
tsne = TSNE(random_state=42, n_components=n_components)
train_embedding_3D = tsne.fit_transform(features)

In [24]:
# First min_count rows are the label-1 samples, the remainder the label-0 samples.
encoded_pos_features, encoded_neg_features = (
    train_embedding_3D[:min_count],
    train_embedding_3D[min_count:],
)

In [26]:
fig = go.Figure()

# One 3D scatter trace per class.
fig.add_traces([
    go.Scatter3d(
        x=points[:, 0],
        y=points[:, 1],
        z=points[:, 2],
        mode='markers',
        marker=dict(size=5, color=colour),
        name=legend_name,
    )
    for legend_name, colour, points in (
        ('OK', 'blue', encoded_pos_features),
        ('NOK', 'red', encoded_neg_features),
    )
])

# Figure title and axis labels of the 3D scene.
fig.update_layout(
    scene=dict(
        xaxis_title='X-axis',
        yaxis_title='Y-axis',
        zaxis_title='Z-axis',
    ),
    title="Interactive 3D Scatter Plot with Plotly",
)

fig.show()

# Plot the t-SNE representation calculated only for the important features, preprocessed by SmartNormalizer

In [7]:
# NOTE(review): wildcard import. The next cell uses add_weather_data, target_col,
# ok_val, important_columns and SmartNormalizerDF, which presumably all come from
# this module — confirm, then import them explicitly in the top import cell.
from app.data_processing import *

In [9]:
# Load the training data, add the weather features and drop the two id/timestamp columns.
# NOTE(review): the earlier cell reads "data/train.csv" — confirm this path is intended.
df = pd.read_csv("train.csv")
df = add_weather_data(df, weather_data_path="kaggle/weather.csv").drop(columns=["message_timestamp", "physical_part_id"])

# Boolean target: True where the target column equals ok_val.
target = df[target_col] == ok_val
df = df.drop(columns=[target_col])
# Restrict the features to the columns listed in important_columns.
df = df[important_columns]

# 80/20 train/validation split. The fixed seed makes the split (and therefore
# the fitted normalizer and the embeddings below) reproducible across runs.
df_train = df.sample(frac=0.8, random_state=42)
df_valid = df.drop(df_train.index)

# Fit the normalizer on the training rows only, then apply it to both splits.
smart_normalizer = SmartNormalizerDF(two_col=True)
smart_normalizer.fit(df_train)
df_train = smart_normalizer.transform(df_train)
# Align the targets with each split by row label.
target_train = target.loc[df_train.index]
df_valid = smart_normalizer.transform(df_valid)
target_valid = target.loc[df_valid.index]

In [13]:
# Preview the first rows of the training features after smart_normalizer.transform.
df_train.head()

Unnamed: 0,s8_sensor100_millimeter_step1,shift,weekday,s4_sensor16_minuten (zeit)_step1,s5_sensor0_sekunden (zeit)_step1,s10_sensor2_gramm_step1,s3_sensor0_km_step1,s7_sensor26_mikroohm_step1,s8_sensor32_millimeter_step1,s10_sensor0_minuten (zeit)_step1,...,s5_sensor0_sekunden (zeit)_step1_ok,s10_sensor2_gramm_step1_ok,s3_sensor0_km_step1_ok,s7_sensor26_mikroohm_step1_ok,s8_sensor32_millimeter_step1_ok,s10_sensor0_minuten (zeit)_step1_ok,temperature_2m_ok,relative_humidity_2m_ok,precipitation_ok,pressure_msl_ok
17059,1.770527,2.0,3.0,0.216904,0.681819,0.362241,-1.559122,0.191052,-0.311987,1.491217,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
26839,0.478865,2.0,4.0,-0.781781,-0.470292,-1.275817,-0.666564,0.559592,0.843427,-0.544847,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6189,0.449323,1.0,4.0,0.13971,0.032027,0.375793,-0.76471,-0.335333,0.041156,1.260387,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3031,0.335333,2.0,0.0,1.120205,1.355888,0.900834,0.276575,-0.152506,0.423792,-0.178175,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
35252,0.144826,0.0,4.0,-1.008673,-0.86393,-0.53022,0.594484,-0.242903,0.074218,-5.199338,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Preview the boolean targets aligned with the training rows.
target_train.head()

17059    True
26839    True
6189     True
3031     True
35252    True
Name: label, dtype: bool

In [17]:
n_components = 2
# Feature matrix: the transformed training frame as a plain array.
features = df_train.to_numpy()

# Embed the features into 2 dimensions with t-SNE (fixed seed).
tsne = TSNE(random_state=42, n_components=n_components)
train_embedding = tsne.fit_transform(features)

In [28]:
# Replace the sampled row labels with a fresh 0..n-1 index (old labels are discarded).
target_train = target_train.reset_index(drop=True)
target_train.head()

0    True
1    True
2    True
3    True
4    True
Name: label, dtype: bool

In [32]:
# Split the embedded points with the boolean target: True rows vs. False rows.
encoded_pos_features = train_embedding[target_train.to_numpy()]
encoded_neg_features = train_embedding[~target_train.to_numpy()]

fig = go.Figure()

# One scatter trace per class, each with its own legend entry and colour.
fig.add_traces([
    go.Scatter(
        x=points[:, 0],
        y=points[:, 1],
        mode='markers',
        name=legend_name,
        marker=dict(color=colour),
    )
    for legend_name, colour, points in (
        ('OK', 'blue', encoded_pos_features),
        ('NOK', 'red', encoded_neg_features),
    )
])

# Titles, axis labels and legend heading.
fig.update_layout(
    template='plotly',
    title='2D Scatter Plot with Two Groups',
    legend_title='Groups',
    xaxis_title='Feature 1',
    yaxis_title='Feature 2',
)

fig.show()

In [33]:
n_components = 3
# Embed the same feature matrix into 3 dimensions with t-SNE (fixed seed).
# Note: this rebinds train_embedding, replacing the 2D embedding computed earlier.
tsne = TSNE(random_state=42, n_components=n_components)
train_embedding = tsne.fit_transform(features)

In [34]:
# Split the embedded points with the boolean target: True rows vs. False rows.
encoded_pos_features = train_embedding[target_train.to_numpy()]
encoded_neg_features = train_embedding[~target_train.to_numpy()]

In [35]:
fig = go.Figure()

# One 3D scatter trace per class.
fig.add_traces([
    go.Scatter3d(
        x=points[:, 0],
        y=points[:, 1],
        z=points[:, 2],
        mode='markers',
        marker=dict(size=5, color=colour),
        name=legend_name,
    )
    for legend_name, colour, points in (
        ('OK', 'blue', encoded_pos_features),
        ('NOK', 'red', encoded_neg_features),
    )
])

# Figure title and axis labels of the 3D scene.
fig.update_layout(
    scene=dict(
        xaxis_title='X-axis',
        yaxis_title='Y-axis',
        zaxis_title='Z-axis',
    ),
    title="Interactive 3D Scatter Plot with Plotly",
)

fig.show()