# EDA PART 1

In [1]:
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
df =pd.read_csv(r"../Data/datasets/samoilovmikhail/simulated-refrigerator-fault-diagnosis-dataset/versions/1/fridge_fault_timeseries_dataset.csv")

In [3]:
df.head

<bound method NDFrame.head of              T_amb     T_set      T_cab  T_evap_sat  T_cond_sat  P_suc_bar  \
0        22.187599  2.627537  22.111991   18.162151   30.019587   4.964200   
1        22.372591  2.627537  21.962171   18.119603   30.427325   4.905156   
2        22.328321  2.627537  21.838008   17.999535   30.270884   4.875609   
3        22.614706  2.627537  21.775435   18.109126   30.685874   4.895076   
4        22.428984  2.627537  21.661756   17.692930   30.322264   4.876375   
...            ...       ...        ...         ...         ...        ...   
1871995  22.660817  2.396401  24.789994   22.103833   30.903144   5.599258   
1871996  22.918161  2.396401  24.813227   22.063658   31.152460   5.630031   
1871997  22.919958  2.396401  24.839240   22.367267   31.272297   5.679764   
1871998  22.368082  2.396401  24.844353   22.540697   30.453003   5.700113   
1871999  22.840934  2.396401  24.894025   22.228167   31.190394   5.727502   

         P_dis_bar  N_comp_Hz    

In [7]:
print(df['fault'].value_counts())

fault
NORMAL                       144000
COND_FOUL_MILD               144000
COND_FOUL_SEVERE             144000
EVAP_FAN_DEG                 144000
EVAP_FAN_FAIL                144000
UNDERCHARGE_MILD             144000
UNDERCHARGE_SEVERE           144000
OVERCHARGE                   144000
SENSOR_DRIFT_PLUS            144000
SENSOR_DRIFT_MINUS           144000
COMP_INEFFICIENCY            144000
NON_CONDENSABLES             144000
UNDERCHARGE_AND_COND_FOUL    144000
Name: count, dtype: int64


In [None]:
df.isna().mean().sort_values(ascending=False)


In [None]:
df.duplicated().sum()


In [None]:
print(df['fault'].value_counts(normalize=True))


In [None]:
df.groupby('run_id')['time_min'].agg(['min','max','count'])



In [None]:
fig = px.histogram(df, x='fault', title='Distribution of Fault Types')
fig.show()

In [None]:
fig = px.bar(
    df['fault'].value_counts().reset_index(),
    x='count',
    y='fault',
    title='Distribution of Fault Types (Row-Level)',
    labels={'index': 'Fault Type', 'fault': 'Row Count'}
)
fig.show()


In [None]:
run_faults = (
    df.groupby('run_id')['fault_id']
      .first()
      .value_counts()
      .reset_index()
      .rename(columns={'index':'fault_id','fault_id':'num_runs'})
)



In [None]:
run_faults

In [None]:
run_lengths = (
    df.groupby('run_id')['time_min']
    .agg(['min','max','count'])
    .reset_index()
)

fig = px.histogram(
    run_lengths,
    x='count',
    nbins=20,
    title='Distribution of Time Steps per Run'
)
fig.show()


In [None]:
temp_cols = [
    'T_amb','T_set','T_cab','T_cab_meas',
    'T_evap_sat','T_cond_sat'
]

for col in temp_cols:
    fig = px.histogram(
        df, x=col, nbins=100,
        title=f'Distribution of {col}'
    )
    fig.show()


In [None]:
for col in ['P_suc_bar','P_dis_bar']:
    fig = px.histogram(
        df, x=col, nbins=100,
        title=f'Distribution of {col}'
    )
    fig.show()


In [None]:
perf_cols = ['N_comp_Hz','P_comp_W','Q_evap_W','COP']

for col in perf_cols:
    fig = px.histogram(
        df, x=col, nbins=100,
        title=f'Distribution of {col}'
    )
    fig.show()


In [None]:
for col in ['door_open', 'defrost_on']:
    vc = (
        df[col]
        .value_counts()
        .reset_index()
        .rename(columns={'index': col, 'count': 'frequency'})
    )

    fig = px.bar(
        vc,
        x=col,
        y='frequency',
        title=f'{col} Frequency'
    )
    fig.show()


In [None]:
fig = px.histogram(
    df, x='frost_level', nbins=50,
    title='Frost Level Distribution'
)
fig.show()


In [None]:
healthy = df[df['fault'] == 'NORMAL']

corr_cols = [
    'T_amb','T_cab_meas','T_evap_sat','T_cond_sat',
    'P_suc_bar','P_dis_bar','N_comp_Hz',
    'P_comp_W','Q_evap_W','COP'
]

corr = healthy[corr_cols].corr()

fig = px.imshow(
    corr,
    title='Correlation Heatmap (Healthy Runs Only)',
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1
)
fig.show()


# EDA PART 2 TEMPORAL & FAULT BEHAVIOR
`Goal: Understand how faults manifest over time and separate in feature space`

In [None]:
healthy_run = df[df['fault']=='NORMAL']['run_id'].iloc[0]
sub = df[df['run_id'] == healthy_run]

fig = px.line(
    sub, x='time_min', y='COP',
    title='COP Over Time — Healthy Run'
)
fig.show()

faulty_run = df[df['fault']!='NORMAL']['run_id'].iloc[0]
sub = df[df['run_id'] == faulty_run]

fig = px.line(
    sub, x='time_min', y='COP',
    title='COP Over Time — Faulty Run'
)
fig.show()

In [None]:
healthy_run = df[df['fault']=='NORMAL']['run_id'].iloc[0]
sub = df[df['run_id'] == healthy_run]

fig = px.line(
    sub, x='time_min', y='P_dis_bar',
    title='P_dis_bar Over Time — Healthy Run'
)
fig.show()


In [None]:
# 
healthy_run = df[df['fault']=='NORMAL']['run_id'].iloc[0]
sub = df[df['run_id'] == healthy_run]

fig = px.line(
    sub, x='time_min', y='N_comp_Hz',
    title='N_comp_Hz Over Time — Healthy Run'
)
fig.show()


In [None]:
# 
healthy_run = df[df['fault']=='NORMAL']['run_id'].iloc[0]
sub = df[df['run_id'] == healthy_run]

fig = px.line(
    sub, x='time_min', y='T_cab_meas',
    title='T_cab_meas Over Time — Healthy Run'
)
fig.show()

In [None]:
df['health'] = df['fault'].apply(lambda x: 'Healthy' if x=='NORMAL' else 'Faulty')

fig = px.violin(
    df, x='health', y='COP',
    box=True, points=False,
    title='COP: Healthy vs Faulty'
)
fig.show()


In [None]:
df['health'] = df['fault'].apply(lambda x: 'Healthy' if x=='NORMAL' else 'Faulty')

fig = px.violin(
    df, x='health', y='P_dis_bar',
    box=True, points=False,
    title='P_dis_bar: Healthy vs Faulty'
)
fig.show()


In [None]:
df['health'] = df['fault'].apply(lambda x: 'Healthy' if x=='NORMAL' else 'Faulty')

fig = px.violin(
    df, x='health', y='P_comp_W',
    box=True, points=False,
    title='P_comp_W: Healthy vs Faulty'
)
fig.show()


In [None]:
fig = px.box(
    df, x='fault', y='COP',
    title='COP Distribution per Fault Type'
)
fig.show()

fig = px.box(
    df, x='fault', y='P_dis_bar',
    title='P_dis_bar Distribution per Fault Type'
)
fig.show()

fig = px.box(
    df, x='fault', y='N_comp_Hz',
    title='N_comp_Hz Distribution per Fault Type'
)  
fig.show()


In [None]:
fig = px.scatter(
    df.sample(100000),
    x='P_dis_bar',
    y='COP',
    color='fault',
    title='Fault Separability: COP vs Discharge Pressure',
    opacity=0.5
)
fig.show()


In [None]:
sub = df[df['run_id'] == faulty_run].copy()
sub['COP_roll_std'] = sub['COP'].rolling(30).std()

fig = px.line(
    sub, x='time_min', y='COP_roll_std',
    title='Rolling COP Variability (Faulty Run)'
)
fig.show()


# Summary 
The exploratory data analysis reveals that faults in the dataset represent persistent operating regimes rather than time-localized failure events. Healthy system behavior exhibits strong thermodynamic consistency across temperatures, pressures, and compressor power, validating the realism of the simulation. Fault conditions primarily manifest as reduced efficiency, increased variability, and altered inter-variable relationships rather than extreme sensor values. Operational events such as defrost cycles and compressor off periods introduce transient COP drops in both healthy and faulty runs, highlighting the need for sequence-based modeling and contextual feature handling. Overall, the dataset is well-suited for multivariate time-series classification and predictive maintenance modeling using machine learning and deep learning approaches.