In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the CSV file into a pandas DataFrame
df_model_a = pd.read_csv('../../data/processed/equipment_shift_risk_10k.csv')

In [8]:
print(df_model_a.info())
print(df_model_a.isnull().sum())
print(df_model_a.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id_record                  10000 non-null  object 
 1   id_alat                    10000 non-null  object 
 2   tanggal_operasi            10000 non-null  object 
 3   shift                      10000 non-null  object 
 4   jam_mulai                  10000 non-null  object 
 5   jam_selesai                10000 non-null  object 
 6   status_operasi             10000 non-null  object 
 7   durasi_jam                 10000 non-null  float64
 8   material_dipindah          7034 non-null   object 
 9   total_muatan_ton           10000 non-null  float64
 10  jumlah_ritase              10000 non-null  int64  
 11  id_operator                10000 non-null  object 
 12  lokasi_kode_final          10000 non-null  object 
 13  tipe_alat                  10000 non-null  obje

In [9]:
df_model_a['tanggal_operasi'] = pd.to_datetime(df_model_a['tanggal_operasi'])

In [10]:
df_model_a['waktu_mulai_operasi'] = df_model_a.apply(
    lambda row: row['tanggal_operasi'].replace(
        hour=int(row['jam_mulai'].split(':')[0]),
        minute=int(row['jam_mulai'].split(':')[1]),
        second=int(row['jam_mulai'].split(':')[2])
    ), 
    axis=1
)

In [11]:
df_model_a['waktu_selesai_operasi'] = df_model_a.apply(
    lambda row: row['tanggal_operasi'].replace(
        hour=int(row['jam_selesai'].split(':')[0]),
        minute=int(row['jam_selesai'].split(':')[1]),
        second=int(row['jam_selesai'].split(':')[2])
    ), 
    axis=1
)

In [12]:
# Fill missing values
df_model_a['material_dipindah'] = df_model_a['material_dipindah'].fillna('Unknown')

In [13]:
print("First 5 rows of the DataFrame after initial preprocessing:")
print(df_model_a.head())
print("\nDataFrame Info after initial preprocessing:")
print(df_model_a.info())

First 5 rows of the DataFrame after initial preprocessing:
        id_record   id_alat tanggal_operasi    shift jam_mulai jam_selesai  \
0  OPRREC_0000001  ALAT_047      2025-07-01  Shift 2  14:00:00    17:43:00   
1  OPRREC_0000002  ALAT_047      2025-07-01  Shift 1  06:00:00    11:19:00   
2  OPRREC_0000003  ALAT_082      2025-07-01  Shift 3  22:00:00    23:50:00   
3  OPRREC_0000004  ALAT_082      2025-07-01  Shift 1  06:00:00    11:50:00   
4  OPRREC_0000005  ALAT_037      2025-07-01  Shift 2  14:00:00    18:42:00   

  status_operasi  durasi_jam material_dipindah  total_muatan_ton  ...  \
0        Standby        3.72        Overburden               0.0  ...   
1     Beroperasi        5.33        Overburden            1653.1  ...   
2        Standby        1.84        Overburden               0.0  ...   
3     Beroperasi        5.84        Overburden             819.8  ...   
4     Beroperasi        4.71           Unknown               0.0  ...   

   suhu_min_c suhu_max_c kelembab

In [14]:
# Import necessary libraries
from sklearn.preprocessing import StandardScaler

# 1. Sort the DataFrame by 'waktu_mulai_operasi'
df_model_a_sorted = df_model_a.sort_values(by='waktu_mulai_operasi').reset_index(drop=True)

# 2. Calculate the split points for training (80%), validation (10%), and test (10%) sets
total_rows = len(df_model_a_sorted)
train_size = int(0.8 * total_rows)
val_size = int(0.1 * total_rows)

df_train_a = df_model_a_sorted.iloc[:train_size].copy()
df_val_a = df_model_a_sorted.iloc[train_size:train_size + val_size].copy()
df_test_a = df_model_a_sorted.iloc[train_size + val_size:].copy()

# 4. Separate features (X) and target (y) for each split
target_column_a = 'label_breakdown_next_24h'

# Columns to exclude from features
exclude_cols_a = [
    'id_record', 'id_alat', 'id_operator', 'tanggal_operasi',
    'jam_mulai', 'jam_selesai', 'waktu_mulai_operasi', 'waktu_selesai_operasi',
    'label_breakdown_shift_ini', target_column_a
]

X_train_a = df_train_a.drop(columns=exclude_cols_a)
y_train_a = df_train_a[target_column_a]

X_val_a = df_val_a.drop(columns=exclude_cols_a)
y_val_a = df_val_a[target_column_a]

X_test_a = df_test_a.drop(columns=exclude_cols_a)
y_test_a = df_test_a[target_column_a]

# 5. Identify all remaining categorical columns in X_train_a
categorical_cols_time_split = X_train_a.select_dtypes(include='object').columns.tolist()

# 6. Apply one-hot encoding to these categorical columns across all three sets
# Combine all dataframes for consistent one-hot encoding
all_X_a = pd.concat([X_train_a, X_val_a, X_test_a], ignore_index=True)
all_X_a_encoded = pd.get_dummies(all_X_a, columns=categorical_cols_time_split, drop_first=True, dtype=int)

# Split back into train, val, test after encoding
X_train_a_encoded = all_X_a_encoded.iloc[:len(X_train_a)]
X_val_a_encoded = all_X_a_encoded.iloc[len(X_train_a) : len(X_train_a) + len(X_val_a)]
X_test_a_encoded = all_X_a_encoded.iloc[len(X_train_a) + len(X_val_a) :]

# 7. Identify numerical features in the one-hot encoded X_train_a
numerical_cols_time_split = X_train_a_encoded.select_dtypes(include=['float64', 'int64', 'int32']).columns.tolist()

# 8. Initialize a StandardScaler and fit it only on the numerical features
scaler_time_split = StandardScaler()
scaler_time_split.fit(X_train_a_encoded[numerical_cols_time_split])

# 9. Transform the numerical features in all sets
X_train_a_encoded[numerical_cols_time_split] = scaler_time_split.transform(X_train_a_encoded[numerical_cols_time_split])
X_val_a_encoded[numerical_cols_time_split] = scaler_time_split.transform(X_val_a_encoded[numerical_cols_time_split])
X_test_a_encoded[numerical_cols_time_split] = scaler_time_split.transform(X_test_a_encoded[numerical_cols_time_split])

print("Time-based split and preprocessing for Model A completed.")
print(f"X_train_a_encoded shape: {X_train_a_encoded.shape}")
print(f"y_train_a shape: {y_train_a.shape}")
print(f"X_val_a_encoded shape: {X_val_a_encoded.shape}")
print(f"y_val_a shape: {y_val_a.shape}")
print(f"X_test_a_encoded shape: {X_test_a_encoded.shape}")
print(f"y_test_a shape: {y_test_a.shape}")

Time-based split and preprocessing for Model A completed.
X_train_a_encoded shape: (8000, 58)
y_train_a shape: (8000,)
X_val_a_encoded shape: (1000, 58)
y_val_a shape: (1000,)
X_test_a_encoded shape: (1000, 58)
y_test_a shape: (1000,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_a_encoded[numerical_cols_time_split] = scaler_time_split.transform(X_train_a_encoded[numerical_cols_time_split])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val_a_encoded[numerical_cols_time_split] = scaler_time_split.transform(X_val_a_encoded[numerical_cols_time_split])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#

In [15]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# --- Retrain Model A with Time-based Split Data ---
# Initialize Logistic Regression model with specified parameters
# Ensure class_weight='balanced' to handle potential class imbalance
model_a_time_split = LogisticRegression(
    random_state=42, 
    solver='liblinear', 
    class_weight='balanced'
)

# Train the model using the encoded training data from the time-based split
model_a_time_split.fit(X_train_a_encoded, y_train_a)

# Evaluate the retrained model on the encoded validation set
y_pred_val_a = model_a_time_split.predict(X_val_a_encoded)
y_proba_val_a = model_a_time_split.predict_proba(X_val_a_encoded)[:, 1]

print("--- Model A (Time-based Split): Logistic Regression Performance on Validation Set ---")
print("Classification Report:")
print(classification_report(y_val_a, y_pred_val_a))
print(f"ROC AUC Score: {roc_auc_score(y_val_a, y_proba_val_a):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_val_a, y_pred_val_a))

--- Model A (Time-based Split): Logistic Regression Performance on Validation Set ---
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       902
           1       0.37      0.45      0.41        98

    accuracy                           0.87      1000
   macro avg       0.65      0.68      0.67      1000
weighted avg       0.88      0.87      0.88      1000

ROC AUC Score: 0.7226
Confusion Matrix:
[[827  75]
 [ 54  44]]
