In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

## TASK 1: Predict Product Failures:

In [None]:
# Load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Stack train and test so the categorical columns share one label mapping.
# `is_test_set` records where each row came from so the frames can be split
# apart again after preprocessing.
train_df = train.copy()
test_df = test.copy()
train_df['is_test_set'] = 0
test_df['is_test_set'] = 1
combined = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
is_train_row = combined['is_test_set'] == 0

# Categorical encoding: one encoder per column, kept in `label_encoders`
# so every mapping stays available (a single shared encoder would only
# remember the last column it was fitted on).
label_encoders = {}
for col in ['attribute_0', 'attribute_1', 'product_code']:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))
    label_encoders[col] = le

# Missing value imputation (median).
# The medians are learned from the training rows only and then applied to
# every row, so no test-set statistics leak into the training features.
imputer = SimpleImputer(strategy='median')
cols_to_impute = [col for col in combined.columns if 'measurement' in col or col == 'loading']
imputer.fit(combined.loc[is_train_row, cols_to_impute])
combined[cols_to_impute] = imputer.transform(combined[cols_to_impute])

# Split back into processed train and test sets for Task 1.
# Explicit copies keep the two frames independent of `combined`, so new
# columns can be added to them without touching the stacked frame.
# `failure` is dropped from the test frame because it is only filled for
# training rows (the concat leaves it as NaN for test rows).
train_p = combined[combined['is_test_set'] == 0].drop(columns=['is_test_set']).copy()
test_p = combined[combined['is_test_set'] == 1].drop(columns=['is_test_set', 'failure']).copy()

print("Data preprocessing completed.")

Data preprocessing completed.


In [None]:
# Quick sanity check of the processed training frame: announce, then render
# the first five rows with the notebook's rich display.
train_preview_msg = "Displaying the first 5 rows of the processed training data:"
print(train_preview_msg)
display(train_p.head(5))

Displaying the first 5 rows of the processed training data:


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,0,80.1,1,3,9,5,7.0,8.0,4.0,...,10.672,15.859,17.594,15.193,15.029,16.081,13.034,14.684,764.1,0.0
1,1,0,84.89,1,3,9,5,14.0,3.0,3.0,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0.0
2,2,0,82.43,1,3,9,5,12.0,1.0,5.0,...,12.715,15.607,19.053,13.798,16.711,18.631,14.094,17.946,663.376,0.0
3,3,0,101.07,1,3,9,5,13.0,2.0,6.0,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0.0
4,4,0,188.06,1,3,9,5,9.0,2.0,8.0,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0.0


In [None]:
# Feature selection: every processed column except the row identifier,
# the target, and the product code.
excluded_cols = {'id', 'failure', 'product_code'}
features = [name for name in train_p.columns if name not in excluded_cols]
y = train_p['failure']
X = train_p[features]

# Standardise the features; the scaler is fitted on the training data and
# reused later to transform the test data.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Fit the failure classifier on the scaled training features.
model_failure = LogisticRegression(max_iter=1000, random_state=42).fit(X_scaled, y)

print("Task 1: Model training completed.")

Task 1: Model training completed.


In [None]:
# Apply the scaler fitted on the training data to the test features.
test_features = test_p[features]
X_test_scaled = scaler.transform(test_features)

# Column 1 of predict_proba is stored as the failure probability.
class_probabilities = model_failure.predict_proba(X_test_scaled)
test_p['failure_probability'] = class_probabilities[:, 1]

print("Displaying failure predictions for the first 5 test IDs:")
prediction_cols = ['id', 'failure_probability']
display(test_p[prediction_cols].head(5))

Displaying failure predictions for the first 5 test IDs:


Unnamed: 0,id,failure_probability
26570,26570,0.198649
26571,26571,0.148467
26572,26572,0.174913
26573,26573,0.177457
26574,26574,0.327075


## TASK 2: Data Drift Evaluation:

In [None]:
# Show the untouched test data (before any encoding or imputation) as the
# starting point for the drift analysis.
test_preview_msg = "Displaying the first 5 rows of the original test data:"
print(test_preview_msg)
display(test.head(5))

Displaying the first 5 rows of the original test data:


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,26570,F,119.57,material_5,material_6,6,4,6,9,6,...,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
1,26571,F,113.51,material_5,material_6,6,4,11,8,0,...,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037
2,26572,F,112.16,material_5,material_6,6,4,8,12,4,...,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
3,26573,F,112.72,material_5,material_6,6,4,8,11,10,...,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
4,26574,F,208.0,material_5,material_6,6,4,14,16,8,...,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


In [None]:
# Build the drift-analysis frame from the raw data.
# Identifiers (and the training target) are removed, and each row is tagged
# with its origin: is_test = 0 for train rows, 1 for test rows.
df_train_drift = train.drop(columns=['failure', 'id']).assign(is_test=0)
df_test_drift = test.drop(columns=['id']).assign(is_test=1)

# Stack both sets under a fresh 0..n-1 index.
combined_drift = pd.concat([df_train_drift, df_test_drift], axis=0, ignore_index=True)

# Minimal encoding for the drift model: a fresh label encoder per column.
drift_categorical_cols = ['product_code', 'attribute_0', 'attribute_1']
for col in drift_categorical_cols:
    drift_encoder = LabelEncoder()
    combined_drift[col] = drift_encoder.fit_transform(combined_drift[col].astype(str))

# Fill any remaining NaNs with the per-column median of the stacked frame.
drift_medians = combined_drift.median()
combined_drift = combined_drift.fillna(drift_medians)

print("Data preprocessing for drift test completed.")

Data preprocessing for drift test completed.


In [None]:
# The drift model's target is the origin flag; everything else is a feature.
y_drift = combined_drift['is_test']
X_drift = combined_drift.drop(columns=['is_test'])

# Hold out 20% of the rows to measure how well origin can be predicted.
X_train_d, X_val_d, y_train_d, y_val_d = train_test_split(
    X_drift,
    y_drift,
    test_size=0.2,
    random_state=42,
)

# Train a classifier to distinguish train vs test rows.
drift_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
drift_model.fit(X_train_d, y_train_d)

# Score the hold-out rows: the higher the ROC-AUC, the easier it is to tell
# the two datasets apart.
val_probabilities = drift_model.predict_proba(X_val_d)
drift_probs = val_probabilities[:, 1]
auc_score = roc_auc_score(y_val_d, drift_probs)

print(f"Drift Model ROC-AUC Score: {auc_score:.4f}")

# An AUC above 0.7 is reported as significant drift.
drift_detected = auc_score > 0.7
drift_message = (
    "RESULT: Significant data drift detected. The model can easily distinguish between train and test data."
    if drift_detected
    else "RESULT: No significant data drift detected. The datasets are homogeneous."
)
print(drift_message)

Drift Model ROC-AUC Score: 1.0000
RESULT: Significant data drift detected. The model can easily distinguish between train and test data.


## Conclusión:
Al ejecutar el modelo de detección de drift, observamos un ROC-AUC de 1.0000. Este resultado, aunque perfecto matemáticamente, revela un problema de homogeneidad: el modelo puede distinguir ambos conjuntos con total certeza.

La razón principal es el campo `product_code`: el dataset utiliza códigos de producto distintos para entrenamiento (p. ej., A, B, C) y para prueba (p. ej., F, G, H). Al ser categorías mutuamente excluyentes, el modelo simplemente memoriza qué código pertenece a qué grupo. Para un análisis de drift más útil sobre las mediciones físicas, podríamos repetir este test eliminando la columna `product_code`; conviene comprobar antes si las columnas `attribute_*` son constantes dentro de cada producto, porque en ese caso filtrarían la misma información y también habría que excluirlas.