In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Load the dataset
data = pd.read_csv('data/train.csv')

In [2]:
# Specify the target variables in the refined order
target_vars = ['K_Scatch', 'Stains', 'Other_Faults', 'Bumps', 'Z_Scratch', 'Pastry', 'Dirtiness']

# Separate the features and target variables
features = data.drop(columns=target_vars)
target = data[target_vars]

# Define the top features for each target variable based on feature importance
top_features = {
    'K_Scatch': ['Outside_X_Index', 'Log_X_Index', 'Other_Faults', 'Steel_Plate_Thickness', 'Bumps'],
    'Stains': ['Pixels_Areas', 'Other_Faults', 'LogOfAreas', 'Steel_Plate_Thickness', 'Bumps'],
    'Other_Faults': ['K_Scatch', 'Bumps', 'Z_Scratch', 'Pastry', 'Stains'],
    'Bumps': ['K_Scatch', 'Other_Faults', 'Z_Scratch', 'Pastry', 'Dirtiness'],
    'Z_Scratch': ['Bumps', 'Other_Faults', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness'],
    'Pastry': ['Length_of_Conveyer', 'Bumps', 'Other_Faults', 'Dirtiness', 'Orientation_Index'],
    'Dirtiness': ['Bumps', 'Z_Scratch', 'Pastry', 'Orientation_Index', 'K_Scatch']
}


In [3]:
# Perform feature scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
scaled_features_df = pd.DataFrame(scaled_features, columns=features.columns)


In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(scaled_features_df, target, test_size=0.2, random_state=42)

# Train models sequentially based on the refined order
rf_models = {}
train_predictions = pd.DataFrame()
test_predictions = pd.DataFrame()

for target_var in target_vars:
    # Select the top features for the current target variable
    selected_features = top_features[target_var]
    
    # Add previous target variable predictions as features
    train_features = X_train.copy()
    test_features = X_test.copy()
    
    for feature in selected_features:
        if feature in train_predictions.columns:
            train_features[feature] = train_predictions[feature]
            test_features[feature] = test_predictions[feature]
    
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(train_features, y_train[target_var])
    rf_models[target_var] = rf_model
    
    # Make predictions and add them to the predictions DataFrames
    train_predictions[target_var] = rf_model.predict(train_features)
    test_predictions[target_var] = rf_model.predict(test_features)

In [8]:
# Evaluate the model's performance using average AUC
auc_scores = []
for target_var in target_vars:
    auc = roc_auc_score(y_test[target_var], test_predictions[target_var])
    auc_scores.append(auc)
    print(f"AUC for {target_var}: {auc:.4f}")

print(f"\nAverage AUC: {np.mean(auc_scores):.4f}")

AUC for K_Scatch: 0.9825
AUC for Stains: 0.9775
AUC for Other_Faults: 0.6786
AUC for Bumps: 0.7877
AUC for Z_Scratch: 0.9303
AUC for Pastry: 0.8446
AUC for Dirtiness: 0.8278

Average AUC: 0.8613


In [9]:
# Load the test dataset
test_data = pd.read_csv('data/test.csv')

# Select the same features as in the training data
test_features = test_data[features.columns]

# Perform feature scaling on the test data
scaled_test_features = scaler.transform(test_features)
scaled_test_features_df = pd.DataFrame(scaled_test_features, columns=test_features.columns)

# Make predictions on the test data
test_predictions = pd.DataFrame()

for target_var in target_vars:
    # Select the top features for the current target variable
    selected_features = top_features[target_var]
    
    # Add previous target variable predictions as features
    test_features = scaled_test_features_df.copy()
    
    for feature in selected_features:
        if feature in test_predictions.columns:
            test_features[feature] = test_predictions[feature]
    
    test_predictions[target_var] = rf_models[target_var].predict(test_features)

# Combine the test predictions with the 'id' column
output_data = pd.concat([test_data['id'], test_predictions], axis=1)

# Save the output data to 'output.csv'
output_data.to_csv('output.csv', index=False)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- id
