In [39]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier



In [40]:
data = pd.read_csv('data/train.csv')
pre_predictions = pd.read_csv('output.csv')

target_vars = ['K_Scatch', 'Stains', 'Z_Scratch', 'Pastry', 'Dirtiness', 'Bumps', 'Other_Faults']


In [41]:
def gen_features(data):
    # Combining X_Minimum and X_Maximum
    data['X_Range'] = data['X_Maximum'] - data['X_Minimum']
    data['X_Midpoint'] = (data['X_Maximum'] + data['X_Minimum']) / 2

    # Combining Y_Minimum and Y_Maximum
    data['Y_Range'] = data['Y_Maximum'] - data['Y_Minimum']
    data['Y_Midpoint'] = (data['Y_Maximum'] + data['Y_Minimum']) / 2

    # Combining Minimum_of_Luminosity and Maximum_of_Luminosity
    data['Luminosity_Range'] = data['Maximum_of_Luminosity'] - data['Minimum_of_Luminosity']
    data['Luminosity_Average'] = (data['Maximum_of_Luminosity'] + data['Minimum_of_Luminosity']) / 2

    # Combining X_Perimeter and Y_Perimeter
    data['Total_Perimeter'] = data['X_Perimeter'] + data['Y_Perimeter']
    data['Perimeter_Ratio'] = data['X_Perimeter'] / data['Y_Perimeter']

    # Interaction terms
    data['Outside_X_Index_Log_X_Index'] = data['Outside_X_Index'] * data['Log_X_Index']

    # Ratio features
    data['Pixels_Areas_Sum_of_Luminosity_Ratio'] = data['Pixels_Areas'] / data['Sum_of_Luminosity']

    # Logarithmic transformations
    data['Log_Pixels_Areas'] = np.log1p(data['Pixels_Areas'])
    data['Log_Sum_of_Luminosity'] = np.log1p(data['Sum_of_Luminosity'])

    # Categorical feature encoding
    if 'TypeOfSteel_A300' in data.columns and 'TypeOfSteel_A400' in data.columns:
        data = pd.get_dummies(data, columns=['TypeOfSteel_A300', 'TypeOfSteel_A400'])

    # Binning or discretization
    data['Steel_Plate_Thickness_Bin'] = pd.cut(data['Steel_Plate_Thickness'], bins=[0, 50, 100, float('inf')], labels=['Low', 'Medium', 'High'])

    # Label encoding for 'Steel_Plate_Thickness_Bin'
    label_encoder = LabelEncoder()
    data['Steel_Plate_Thickness_Bin'] = label_encoder.fit_transform(data['Steel_Plate_Thickness_Bin'])

    return data

data = gen_features(data)

columns_to_encode = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

existing_columns = [col for col in columns_to_encode if col in data.columns]

if existing_columns:

    data = pd.get_dummies(data, columns=existing_columns)

# Separate the features and target variables

features = data.drop(columns=target_vars)

target = data[target_vars]

# Define the top features for each target variable based on feature importance

top_features = {

    'K_Scatch': ['Outside_X_Index', 'X_Range', 'Log_X_Index', 'Steel_Plate_Thickness', 'Outside_X_Index_Log_X_Index'],

    'Stains': ['Log_Pixels_Areas', 'LogOfAreas', 'Pixels_Areas', 'Steel_Plate_Thickness', 'SigmoidOfAreas'],

    'Z_Scratch': ['Length_of_Conveyer', 'Steel_Plate_Thickness_Bin', 'Steel_Plate_Thickness', 'TypeOfSteel_A300_0', 'TypeOfSteel_A300_1', 'Bumps', 'Pastry'],

    'Pastry': ['Length_of_Conveyer', 'Orientation_Index', 'Edges_Y_Index', 'Empty_Index', 'Pixels_Areas_Sum_of_Luminosity_Ratio', 'Bumps', 'Dirtiness', 'K_Scatch', 'Z_Scratch', 'Other_Faults'],

    'Dirtiness': ['Orientation_Index', 'Edges_Index', 'Steel_Plate_Thickness', 'Luminosity_Index', 'Length_of_Conveyer', 'Bumps', 'Z_Scratch', 'Pastry', 'K_Scatch', 'Other_Faults'],

    'Bumps': ['Luminosity_Index', 'Empty_Index', 'Pixels_Areas_Sum_of_Luminosity_Ratio', 'K_Scatch', 'Z_Scratch', 'Pastry', 'Dirtiness', 'Stains', 'Other_Faults'],

    'Other_Faults': ['Empty_Index', 'Pixels_Areas_Sum_of_Luminosity_Ratio', 'Edges_Index', 'K_Scatch', 'Bumps', 'Z_Scratch', 'Pastry', 'Stains', 'Dirtiness']

}

# Combine all the top features into a single list, excluding target variables
all_top_features = list(set(feature for feature_list in top_features.values() for feature in feature_list if feature in features.columns))
# Select the top features from the dataset
selected_features = features[all_top_features]

# Perform feature scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(selected_features)
scaled_features_df = pd.DataFrame(scaled_features, columns=selected_features.columns)

In [42]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(scaled_features_df, target, test_size=0.2, random_state=42)

# Create an XGBoost classifier
xgb_classifier = XGBClassifier(n_estimators=100, objective='binary:logistic', eval_metric='auc', tree_method='gpu_hist', random_state=42, device="cuda")

# Train the XGBoost classifier
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_proba = xgb_classifier.predict_proba(X_test)

# Evaluate the model's performance using average AUC
auc_scores = []
for i, target_var in enumerate(target_vars):
    auc = roc_auc_score(y_test[target_var], y_pred_proba[:, i])
    auc_scores.append(auc)
    print(f"AUC for {target_var}: {auc:.4f}")

print(f"\nAverage AUC: {np.mean(auc_scores):.4f}")


    E.g. tree_method = "hist", device = "cuda"



AUC for K_Scatch: 0.9856
AUC for Stains: 0.9893
AUC for Z_Scratch: 0.9518
AUC for Pastry: 0.8500
AUC for Dirtiness: 0.8848
AUC for Bumps: 0.7886
AUC for Other_Faults: 0.6674

Average AUC: 0.8739



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [43]:
# Separate the features and target variables
#X = data.drop(['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults'], axis=1)
#y_other_faults = data['Other_Faults']

In [44]:
# Create an Isolation Forest model for anomaly detection
iso_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

# Fit the Isolation Forest model on the test data
iso_forest.fit(X_test)

# Predict anomalies on the test data
anomaly_scores = iso_forest.decision_function(X_test)
anomaly_predictions = iso_forest.predict(X_test)

# Calculate the AUC score for the anomaly detection model
if 'Other_Faults' in target_vars:
    auc_other_faults = roc_auc_score(y_test['Other_Faults'], anomaly_scores)
    print(f"AUC for Other_Faults (Anomaly Detection): {auc_other_faults:.4f}")

# Combine the anomaly detection predictions with your XGBoost model's predictions
combined_predictions = pd.DataFrame(y_pred_proba, columns=target_vars)
combined_predictions['Other_Faults'] = anomaly_predictions

# Evaluate the combined predictions
auc_scores = []
for target_var in target_vars:
    if target_var in y_test.columns:
        auc = roc_auc_score(y_test[target_var], combined_predictions[target_var])
        auc_scores.append(auc)
        print(f"AUC for {target_var}: {auc:.4f}")

average_auc = sum(auc_scores) / len(auc_scores)
print(f"Average AUC: {average_auc:.4f}")

AUC for Other_Faults (Anomaly Detection): 0.4931


ValueError: Length of values (19219) does not match length of index (12814)