In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('data/train.csv')

# Specify the target variables
target_vars = ['K_Scatch', 'Stains', 'Z_Scratch', 'Pastry', 'Dirtiness', 'Bumps', 'Other_Faults']


In [20]:
# Feature Engineering

# 1. Combining X_Minimum and X_Maximum

data['X_Range'] = data['X_Maximum'] - data['X_Minimum']

data['X_Midpoint'] = (data['X_Maximum'] + data['X_Minimum']) / 2

# 2. Combining Y_Minimum and Y_Maximum

data['Y_Range'] = data['Y_Maximum'] - data['Y_Minimum']

data['Y_Midpoint'] = (data['Y_Maximum'] + data['Y_Minimum']) / 2

# 3. Combining Minimum_of_Luminosity and Maximum_of_Luminosity

data['Luminosity_Range'] = data['Maximum_of_Luminosity'] - data['Minimum_of_Luminosity']

data['Luminosity_Average'] = (data['Maximum_of_Luminosity'] + data['Minimum_of_Luminosity']) / 2

# 4. Combining X_Perimeter and Y_Perimeter

data['Total_Perimeter'] = data['X_Perimeter'] + data['Y_Perimeter']

data['Perimeter_Ratio'] = data['X_Perimeter'] / data['Y_Perimeter']

# 5. Interaction terms

data['Outside_X_Index_Log_X_Index'] = data['Outside_X_Index'] * data['Log_X_Index']

# 6. Ratio features

data['Pixels_Areas_Sum_of_Luminosity_Ratio'] = data['Pixels_Areas'] / data['Sum_of_Luminosity']

# 7. Logarithmic transformations

data['Log_Pixels_Areas'] = np.log1p(data['Pixels_Areas'])

data['Log_Sum_of_Luminosity'] = np.log1p(data['Sum_of_Luminosity'])

# 8. Categorical feature encoding

#data = pd.get_dummies(data, columns=['TypeOfSteel_A300', 'TypeOfSteel_A400'])

# 9. Binning or discretization

data['Steel_Plate_Thickness_Bin'] = pd.cut(data['Steel_Plate_Thickness'], bins=[0, 50, 100, float('inf')], labels=['Low', 'Medium', 'High'])

# Label encoding for 'Steel_Plate_Thickness_Bin'

label_encoder = LabelEncoder()

data['Steel_Plate_Thickness_Bin'] = label_encoder.fit_transform(data['Steel_Plate_Thickness_Bin'])

columns_to_encode = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

existing_columns = [col for col in columns_to_encode if col in data.columns]

if existing_columns:

    data = pd.get_dummies(data, columns=existing_columns)

# Separate the features and target variables

features = data.drop(columns=target_vars)

target = data[target_vars]

# Define the top features for each target variable based on feature importance

top_features = {

    'K_Scatch': ['Outside_X_Index', 'X_Range', 'Log_X_Index', 'Steel_Plate_Thickness', 'Outside_X_Index_Log_X_Index'],

    'Stains': ['Log_Pixels_Areas', 'LogOfAreas', 'Pixels_Areas', 'Steel_Plate_Thickness', 'SigmoidOfAreas'],

    'Z_Scratch': ['Length_of_Conveyer', 'Steel_Plate_Thickness_Bin', 'Steel_Plate_Thickness', 'TypeOfSteel_A300_0', 'TypeOfSteel_A300_1', 'Bumps', 'Pastry'],

    'Pastry': ['Length_of_Conveyer', 'Orientation_Index', 'Edges_Y_Index', 'Empty_Index', 'Pixels_Areas_Sum_of_Luminosity_Ratio', 'Bumps', 'Dirtiness', 'K_Scatch', 'Z_Scratch', 'Other_Faults'],

    'Dirtiness': ['Orientation_Index', 'Edges_Index', 'Steel_Plate_Thickness', 'Luminosity_Index', 'Length_of_Conveyer', 'Bumps', 'Z_Scratch', 'Pastry', 'K_Scatch', 'Other_Faults'],

    'Bumps': ['Luminosity_Index', 'Empty_Index', 'Pixels_Areas_Sum_of_Luminosity_Ratio', 'K_Scatch', 'Z_Scratch', 'Pastry', 'Dirtiness', 'Stains', 'Other_Faults'],

    'Other_Faults': ['Empty_Index', 'Pixels_Areas_Sum_of_Luminosity_Ratio', 'Edges_Index', 'K_Scatch', 'Bumps', 'Z_Scratch', 'Pastry', 'Stains', 'Dirtiness']

}

# Combine all the top features into a single list, excluding target variables
all_top_features = list(set(feature for feature_list in top_features.values() for feature in feature_list if feature in features.columns))
# Select the top features from the dataset
selected_features = features[all_top_features]

# Perform feature scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(selected_features)
scaled_features_df = pd.DataFrame(scaled_features, columns=selected_features.columns)

In [21]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(scaled_features_df, target, test_size=0.2, random_state=42)

# Create an XGBoost classifier
xgb_classifier = XGBClassifier(n_estimators=100, objective='binary:logistic', eval_metric='auc', tree_method='gpu_hist', random_state=42, device="cuda")

# Train the XGBoost classifier
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_proba = xgb_classifier.predict_proba(X_test)

# Evaluate the model's performance using average AUC
auc_scores = []
for i, target_var in enumerate(target_vars):
    auc = roc_auc_score(y_test[target_var], y_pred_proba[:, i])
    auc_scores.append(auc)
    print(f"AUC for {target_var}: {auc:.4f}")

print(f"\nAverage AUC: {np.mean(auc_scores):.4f}")


    E.g. tree_method = "hist", device = "cuda"



AUC for K_Scatch: 0.9852
AUC for Stains: 0.9900
AUC for Z_Scratch: 0.9532
AUC for Pastry: 0.8499
AUC for Dirtiness: 0.8875
AUC for Bumps: 0.7869
AUC for Other_Faults: 0.6732

Average AUC: 0.8751



    E.g. tree_method = "hist", device = "cuda"



In [23]:
# Load the test dataset
test_data = pd.read_csv('data/test.csv')

# 1. Combining X_Minimum and X_Maximum
test_data['X_Range'] = test_data['X_Maximum'] - test_data['X_Minimum']
test_data['X_Midpoint'] = (test_data['X_Maximum'] + test_data['X_Minimum']) / 2

# 2. Combining Y_Minimum and Y_Maximum
test_data['Y_Range'] = test_data['Y_Maximum'] - test_data['Y_Minimum']
test_data['Y_Midpoint'] = (test_data['Y_Maximum'] + test_data['Y_Minimum']) / 2

# 3. Combining Minimum_of_Luminosity and Maximum_of_Luminosity
test_data['Luminosity_Range'] = test_data['Maximum_of_Luminosity'] - test_data['Minimum_of_Luminosity']
test_data['Luminosity_Average'] = (test_data['Maximum_of_Luminosity'] + test_data['Minimum_of_Luminosity']) / 2

# 4. Combining X_Perimeter and Y_Perimeter
test_data['Total_Perimeter'] = test_data['X_Perimeter'] + test_data['Y_Perimeter']
test_data['Perimeter_Ratio'] = test_data['X_Perimeter'] / test_data['Y_Perimeter']

# 5. Interaction terms
test_data['Outside_X_Index_Log_X_Index'] = test_data['Outside_X_Index'] * test_data['Log_X_Index']

# 6. Ratio features
test_data['Pixels_Areas_Sum_of_Luminosity_Ratio'] = test_data['Pixels_Areas'] / test_data['Sum_of_Luminosity']

# 7. Logarithmic transformations
test_data['Log_Pixels_Areas'] = np.log1p(test_data['Pixels_Areas'])
test_data['Log_Sum_of_Luminosity'] = np.log1p(test_data['Sum_of_Luminosity'])

# 8. Categorical feature encoding
if 'TypeOfSteel_A300' in test_data.columns and 'TypeOfSteel_A400' in test_data.columns:
    test_data = pd.get_dummies(test_data, columns=['TypeOfSteel_A300', 'TypeOfSteel_A400'])

# 9. Binning or discretization
test_data['Steel_Plate_Thickness_Bin'] = pd.cut(test_data['Steel_Plate_Thickness'], bins=[0, 50, 100, float('inf')], labels=['Low', 'Medium', 'High'])

# Label encoding for 'Steel_Plate_Thickness_Bin'
label_encoder = LabelEncoder()
test_data['Steel_Plate_Thickness_Bin'] = label_encoder.fit_transform(test_data['Steel_Plate_Thickness_Bin'])

# Select only the features used in training
test_features = test_data[all_top_features]

# Perform feature scaling on the test data
scaled_test_features = scaler.transform(test_features)
scaled_test_features_df = pd.DataFrame(scaled_test_features, columns=test_features.columns)

# Make predictions on the test data
test_predictions = xgb_classifier.predict_proba(scaled_test_features_df)
test_predictions_df = pd.DataFrame(test_predictions, columns=target_vars)

# Combine the test predictions with the 'id' column
output_data = pd.concat([test_data['id'], test_predictions_df], axis=1)

# Save the output data to 'output.csv'
output_data.to_csv('output.csv', index=False)

NameError: name 'rf_models' is not defined