In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm



In [6]:
# Names of the seven binary fault labels that the model predicts
# (spelled exactly as the columns appear in the CSV).
target_vars = [
    'K_Scatch',
    'Stains',
    'Z_Scratch',
    'Pastry',
    'Dirtiness',
    'Bumps',
    'Other_Faults',
]

# Training table (features + label columns) and a previously written predictions file.
# NOTE(review): `pre_predictions` is not used in the cells visible here — confirm it is still needed.
data = pd.read_csv('data/train.csv')
pre_predictions = pd.read_csv('output.csv')


In [7]:
def gen_features(data):
    """Return a copy of ``data`` with engineered feature columns added.

    The input frame is left untouched; all new columns are written to a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``X_Minimum``, ``X_Maximum``, ``Y_Minimum``,
        ``Y_Maximum``, ``Minimum_of_Luminosity``, ``Maximum_of_Luminosity``,
        ``X_Perimeter``, ``Y_Perimeter``, ``Outside_X_Index``, ``Log_X_Index``,
        ``Pixels_Areas``, ``Sum_of_Luminosity`` and ``Steel_Plate_Thickness``.
        ``TypeOfSteel_A300`` / ``TypeOfSteel_A400`` are optional.

    Returns
    -------
    pandas.DataFrame
        A new frame with range/midpoint, ratio, interaction and log features,
        one-hot encoded steel-type columns (only when both steel-type columns
        are present) and an ordinal ``Steel_Plate_Thickness_Bin`` column.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Work on a copy so the caller's frame is never modified as a side effect
    data = data.copy()

    # Combining X_Minimum and X_Maximum
    data['X_Range'] = data['X_Maximum'] - data['X_Minimum']
    data['X_Midpoint'] = (data['X_Maximum'] + data['X_Minimum']) / 2

    # Combining Y_Minimum and Y_Maximum
    data['Y_Range'] = data['Y_Maximum'] - data['Y_Minimum']
    data['Y_Midpoint'] = (data['Y_Maximum'] + data['Y_Minimum']) / 2

    # Combining Minimum_of_Luminosity and Maximum_of_Luminosity
    data['Luminosity_Range'] = data['Maximum_of_Luminosity'] - data['Minimum_of_Luminosity']
    data['Luminosity_Average'] = (data['Maximum_of_Luminosity'] + data['Minimum_of_Luminosity']) / 2

    # Combining X_Perimeter and Y_Perimeter
    # NOTE: a zero Y_Perimeter yields inf/NaN here; the values are passed through unchanged
    data['Total_Perimeter'] = data['X_Perimeter'] + data['Y_Perimeter']
    data['Perimeter_Ratio'] = data['X_Perimeter'] / data['Y_Perimeter']

    # Interaction terms
    data['Outside_X_Index_Log_X_Index'] = data['Outside_X_Index'] * data['Log_X_Index']

    # Ratio features (same caveat as above for a zero Sum_of_Luminosity)
    data['Pixels_Areas_Sum_of_Luminosity_Ratio'] = data['Pixels_Areas'] / data['Sum_of_Luminosity']

    # Logarithmic transformations
    data['Log_Pixels_Areas'] = np.log1p(data['Pixels_Areas'])
    data['Log_Sum_of_Luminosity'] = np.log1p(data['Sum_of_Luminosity'])

    # Categorical feature encoding (only when both steel-type columns are present)
    if 'TypeOfSteel_A300' in data.columns and 'TypeOfSteel_A400' in data.columns:
        data = pd.get_dummies(data, columns=['TypeOfSteel_A300', 'TypeOfSteel_A400'])

    # Binning or discretization: (0, 50] -> Low, (50, 100] -> Medium, (100, inf) -> High
    thickness_bins = pd.cut(
        data['Steel_Plate_Thickness'],
        bins=[0, 50, 100, float('inf')],
        labels=['Low', 'Medium', 'High'],
    )

    # Ordinal encoding that follows the bin order: Low=0, Medium=1, High=2.
    # An alphabetical label encoding would give High=0, Low=1, Medium=2 and lose the ordering.
    # Values outside every bin (thickness <= 0 or missing) are encoded as -1.
    data['Steel_Plate_Thickness_Bin'] = thickness_bins.cat.codes

    return data

# Add the engineered columns to the training frame
data = gen_features(data)

# gen_features one-hot encodes the steel-type indicators only when both are present;
# this pass encodes whichever of them is still left in the frame.
columns_to_encode = ['TypeOfSteel_A300', 'TypeOfSteel_A400']
existing_columns = [name for name in columns_to_encode if name in data.columns]
if len(existing_columns) > 0:
    data = pd.get_dummies(data, columns=existing_columns)

# Split the frame into the label columns and everything else
target = data[target_vars]
features = data.drop(columns=target_vars)

# Define the top features for each target variable based on feature importance
# Keys are the entries of `target_vars`; each value lists the column names judged
# most useful for that target.
# NOTE: several lists name other target columns (e.g. 'Bumps', 'Pastry'). Those
# columns are dropped from `features`, so any lookup restricted to
# `features.columns` silently ignores them.
# NOTE(review): 'TypeOfSteel_A300_0' / 'TypeOfSteel_A300_1' are presumably the
# dummy columns produced by pd.get_dummies — confirm the names match its output.

top_features = {

    'K_Scatch': ['Outside_X_Index', 'X_Range', 'Log_X_Index', 'Steel_Plate_Thickness', 'Outside_X_Index_Log_X_Index'],

    'Stains': ['Log_Pixels_Areas', 'LogOfAreas', 'Pixels_Areas', 'Steel_Plate_Thickness', 'SigmoidOfAreas'],

    'Z_Scratch': ['Length_of_Conveyer', 'Steel_Plate_Thickness_Bin', 'Steel_Plate_Thickness', 'TypeOfSteel_A300_0', 'TypeOfSteel_A300_1', 'Bumps', 'Pastry'],

    'Pastry': ['Length_of_Conveyer', 'Orientation_Index', 'Edges_Y_Index', 'Empty_Index', 'Pixels_Areas_Sum_of_Luminosity_Ratio', 'Bumps', 'Dirtiness', 'K_Scatch', 'Z_Scratch', 'Other_Faults'],

    'Dirtiness': ['Orientation_Index', 'Edges_Index', 'Steel_Plate_Thickness', 'Luminosity_Index', 'Length_of_Conveyer', 'Bumps', 'Z_Scratch', 'Pastry', 'K_Scatch', 'Other_Faults'],

    'Bumps': ['Luminosity_Index', 'Empty_Index', 'Pixels_Areas_Sum_of_Luminosity_Ratio', 'K_Scatch', 'Z_Scratch', 'Pastry', 'Dirtiness', 'Stains', 'Other_Faults'],

    'Other_Faults': ['Empty_Index', 'Pixels_Areas_Sum_of_Luminosity_Ratio', 'Edges_Index', 'K_Scatch', 'Bumps', 'Z_Scratch', 'Pastry', 'Stains', 'Dirtiness']

}

# Combine all the top features into a single list, excluding target variables.
# dict.fromkeys removes duplicates while keeping first-seen order, so the column
# order (and therefore the model's input layout) is identical on every run;
# list(set(...)) ordering depends on string hashing and can change between kernels.
candidate_features = (
    feature
    for feature_list in top_features.values()
    for feature in feature_list
)
all_top_features = list(dict.fromkeys(
    feature for feature in candidate_features if feature in features.columns
))

# Select the top features from the dataset
selected_features = features[all_top_features]

# Perform feature scaling
# NOTE: the scaler is fit on every row of `selected_features`, including rows that
# may later be held out for validation.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(selected_features)
# Keep the source index so the scaled frame stays row-aligned with `target`
scaled_features_df = pd.DataFrame(
    scaled_features,
    columns=selected_features.columns,
    index=selected_features.index,
)

In [8]:
# Split the *unscaled* features first, then fit the scaler on the training rows only.
# Scaling with statistics computed over all rows would leak information from the
# held-out rows into training and make the validation AUC optimistic.
# Same test_size/random_state as before, so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    selected_features, target, test_size=0.2, random_state=42
)

# Rebind `scaler` so any later transform uses the statistics the model was trained with
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move features and labels to the training device as float32 tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

In [17]:
# Feed-forward network for multi-label classification
class MultiLabelClassifier(nn.Module):
    """Two hidden ReLU layers with dropout, followed by a sigmoid output layer.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of both hidden layers.
        num_classes: number of independent labels (output units).
        dropout_rate: dropout probability applied after each hidden layer.

    The forward pass returns one probability in (0, 1) per label, so the
    matching loss is ``nn.BCELoss``.
    """

    def __init__(self, input_size, hidden_size, num_classes, dropout_rate=0.2):
        super().__init__()
        # Trainable layers (created in this order so initialisation is unchanged)
        self.hidden1 = nn.Linear(input_size, hidden_size)
        self.hidden2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, num_classes)
        # Parameter-free modules shared by both hidden layers
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Each hidden stage is linear -> ReLU -> dropout
        for hidden_layer in (self.hidden1, self.hidden2):
            x = self.dropout(self.relu(hidden_layer(x)))
        # Per-label probabilities
        return self.sigmoid(self.output(x))

# Seed PyTorch so weight initialisation, batch shuffling and dropout are repeatable
torch.manual_seed(42)

input_size = X_train.shape[1]
hidden_size = 128
num_classes = len(target_vars)
dropout_rate = 0.2
model = MultiLabelClassifier(input_size, hidden_size, num_classes, dropout_rate).to(device)

# Define the loss function, optimizer, and learning rate scheduler.
# The model already applies a sigmoid, so plain BCELoss is the matching criterion.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# `verbose` is deprecated in recent PyTorch releases; LR reductions are printed manually below.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5)

# Train the model with early stopping
num_epochs = 50
batch_size = 32
patience = 10
best_val_auc = 0
best_state = None  # snapshot of the weights that achieved best_val_auc
counter = 0

num_train = X_train_tensor.size(0)

for epoch in range(num_epochs):
    # Re-enable dropout for training (it is switched off for validation below)
    model.train()
    permutation = torch.randperm(num_train)

    with tqdm(range(0, num_train, batch_size), unit="batch",
              desc=f"Epoch [{epoch+1}/{num_epochs}]") as progress_bar:
        for i in progress_bar:
            indices = permutation[i:i+batch_size]
            batch_x, batch_y = X_train_tensor[indices], y_train_tensor[indices]

            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            progress_bar.set_postfix(loss=loss.item())

    # Evaluate the model on the validation set.
    # eval() disables dropout; without it the validation predictions are randomly perturbed.
    model.eval()
    with torch.no_grad():
        y_val_proba = model(X_test_tensor).cpu().numpy()

    val_auc = roc_auc_score(y_test, y_val_proba, average='macro')

    # Update the learning rate based on validation AUC and report any reduction
    previous_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_auc)
    current_lr = optimizer.param_groups[0]['lr']
    if current_lr < previous_lr:
        print(f"Epoch {epoch+1}: reducing learning rate to {current_lr:.6g}")

    # Early stopping
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        # Clone the tensors: state_dict() returns references that later optimizer steps overwrite
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f"\nEarly stopping at epoch {epoch+1}")
            break

# Restore the weights from the best validation epoch before the final evaluation
if best_state is not None:
    model.load_state_dict(best_state)

# Make predictions on the test set using the best model
model.eval()
with torch.no_grad():
    y_pred_proba = model(X_test_tensor).cpu().numpy()

# Evaluate the model's performance using average AUC
auc_scores = []
for i, target_var in enumerate(target_vars):
    auc = roc_auc_score(y_test.values[:, i], y_pred_proba[:, i])
    auc_scores.append(auc)
    print(f"AUC for {target_var}: {auc:.4f}")

print(f"\nAverage AUC: {np.mean(auc_scores):.4f}")



  0%|          | 0/481 [00:00<?, ?batch/s]

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x128 and 64x32)