In [1]:
# Import necessary libraries for data manipulation and visualization
import pandas as pd  # For data manipulation using DataFrames
import numpy as np  # For numerical computations
import matplotlib.pyplot as plt  # For plotting graphs and charts
import plotly.express as px  # For interactive visualizations
import seaborn as sns  # For statistical data visualization

# Import necessary libraries for machine learning tasks
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV  # For data splitting and hyperparameter tuning
from sklearn.preprocessing import StandardScaler  # For feature scaling
from xgboost import XGBClassifier  # For XGBoost classifier
from sklearn.model_selection import cross_val_score  # For cross-validation
from sklearn.preprocessing import LabelEncoder  # For label encoding
from sklearn.multioutput import MultiOutputClassifier  # For multi-output classification
from sklearn.metrics import accuracy_score, roc_auc_score  # For model evaluation metrics

# Suppress every Python warning so warning text does not clutter cell output.
# NOTE: the 'ignore' filter is global, so deprecation warnings raised by the
# libraries imported above are hidden as well.
import warnings
warnings.filterwarnings('ignore')


In [2]:

# Load both competition splits in one pass: the first path yields the
# training frame, the second yields the test frame.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e3/train.csv',
        '/kaggle/input/playground-series-s4e3/test.csv',
    )
)


In [5]:
# Build the boolean "is missing" mask for train_data and count its True cells
# across the whole frame in a single reduction
missing_values_count = train_data.isna().to_numpy().sum()

# Report the grand total of missing cells
print('Total missing values in train_data:', missing_values_count)


Total missing values in train_data: 0


In [6]:
# Build the boolean "is missing" mask for test_data and count its True cells
# across the whole frame in a single reduction
missing_values_count = test_data.isna().to_numpy().sum()

# Report the grand total of missing cells
print('Total missing values in test_data:', missing_values_count)


Total missing values in test_data: 0


In [7]:
# Display a concise summary of train_data: column names, non-null counts and dtypes
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19219 entries, 0 to 19218
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     19219 non-null  int64  
 1   X_Minimum              19219 non-null  int64  
 2   X_Maximum              19219 non-null  int64  
 3   Y_Minimum              19219 non-null  int64  
 4   Y_Maximum              19219 non-null  int64  
 5   Pixels_Areas           19219 non-null  int64  
 6   X_Perimeter            19219 non-null  int64  
 7   Y_Perimeter            19219 non-null  int64  
 8   Sum_of_Luminosity      19219 non-null  int64  
 9   Minimum_of_Luminosity  19219 non-null  int64  
 10  Maximum_of_Luminosity  19219 non-null  int64  
 11  Length_of_Conveyer     19219 non-null  int64  
 12  TypeOfSteel_A300       19219 non-null  int64  
 13  TypeOfSteel_A400       19219 non-null  int64  
 14  Steel_Plate_Thickness  19219 non-null  int64  
 15  Ed

In [8]:
# Display a concise summary of test_data: column names, non-null counts and dtypes
test_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12814 entries, 0 to 12813
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     12814 non-null  int64  
 1   X_Minimum              12814 non-null  int64  
 2   X_Maximum              12814 non-null  int64  
 3   Y_Minimum              12814 non-null  int64  
 4   Y_Maximum              12814 non-null  int64  
 5   Pixels_Areas           12814 non-null  int64  
 6   X_Perimeter            12814 non-null  int64  
 7   Y_Perimeter            12814 non-null  int64  
 8   Sum_of_Luminosity      12814 non-null  int64  
 9   Minimum_of_Luminosity  12814 non-null  int64  
 10  Maximum_of_Luminosity  12814 non-null  int64  
 11  Length_of_Conveyer     12814 non-null  int64  
 12  TypeOfSteel_A300       12814 non-null  int64  
 13  TypeOfSteel_A400       12814 non-null  int64  
 14  Steel_Plate_Thickness  12814 non-null  int64  
 15  Ed

In [10]:
# Flag every row of train_data that repeats an earlier row (the first
# occurrence is not flagged), then total the flags
duplicate_count = train_data.duplicated(keep='first').to_numpy().sum()

# Report how many repeated rows were found
print('Number of duplicate rows in train_data:', duplicate_count)


Number of duplicate rows in train_data: 0


In [11]:
# Flag every row of test_data that repeats an earlier row (the first
# occurrence is not flagged), then total the flags
duplicate_count = test_data.duplicated(keep='first').to_numpy().sum()

# Report how many repeated rows were found
print('Number of duplicate rows in test_data:', duplicate_count)


Number of duplicate rows in test_data: 0


In [12]:
# Separate features (X) from the target columns (y).
# The target names are listed once so the drop list and the selection cannot
# drift apart. 'K_Scatch' is spelled exactly as the notebook originally
# referenced it -- do not "correct" it without checking the CSV header.
TARGET_COLUMNS = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
X = train_data.drop(['id'] + TARGET_COLUMNS, axis=1)
y = train_data[TARGET_COLUMNS]

# Split BEFORE scaling: the hold-out rows must not contribute to the scaler's
# mean/variance, otherwise hold-out metrics are computed on leaked statistics.
# Same test_size/random_state as before, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to the hold-out split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any cell that still reads X_scaled;
# it is now scaled with training-split statistics.
X_scaled = scaler.transform(X)


In [14]:
# MultiOutputClassifier, XGBClassifier and RandomizedSearchCV are already
# imported in the notebook's first cell, so they are not re-imported here.

# Base model: an XGBClassifier wrapped in MultiOutputClassifier so that one
# fit call covers every target column in y_train. The deprecated
# `use_label_encoder` flag is no longer passed (see the XGBoost API reference).
model = MultiOutputClassifier(XGBClassifier(random_state=42, eval_metric='logloss'))

# Candidate values sampled by RandomizedSearchCV. The 'estimator__' prefix
# routes each setting to the wrapped XGBClassifier.
param_grid = {
    'estimator__max_depth': [3, 5, 6, 7],  # Maximum depth of each tree
    'estimator__learning_rate': [0.01, 0.1, 0.2, 0.3],  # Learning rate for boosting
    'estimator__n_estimators': [100, 300, 500],  # Number of boosting rounds
    'estimator__min_child_weight': [1, 3, 5],  # Minimum sum of instance weight needed in a child
    'estimator__gamma': [0, 0.01, 0.1, 1],  # Minimum loss reduction required to make a further partition
    'estimator__subsample': [0.5, 0.6, 0.7],  # Subsample ratio of the training instances
    'estimator__colsample_bytree': [0.5, 0.7, 0.9],  # Subsample ratio of columns when constructing each tree
}

# Sample 10 parameter combinations and score each with 3-fold cross-validated
# ROC AUC; the fit call below is where training actually happens.
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=10, scoring='roc_auc',
                                   n_jobs=-1, cv=3, verbose=3, random_state=42)
random_search.fit(X_train, y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [16]:
# Print the best parameters found during RandomizedSearchCV
print('The best parameters are:', random_search.best_params_)

# Use the best estimator from RandomizedSearchCV
best_model = random_search.best_estimator_

# Predict probabilities for the hold-out split. predict_proba returns one
# array per target column; column 1 of each array is taken as the
# probability of the positive label (same indexing as the submission cell).
y_pred_proba = best_model.predict_proba(X_test)
holdout_scores = np.column_stack([target_proba[:, 1] for target_proba in y_pred_proba])

# Score the hold-out split so the tuned model is actually evaluated before
# it is used for the submission (previously y_pred_proba was never used).
holdout_auc = roc_auc_score(y_test, holdout_scores)
print('Hold-out ROC AUC (macro average over targets):', holdout_auc)

# Scale the competition test features with the already-fitted scaler and
# predict per-target probabilities for the submission
test_scaled = scaler.transform(test_data.drop(['id'], axis=1))
predictions = best_model.predict_proba(test_scaled)


The best parameters are: {'estimator__subsample': 0.6, 'estimator__n_estimators': 500, 'estimator__min_child_weight': 1, 'estimator__max_depth': 5, 'estimator__learning_rate': 0.01, 'estimator__gamma': 0.01, 'estimator__colsample_bytree': 0.5}


In [18]:
# One column per target: pair each target name with its probability array and
# keep column 1 of that array
probs = pd.DataFrame(
    {
        target_name: target_proba[:, 1]
        for target_name, target_proba in zip(y.columns, predictions)
    }
)

# Attach the probabilities to the 'id' column of test_data (index-aligned join)
id_frame = test_data[['id']]
submission = id_frame.join(probs)

# Write the submission file; the DataFrame index is not written
submission.to_csv('submission_exp.csv', index=False)
