In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import joblib

In [2]:
# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# Split data into features and targets
X_train = train_data.drop(['id', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
                           'Dirtiness', 'Bumps', 'Other_Faults'], axis=1)
y_train = train_data[['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
                      'Dirtiness', 'Bumps', 'Other_Faults']]

In [18]:
y_train.head()

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1


In [4]:
test_ids = test_data['id']
test_features = test_data.drop('id', axis=1)

In [5]:
# Initialize classifiers
rf_classifier = RandomForestClassifier()
lgbm_classifier = LGBMClassifier()
xgb_classifier = XGBClassifier()
catboost_classifier = CatBoostClassifier()
extratrees_classifier = ExtraTreesClassifier()

In [6]:
# Initialize multi-output classifiers
rf_multi_classifier = MultiOutputClassifier(rf_classifier, 
                                            n_jobs=-1)
lgbm_multi_classifier = MultiOutputClassifier(lgbm_classifier, 
                                              n_jobs=-1)
xgb_multi_classifier = MultiOutputClassifier(xgb_classifier, 
                                             n_jobs=-1)
catboost_multi_classifier = MultiOutputClassifier(catboost_classifier, 
                                                  n_jobs=-1)
extratrees_multi_classifier = MultiOutputClassifier(extratrees_classifier, 
                                                    n_jobs=-1)

In [7]:
# Perform cross-validation to evaluate models
rf_cv_scores = cross_val_score(rf_multi_classifier, X_train, y_train, 
                               cv=5, scoring='accuracy')
lgbm_cv_scores = cross_val_score(lgbm_multi_classifier, X_train, y_train, 
                                 cv=5, scoring='accuracy')
xgb_cv_scores = cross_val_score(xgb_multi_classifier, X_train, y_train, 
                                cv=5, scoring='accuracy')
catboost_cv_scores = cross_val_score(catboost_multi_classifier, X_train, y_train, 
                                     cv=5, scoring='accuracy')
extratrees_cv_scores = cross_val_score(extratrees_multi_classifier, X_train, y_train, 
                                       cv=5, scoring='accuracy')

In [8]:
# Print mean cross-validation scores
print("Random Forest CV Accuracy:", rf_cv_scores.mean())
print("LightGBM CV Accuracy:", lgbm_cv_scores.mean())
print("XGBoost CV Accuracy:", xgb_cv_scores.mean())
print("CatBoost CV Accuracy:", catboost_cv_scores.mean())
print("ExtraTrees CV Accuracy:", extratrees_cv_scores.mean())

Random Forest CV Accuracy: 0.37754313896396086
LightGBM CV Accuracy: 0.4178680753389476
XGBoost CV Accuracy: 0.4202095489373086
CatBoost CV Accuracy: 0.4206776757773841
ExtraTrees CV Accuracy: 0.37244352543903897


In [9]:
# Choose the best model
best_model = max([
    (rf_cv_scores.mean(), 'Random Forest'),
    (lgbm_cv_scores.mean(), 'LightGBM'),
    (xgb_cv_scores.mean(), 'XGBoost'),
    (catboost_cv_scores.mean(), 'CatBoost'),
    (extratrees_cv_scores.mean(), 'ExtraTrees')
])

print("Best Model:", best_model[1])

Best Model: CatBoost


In [10]:
# Train the best model on the entire training data
best_classifier = None
if best_model[1] == 'Random Forest':
    best_classifier = rf_multi_classifier.fit(X_train, y_train)
elif best_model[1] == 'LightGBM':
    best_classifier = lgbm_multi_classifier.fit(X_train, y_train)
elif best_model[1] == 'XGBoost':
    best_classifier = xgb_multi_classifier.fit(X_train, y_train)
elif best_model[1] == 'CatBoost':
    best_classifier = catboost_multi_classifier.fit(X_train, y_train)
elif best_model[1] == 'ExtraTrees':
    best_classifier = extratrees_multi_classifier.fit(X_train, y_train)

In [11]:
# Save the best model
model = joblib.dump(best_classifier, 'best_model.joblib')

In [12]:
# Make predictions
best_test_probs = best_classifier.predict_proba(test_features)

In [19]:
best_test_probs

[array([[4.78924574e-01, 5.21075426e-01],
        [8.30218024e-01, 1.69781976e-01],
        [9.99165570e-01, 8.34430024e-04],
        ...,
        [9.99932357e-01, 6.76425352e-05],
        [7.06132621e-01, 2.93867379e-01],
        [9.99605455e-01, 3.94545104e-04]]),
 array([[9.99632538e-01, 3.67462216e-04],
        [9.89439792e-01, 1.05602085e-02],
        [9.71055532e-01, 2.89444682e-02],
        ...,
        [9.99900513e-01, 9.94873992e-05],
        [9.97153232e-01, 2.84676783e-03],
        [9.93315485e-01, 6.68451487e-03]]),
 array([[9.99339896e-01, 6.60103736e-04],
        [9.96341484e-01, 3.65851575e-03],
        [9.71230881e-01, 2.87691186e-02],
        ...,
        [6.02732545e-02, 9.39726745e-01],
        [9.90955843e-01, 9.04415683e-03],
        [1.40990871e-01, 8.59009129e-01]]),
 array([[9.99993531e-01, 6.46855211e-06],
        [9.99950413e-01, 4.95869746e-05],
        [9.99538759e-01, 4.61241225e-04],
        ...,
        [9.99995099e-01, 4.90075095e-06],
        [9.9999553

In [13]:
# Generate submission file
submission_df = pd.DataFrame({'id': test_ids})
# Iterate over each target
for i, target in enumerate(y_train.columns):
    # Fetch the probability of the positive class for each target
    # Store the positive probabilities in the respective target column
    submission_df[target] = best_test_probs[i][:, 1]  # Probability of positive class

submission_df.to_csv('submission.csv', index=False)

In [21]:
import time
# Generate submission file
submission_df = pd.DataFrame({'id': test_ids})
# Iterate over each target
for i, target in enumerate(y_train.columns):
    # Fetch the probability of the positive class for each target
    # Store the positive probabilities in the respective target column
    submission_df[target] = best_test_probs[i][:, 1]  # Probability of positive class
    time.sleep(6)
    print(submission_df) 
submission_df.to_csv('submission.csv', index=False)

          id    Pastry
0      19219  0.521075
1      19220  0.169782
2      19221  0.000834
3      19222  0.085488
4      19223  0.001323
...      ...       ...
12809  32028  0.048068
12810  32029  0.065993
12811  32030  0.000068
12812  32031  0.293867
12813  32032  0.000395

[12814 rows x 2 columns]
          id    Pastry  Z_Scratch
0      19219  0.521075   0.000367
1      19220  0.169782   0.010560
2      19221  0.000834   0.028944
3      19222  0.085488   0.000250
4      19223  0.001323   0.000322
...      ...       ...        ...
12809  32028  0.048068   0.060690
12810  32029  0.065993   0.001254
12811  32030  0.000068   0.000099
12812  32031  0.293867   0.002847
12813  32032  0.000395   0.006685

[12814 rows x 3 columns]
          id    Pastry  Z_Scratch  K_Scatch
0      19219  0.521075   0.000367  0.000660
1      19220  0.169782   0.010560  0.003659
2      19221  0.000834   0.028944  0.028769
3      19222  0.085488   0.000250  0.000099
4      19223  0.001323   0.000322  0.000383


In [20]:
y_train.columns

Index(['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps',
       'Other_Faults'],
      dtype='object')

In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier
from catboost import CatBoostClassifier

def load_train_file():
    train_file = filedialog.askopenfilename(title="Select Train File")
    if train_file:
        train_entry.delete(0, tk.END)
        train_entry.insert(0, train_file)
        messagebox.showinfo("File Loaded", "Train file loaded successfully!")
    else:
        messagebox.showerror("Error", "Please select a train file.")

def load_test_file():
    test_file = filedialog.askopenfilename(title="Select Test File")
    if test_file:
        test_entry.delete(0, tk.END)
        test_entry.insert(0, test_file)
        messagebox.showinfo("File Loaded", "Test file loaded successfully!")
    else:
        messagebox.showerror("Error", "Please select a test file.")

def train_and_predict():
    train_file = train_entry.get()
    test_file = test_entry.get()
    
    if not train_file or not test_file:
        messagebox.showerror("Error", "Please select both train and test files.")
        return
    
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)
    
    X_train = train_data.drop(['id', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
                               'Dirtiness', 'Bumps', 'Other_Faults'], axis=1)
    y_train = train_data[['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
                          'Dirtiness', 'Bumps', 'Other_Faults']]
    
    catboost_classifier = CatBoostClassifier()

    best_model = MultiOutputClassifier(catboost_classifier, n_jobs=-1).fit(X_train, y_train)

    test_ids = test_data['id']
    test_features = test_data.drop('id', axis=1)
    best_test_probs = best_model.predict_proba(test_features)

    submission_df = pd.DataFrame({'id': test_ids})
    for i, target in enumerate(train_data[['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']].columns):
        submission_df[target] = best_test_probs[i][:, 1]

    submission_df.to_csv('submission.csv', index=False)
    messagebox.showinfo("Info", "Prediction completed and submission file generated!")

root = tk.Tk()
root.title("Fault Classification Model")

train_frame = tk.Frame(root)
train_frame.pack(pady=10)

train_label = tk.Label(train_frame, text="Train File:")
train_label.grid(row=0, column=0, padx=5, pady=5)

train_entry = tk.Entry(train_frame, width=40)
train_entry.grid(row=0, column=1, padx=5, pady=5)

train_button = tk.Button(train_frame, text="Browse", command=load_train_file)
train_button.grid(row=0, column=2, padx=5, pady=5)

test_frame = tk.Frame(root)
test_frame.pack(pady=10)

test_label = tk.Label(test_frame, text="Test File:")
test_label.grid(row=0, column=0, padx=5, pady=5)

test_entry = tk.Entry(test_frame, width=40)
test_entry.grid(row=0, column=1, padx=5, pady=5)

test_button = tk.Button(test_frame, text="Browse", command=load_test_file)
test_button.grid(row=0, column=2, padx=5, pady=5)

predict_button = tk.Button(root, text="Predict", command=train_and_predict)
predict_button.pack(pady=5)

root.mainloop()
