In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Load the dataset
url = "car.data"  
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
data = pd.read_csv(url, names=column_names)

# Display the first few rows of the dataset
print(data.head())

# Convert categorical variables to numerical using Label Encoding
le = LabelEncoder()
data_encoded = data.apply(le.fit_transform)

# Split the data into features (X) and target variable (y)
X = data_encoded.drop('class', axis=1)
y = data_encoded['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost classifier
model = XGBClassifier(objective='multi:softmax', num_class=len(data['class'].unique()))
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

  buying  maint doors persons lug_boot safety  class
0  vhigh  vhigh     2       2    small    low  unacc
1  vhigh  vhigh     2       2    small    med  unacc
2  vhigh  vhigh     2       2    small   high  unacc
3  vhigh  vhigh     2       2      med    low  unacc
4  vhigh  vhigh     2       2      med    med  unacc
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97        83
           1       0.65      1.00      0.79        11
           2       1.00      1.00      1.00       235
           3       1.00      0.82      0.90        17

    accuracy                           0.98       346
   macro avg       0.91      0.94      0.91       346
weighted avg       0.99      0.98      0.98       346



In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Assuming 'car.data' is the name of your file
# If the file is in a different format or location, please adjust accordingly
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
data = pd.read_csv('car.data', names=column_names)

# Convert '5more' to 5 in the 'doors' column
data['doors'] = data['doors'].replace('5more', 5)

# Convert entire DataFrame to string
data = data.astype(str)

# Convert categorical features to numerical using Label Encoding
label_encoder = LabelEncoder()
X_encoded = data.copy()

# Apply label encoding only to categorical columns
categorical_columns = X_encoded.select_dtypes(include=['object']).columns
X_encoded[categorical_columns] = X_encoded[categorical_columns].apply(label_encoder.fit_transform)

# Separate features and target
X = X_encoded.drop('class', axis=1)
y = X_encoded['class']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create CatBoost classifier
classifier = CatBoostClassifier(iterations=150, depth=5, learning_rate=0.3, loss_function='MultiClass', verbose=15)
classifier.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = classifier.predict(X_test)
print('Classification Report:\n', classification_report(y_test, y_pred))


0:	learn: 1.0656403	total: 2.31ms	remaining: 345ms
15:	learn: 0.2187014	total: 28.7ms	remaining: 240ms
30:	learn: 0.1216388	total: 52.9ms	remaining: 203ms
45:	learn: 0.0836041	total: 79ms	remaining: 179ms
60:	learn: 0.0613959	total: 102ms	remaining: 149ms
75:	learn: 0.0468600	total: 124ms	remaining: 121ms
90:	learn: 0.0392031	total: 147ms	remaining: 95.6ms
105:	learn: 0.0321159	total: 171ms	remaining: 71.1ms
120:	learn: 0.0272209	total: 198ms	remaining: 47.4ms
135:	learn: 0.0235938	total: 223ms	remaining: 23ms
149:	learn: 0.0209672	total: 244ms	remaining: 0us
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.96        83
           1       0.69      1.00      0.81        11
           2       1.00      1.00      1.00       235
           3       1.00      0.94      0.97        17

    accuracy                           0.98       346
   macro avg       0.92      0.97      0.94       346
weighted avg       0.99      0

In [35]:
import numpy as np

# Compare individual accuracy ratings
class_labels = data['class'].unique()
for label in class_labels:
    indices_xgb = np.where(y_xgb_pred == label)[0]
    indices_catboost = np.where(y_catboost_pred == label)[0]
    
    if len(indices_xgb) > 0:
        accuracy_xgb_label = accuracy_score(y_test.iloc[indices_xgb], y_xgb_pred[indices_xgb])
        print(f"\nClass: {label}")
        print("XGBoost Classifier Individual Accuracy:", accuracy_xgb_label)
    else:
        print(f"\nClass: {label}")
        print("No instances of this class in the XGBoost test set.")
    
    if len(indices_catboost) > 0:
        accuracy_catboost_label = accuracy_score(y_test.iloc[indices_catboost], y_catboost_pred[indices_catboost])
        print("CatBoost Classifier Individual Accuracy:", accuracy_catboost_label)
    else:
        print("No instances of this class in the CatBoost test set.")


Class: unacc
No instances of this class in the XGBoost test set.
No instances of this class in the CatBoost test set.

Class: acc
No instances of this class in the XGBoost test set.
No instances of this class in the CatBoost test set.

Class: vgood
No instances of this class in the XGBoost test set.
No instances of this class in the CatBoost test set.

Class: good
No instances of this class in the XGBoost test set.
No instances of this class in the CatBoost test set.


  indices_xgb = np.where(y_xgb_pred == label)[0]
  indices_catboost = np.where(y_catboost_pred == label)[0]
