In [2]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler


In [3]:
Car = pd.read_csv('car.data', sep=",")
Car.columns = [
    "buying_price",           # vhigh
    "maintenance_price",      # vhigh.1
    "doors",                  # 2
    "persons",                # 2.1
    "luggage_boot_size",      # small
    "safety",                 # low
    "car_acceptability"       # unacc
]


## Data Cleaning

In [4]:
Car['car_acceptability'] = Car['car_acceptability'].replace({
    'unacc': 0,  # Unacceptable
    'acc': 1,    # Acceptable
    'good': 1,   # Acceptable
    'vgood': 1  # Acceptable
})


  Car['car_acceptability'] = Car['car_acceptability'].replace({


In [5]:
Car['car_acceptability'].value_counts()

car_acceptability
0    1209
1     518
Name: count, dtype: int64

In [6]:
label_encoders = {}
for column in Car.columns:
    if Car[column].dtype == 'object':  # Check if column is categorical
        le = LabelEncoder()
        Car[column] = le.fit_transform(Car[column])
        label_encoders[column] = le  # Store the encoder for future reference

# 3. Verify encoding
print("\nEncoded Data (first 5 rows):")
print(Car.head())



Encoded Data (first 5 rows):
   buying_price  maintenance_price  doors  persons  luggage_boot_size  safety  \
0             3                  3      0        0                  2       2   
1             3                  3      0        0                  2       0   
2             3                  3      0        0                  1       1   
3             3                  3      0        0                  1       2   
4             3                  3      0        0                  1       0   

   car_acceptability  
0                  0  
1                  0  
2                  0  
3                  0  
4                  0  


In [7]:
Car

Unnamed: 0,buying_price,maintenance_price,doors,persons,luggage_boot_size,safety,car_acceptability
0,3,3,0,0,2,2,0
1,3,3,0,0,2,0,0
2,3,3,0,0,1,1,0
3,3,3,0,0,1,2,0
4,3,3,0,0,1,0,0
...,...,...,...,...,...,...,...
1722,1,1,3,2,1,2,1
1723,1,1,3,2,1,0,1
1724,1,1,3,2,0,1,0
1725,1,1,3,2,0,2,1


# Random Forest

#### Hyperparam Tuning

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Features and target
X = Car.drop("car_acceptability", axis=1)  # Replace with your dataset
y = Car["car_acceptability"]               # Replace with your dataset

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Oversample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Define parameter grid
param_grid = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_features": ["sqrt", "log2", None],
    "criterion": ["gini", "entropy"]
}

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,  # Number of combinations to try
    scoring='accuracy',
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_resampled, y_resampled)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate on test set using the best model
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))




Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 400, 'max_features': None, 'criterion': 'entropy'}
Test Set Accuracy: 100.00%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       237
           1       1.00      1.00      1.00       109

    accuracy                           1.00       346
   macro avg       1.00      1.00      1.00       346
weighted avg       1.00      1.00      1.00       346



### 80/20 Split

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Features and target
X = Car.drop("car_acceptability", axis=1)  # Features
y = Car["car_acceptability"]               # Target

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Train-test split with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=i)
    
    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=i)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=400, class_weight='balanced', max_features = 'log2', criterion = 'entropy', random_state=i)
    classifier.fit(X_resampled, y_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---




Accuracy: 99.13%
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       239
           1       1.00      0.97      0.99       107

    accuracy                           0.99       346
   macro avg       0.99      0.99      0.99       346
weighted avg       0.99      0.99      0.99       346


--- Trial 2 ---




Accuracy: 95.95%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       239
           1       0.93      0.93      0.93       107

    accuracy                           0.96       346
   macro avg       0.95      0.95      0.95       346
weighted avg       0.96      0.96      0.96       346


--- Trial 3 ---




Accuracy: 98.84%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       239
           1       0.97      0.99      0.98       107

    accuracy                           0.99       346
   macro avg       0.98      0.99      0.99       346
weighted avg       0.99      0.99      0.99       346


Summary of Results Across Trials:
   Trial  Accuracy  Precision (Class 1)  Recall (Class 1)  F1-Score (Class 1)
0      1  0.991329             1.000000          0.971963            0.985782
1      2  0.959538             0.934579          0.934579            0.934579
2      3  0.988439             0.972477          0.990654            0.981481
Average Accuracy across the 3 trials: 97.98%


### 50/50

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Features and target
X = Car.drop("car_acceptability", axis=1)  # Features
y = Car["car_acceptability"]               # Target

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Train-test split with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.5, random_state=i)
    
    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=i)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=400, class_weight='balanced', max_features = 'log2', criterion = 'entropy', random_state=i)
    classifier.fit(X_resampled, y_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---




Accuracy: 98.61%
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       595
           1       0.97      0.98      0.98       269

    accuracy                           0.99       864
   macro avg       0.98      0.98      0.98       864
weighted avg       0.99      0.99      0.99       864


--- Trial 2 ---




Accuracy: 98.61%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       605
           1       0.96      0.99      0.98       259

    accuracy                           0.99       864
   macro avg       0.98      0.99      0.98       864
weighted avg       0.99      0.99      0.99       864


--- Trial 3 ---




Accuracy: 97.69%
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       594
           1       0.95      0.98      0.96       270

    accuracy                           0.98       864
   macro avg       0.97      0.98      0.97       864
weighted avg       0.98      0.98      0.98       864


Summary of Results Across Trials:
   Trial  Accuracy  Precision (Class 1)  Recall (Class 1)  F1-Score (Class 1)
0      1  0.986111             0.974170          0.981413            0.977778
1      2  0.986111             0.962547          0.992278            0.977186
2      3  0.976852             0.949640          0.977778            0.963504
Average Accuracy across the 3 trials: 98.30%


### 20/80

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Features and target
X = Car.drop("car_acceptability", axis=1)  # Features
y = Car["car_acceptability"]               # Target

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Train-test split with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.8, random_state=i)
    
    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=i)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=400, class_weight='balanced', max_features = 'log2', criterion = 'entropy', random_state=i)
    classifier.fit(X_resampled, y_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---




Accuracy: 95.15%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       956
           1       0.93      0.91      0.92       426

    accuracy                           0.95      1382
   macro avg       0.94      0.94      0.94      1382
weighted avg       0.95      0.95      0.95      1382


--- Trial 2 ---




Accuracy: 95.22%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       975
           1       0.91      0.93      0.92       407

    accuracy                           0.95      1382
   macro avg       0.94      0.94      0.94      1382
weighted avg       0.95      0.95      0.95      1382


--- Trial 3 ---




Accuracy: 94.07%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       963
           1       0.90      0.90      0.90       419

    accuracy                           0.94      1382
   macro avg       0.93      0.93      0.93      1382
weighted avg       0.94      0.94      0.94      1382


Summary of Results Across Trials:
   Trial  Accuracy  Precision (Class 1)  Recall (Class 1)  F1-Score (Class 1)
0      1  0.951520             0.928401          0.913146            0.920710
1      2  0.952243             0.912833          0.926290            0.919512
2      3  0.940666             0.902148          0.902148            0.902148
Average Accuracy across the 3 trials: 94.81%


## SVM

#### Hyperparam Tuning

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd

# Separate features and target
X = Car.drop("car_acceptability", axis=1)  # Features
y = Car["car_acceptability"]               # Target

# Define the parameter grid for tuning
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  # Regularization parameter
}

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the SVM model
svm_model = SVC(class_weight='balanced', random_state=42)

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_grid,
    n_iter=20,  # Number of random combinations to try
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'C': 1000}
Accuracy: 99.71%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       237
           1       0.99      1.00      1.00       109

    accuracy                           1.00       346
   macro avg       1.00      1.00      1.00       346
weighted avg       1.00      1.00      1.00       346





### 80/20

In [19]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd

# Separate features and target
X = Car.drop("car_acceptability", axis=1)  # Features
y = Car["car_acceptability"]               # Target

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Perform train-test split with different random seeds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    
    # Train SVM classifier
    classifier = SVC(kernel='linear', class_weight='balanced', C = 1000, random_state=i)
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")




--- Trial 1 ---
Accuracy: 65.61%
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.60      0.71       239
           1       0.47      0.78      0.58       107

    accuracy                           0.66       346
   macro avg       0.66      0.69      0.65       346
weighted avg       0.74      0.66      0.67       346


--- Trial 2 ---
Accuracy: 67.63%
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.64      0.73       239
           1       0.49      0.77      0.59       107

    accuracy                           0.68       346
   macro avg       0.67      0.70      0.66       346
weighted avg       0.74      0.68      0.69       346


--- Trial 3 ---
Accuracy: 76.30%
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.75      0.81       239
           1       0.59      0.79      0.67       107

    accurac

### 50/50

In [22]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd

# Separate features and target
X = Car.drop("car_acceptability", axis=1)  # Features
y = Car["car_acceptability"]               # Target

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Perform train-test split with different random seeds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
    
    # Train SVM classifier
    classifier = SVC(kernel='linear', class_weight='balanced', C = 1000, random_state=i)
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---
Accuracy: 68.17%
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.64      0.74       595
           1       0.49      0.77      0.60       269

    accuracy                           0.68       864
   macro avg       0.68      0.71      0.67       864
weighted avg       0.75      0.68      0.69       864


--- Trial 2 ---
Accuracy: 67.48%
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.65      0.74       605
           1       0.47      0.74      0.58       259

    accuracy                           0.67       864
   macro avg       0.66      0.69      0.66       864
weighted avg       0.74      0.67      0.69       864


--- Trial 3 ---
Accuracy: 71.53%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.69      0.77       594
           1       0.53      0.78      0.63       270

    accurac

### 20/80

In [23]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd

# Separate features and target
X = Car.drop("car_acceptability", axis=1)  # Features
y = Car["car_acceptability"]               # Target

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Perform train-test split with different random seeds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=i)
    
    # Train SVM classifier
    classifier = SVC(kernel='linear', class_weight='balanced',C = 1000, random_state=i)
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")




--- Trial 1 ---
Accuracy: 68.52%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.64      0.74       956
           1       0.49      0.78      0.60       426

    accuracy                           0.69      1382
   macro avg       0.68      0.71      0.67      1382
weighted avg       0.75      0.69      0.70      1382


--- Trial 2 ---
Accuracy: 67.95%
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.66      0.74       975
           1       0.47      0.73      0.57       407

    accuracy                           0.68      1382
   macro avg       0.66      0.69      0.66      1382
weighted avg       0.74      0.68      0.69      1382


--- Trial 3 ---
Accuracy: 68.45%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.65      0.74       963
           1       0.49      0.77      0.60       419

    accurac

## ANN

#### Hyperparam Tuning

In [24]:
from sklearn.model_selection import ParameterGrid
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define hyperparameter grid
param_grid = {
    "lr": [0.001, 0.01, 0.1],                # Learning rates to tune
    "hidden_size_1": [32, 64, 128],          # Hidden layer 1 sizes to tune
    "hidden_size_2": [16, 32, 64],           # Hidden layer 2 sizes to tune
}

# Convert the grid to a list of parameter combinations
grid = list(ParameterGrid(param_grid))

# To store results
tuning_results = []

# Iterate over all combinations of hyperparameters
for params in grid:
    print(f"\nTesting Parameters: {params}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Define model with hyperparameters
    class ANN(nn.Module):
        def __init__(self, input_size):
            super(ANN, self).__init__()
            self.fc1 = nn.Linear(input_size, params["hidden_size_1"])
            self.fc2 = nn.Linear(params["hidden_size_1"], params["hidden_size_2"])
            self.fc3 = nn.Linear(params["hidden_size_2"], 2)  # 2 output classes

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = torch.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=params["lr"])

    # Training loop
    epochs = 50  # Fixed number of epochs
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()

    # Append results
    tuning_results.append({
        "Params": params,
        "Accuracy": accuracy
    })

# Convert results to DataFrame and display the best parameters
results_df = pd.DataFrame(tuning_results)
best_result = results_df.loc[results_df["Accuracy"].idxmax()]
print("\nBest Parameters and Results:")
print(best_result)

best_params = best_result["Params"]
best_accuracy = best_result["Accuracy"]

print("\nBest Parameters:")
print(f"Hidden Layer 1 Size: {best_params['hidden_size_1']}")
print(f"Hidden Layer 2 Size: {best_params['hidden_size_2']}")
print(f"Learning Rate: {best_params['lr']}")
print(f"\nBest Accuracy: {best_accuracy * 100:.2f}%")




Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 16, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 16, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 16, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 32, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 32, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 32, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 64, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 64, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 64, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 16, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 16, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 16, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 32, 'lr': 0.001}

Testing Pa

### 80/20

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the ANN
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# To store results
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Split data with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Initialize the model
    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the model
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()
        print(f"Accuracy: {accuracy:.4f}")

    # Append results
    results.append({"Trial": i + 1, "Accuracy": accuracy})

# Summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
print(f"Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%")



--- Trial 1 ---
Epoch [10/100], Loss: 0.4656
Epoch [20/100], Loss: 0.3229
Epoch [30/100], Loss: 0.1801
Epoch [40/100], Loss: 0.0362
Epoch [50/100], Loss: 0.0104
Epoch [60/100], Loss: 0.0024
Epoch [70/100], Loss: 0.0008
Epoch [80/100], Loss: 0.0004
Epoch [90/100], Loss: 0.0002
Epoch [100/100], Loss: 0.0002
Accuracy: 0.9942

--- Trial 2 ---
Epoch [10/100], Loss: 0.4824
Epoch [20/100], Loss: 0.4126
Epoch [30/100], Loss: 0.2605
Epoch [40/100], Loss: 0.1205
Epoch [50/100], Loss: 0.0592
Epoch [60/100], Loss: 0.0496
Epoch [70/100], Loss: 0.0344
Epoch [80/100], Loss: 0.0279
Epoch [90/100], Loss: 0.0202
Epoch [100/100], Loss: 0.0161
Accuracy: 0.9769

--- Trial 3 ---
Epoch [10/100], Loss: 0.5566
Epoch [20/100], Loss: 0.2002
Epoch [30/100], Loss: 0.0552
Epoch [40/100], Loss: 0.0161
Epoch [50/100], Loss: 0.0055
Epoch [60/100], Loss: 0.0017
Epoch [70/100], Loss: 0.0007
Epoch [80/100], Loss: 0.0004
Epoch [90/100], Loss: 0.0003
Epoch [100/100], Loss: 0.0002
Accuracy: 0.9942

Summary of Results Acros

### 50/50

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the ANN
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# To store results
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Split data with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=i
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Initialize the model
    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the model
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()
        print(f"Accuracy: {accuracy:.4f}")

    # Append results
    results.append({"Trial": i + 1, "Accuracy": accuracy})

# Summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
print(f"Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%")



--- Trial 1 ---
Epoch [10/100], Loss: 0.4603
Epoch [20/100], Loss: 0.1849
Epoch [30/100], Loss: 0.0884
Epoch [40/100], Loss: 0.0313
Epoch [50/100], Loss: 0.0149
Epoch [60/100], Loss: 0.0094
Epoch [70/100], Loss: 0.0068
Epoch [80/100], Loss: 0.0054
Epoch [90/100], Loss: 0.0045
Epoch [100/100], Loss: 0.0038
Accuracy: 0.9792

--- Trial 2 ---
Epoch [10/100], Loss: 0.5024
Epoch [20/100], Loss: 0.3845
Epoch [30/100], Loss: 0.1858
Epoch [40/100], Loss: 0.1237
Epoch [50/100], Loss: 0.0706
Epoch [60/100], Loss: 0.0438
Epoch [70/100], Loss: 0.0335
Epoch [80/100], Loss: 0.0228
Epoch [90/100], Loss: 0.0209
Epoch [100/100], Loss: 0.0201
Accuracy: 0.9850

--- Trial 3 ---
Epoch [10/100], Loss: 0.6171
Epoch [20/100], Loss: 0.5510
Epoch [30/100], Loss: 0.2765
Epoch [40/100], Loss: 0.1469
Epoch [50/100], Loss: 0.0641
Epoch [60/100], Loss: 0.0347
Epoch [70/100], Loss: 0.0270
Epoch [80/100], Loss: 0.0253
Epoch [90/100], Loss: 0.0246
Epoch [100/100], Loss: 0.0243
Accuracy: 0.9676

Summary of Results Acros

### 20/80

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the ANN
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# To store results
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Split data with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=i
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Initialize the model
    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the model
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()
        print(f"Accuracy: {accuracy:.4f}")

    # Append results
    results.append({"Trial": i + 1, "Accuracy": accuracy})

# Summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
print(f"Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%")



--- Trial 1 ---
Epoch [10/100], Loss: 0.4860
Epoch [20/100], Loss: 0.3410
Epoch [30/100], Loss: 0.2686
Epoch [40/100], Loss: 0.1746
Epoch [50/100], Loss: 0.0912
Epoch [60/100], Loss: 0.0527
Epoch [70/100], Loss: 0.0367
Epoch [80/100], Loss: 0.0183
Epoch [90/100], Loss: 0.0086
Epoch [100/100], Loss: 0.0063
Accuracy: 0.9486

--- Trial 2 ---
Epoch [10/100], Loss: 0.4126
Epoch [20/100], Loss: 0.1129
Epoch [30/100], Loss: 0.0059
Epoch [40/100], Loss: 0.0007
Epoch [50/100], Loss: 0.0002
Epoch [60/100], Loss: 0.0001
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000
Accuracy: 0.9551

--- Trial 3 ---
Epoch [10/100], Loss: 0.3911
Epoch [20/100], Loss: 0.0985
Epoch [30/100], Loss: 0.0476
Epoch [40/100], Loss: 0.0434
Epoch [50/100], Loss: 0.0155
Epoch [60/100], Loss: 0.0129
Epoch [70/100], Loss: 0.0077
Epoch [80/100], Loss: 0.0021
Epoch [90/100], Loss: 0.0007
Epoch [100/100], Loss: 0.0001
Accuracy: 0.9284

Summary of Results Acros

## XG Boost

### Hyperparam Tuning Using Normal Looping

In [37]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.metrics import roc_auc_score


In [38]:
best_params = None
best_auc = 0
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

for n_estimators in [100, 200, 300]:
    for max_depth in [3, 5, 7]:
        for learning_rate in [0.01, 0.05, 0.1]:
            xgb = XGBClassifier(
                scale_pos_weight=scale_pos_weight,
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                random_state=42,
                eval_metric='logloss'
            )
            xgb.fit(X_train, y_train)
            y_proba = xgb.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_proba)
            if auc > best_auc:
                best_auc = auc
                best_params = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'learning_rate': learning_rate
                }
print(f"Best Parameters: {best_params}")


Best Parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1}


### 80/20

In [39]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Assume 'Bank' is your DataFrame with the target column 'y'
# Separate features and target
X = Car.drop("car_acceptability", axis=1)
y = Car["car_acceptability"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results
results = []

# Perform 3 trials
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")

    # Split the data with a different random seed each time
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=i
    )

    # Calculate scale_pos_weight to handle class imbalance
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

    # Initialize and train the XGBoost classifier
    xgb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        n_estimators=200,
        max_depth=3,
        learning_rate=0.05,
        random_state=i,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb.predict(X_test)
    y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

    # Calculate metrics
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })

    # Print metrics for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Summarize results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)

# Calculate and print the average metrics across trials
average_accuracy = results_df["Accuracy"].mean()
average_roc_auc = results_df["ROC-AUC"].mean()
print(f"\nAverage Accuracy across trials: {average_accuracy * 100:.2f}%")
print(f"Average ROC-AUC across trials: {average_roc_auc:.2f}")



--- Trial 1 ---
Accuracy: 99.71%
ROC-AUC Score: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       239
           1       0.99      1.00      1.00       107

    accuracy                           1.00       346
   macro avg       1.00      1.00      1.00       346
weighted avg       1.00      1.00      1.00       346


--- Trial 2 ---
Accuracy: 95.66%
ROC-AUC Score: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       239
           1       0.88      1.00      0.93       107

    accuracy                           0.96       346
   macro avg       0.94      0.97      0.95       346
weighted avg       0.96      0.96      0.96       346


--- Trial 3 ---
Accuracy: 98.27%
ROC-AUC Score: 1.00
Classification Report:


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       1.00      0.97      0.99       239
           1       0.95      1.00      0.97       107

    accuracy                           0.98       346
   macro avg       0.97      0.99      0.98       346
weighted avg       0.98      0.98      0.98       346


Summary of Results Across Trials:
   Trial  Accuracy   ROC-AUC  Precision (Class 1)  Recall (Class 1)  \
0      1  0.997110  1.000000             0.990741               1.0   
1      2  0.956647  0.998319             0.877049               1.0   
2      3  0.982659  0.999179             0.946903               1.0   

   F1-Score (Class 1)  
0            0.995349  
1            0.934498  
2            0.972727  

Average Accuracy across trials: 97.88%
Average ROC-AUC across trials: 1.00


### 50/50

In [40]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Assume 'Bank' is your DataFrame with the target column 'y'
# Separate features and target
X = Car.drop("car_acceptability", axis=1)
y = Car["car_acceptability"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results
results = []

# Perform 3 trials
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")

    # Split the data with a different random seed each time
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.5, random_state=i
    )

    # Calculate scale_pos_weight to handle class imbalance
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

    # Initialize and train the XGBoost classifier
    xgb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        n_estimators=200,
        max_depth=3,
        learning_rate=0.05,
        random_state=i,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb.predict(X_test)
    y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

    # Calculate metrics
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })

    # Print metrics for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Summarize results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)

# Calculate and print the average metrics across trials
average_accuracy = results_df["Accuracy"].mean()
average_roc_auc = results_df["ROC-AUC"].mean()
print(f"\nAverage Accuracy across trials: {average_accuracy * 100:.2f}%")
print(f"Average ROC-AUC across trials: {average_roc_auc:.2f}")



--- Trial 1 ---
Accuracy: 98.26%
ROC-AUC Score: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       595
           1       0.95      1.00      0.97       269

    accuracy                           0.98       864
   macro avg       0.97      0.99      0.98       864
weighted avg       0.98      0.98      0.98       864


--- Trial 2 ---
Accuracy: 97.22%
ROC-AUC Score: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       605
           1       0.92      1.00      0.96       259

    accuracy                           0.97       864
   macro avg       0.96      0.98      0.97       864
weighted avg       0.97      0.97      0.97       864


--- Trial 3 ---
Accuracy: 97.34%
ROC-AUC Score: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       594
      

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



### 20/80

In [41]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Assume 'Bank' is your DataFrame with the target column 'y'
# Separate features and target
X = Car.drop("car_acceptability", axis=1)
y = Car["car_acceptability"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results
results = []

# Perform 3 trials
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")

    # Split the data with a different random seed each time
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.8, random_state=i
    )

    # Calculate scale_pos_weight to handle class imbalance
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

    # Initialize and train the XGBoost classifier
    xgb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        n_estimators=200,
        max_depth=3,
        learning_rate=0.05,
        random_state=i,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb.predict(X_test)
    y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

    # Calculate metrics
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })

    # Print metrics for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Summarize results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)

# Calculate and print the average metrics across trials
average_accuracy = results_df["Accuracy"].mean()
average_roc_auc = results_df["ROC-AUC"].mean()
print(f"\nAverage Accuracy across trials: {average_accuracy * 100:.2f}%")
print(f"Average ROC-AUC across trials: {average_roc_auc:.2f}")



--- Trial 1 ---
Accuracy: 95.44%
ROC-AUC Score: 0.99
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       956
           1       0.89      0.97      0.93       426

    accuracy                           0.95      1382
   macro avg       0.94      0.96      0.95      1382
weighted avg       0.96      0.95      0.95      1382


--- Trial 2 ---
Accuracy: 95.51%
ROC-AUC Score: 0.99
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.94      0.97       975
           1       0.88      0.98      0.93       407

    accuracy                           0.96      1382
   macro avg       0.94      0.96      0.95      1382
weighted avg       0.96      0.96      0.96      1382


--- Trial 3 ---
Accuracy: 95.88%
ROC-AUC Score: 0.99
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       963
      

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

