In [42]:
# Install XGBoost into the environment of the running kernel.
# The explicit %pip magic is used instead of a bare `pip ...` line, which only
# works when IPython's automagic is enabled and no variable named `pip` exists.
%pip install xgboost





In [3]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [5]:
# Location of the dataset CSV. Set the BMARKETING_CSV environment variable to
# point at another copy of the file; when it is unset the original hard-coded
# local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "BMARKETING_CSV",
    r"C:\Users\DELL\OneDrive\Desktop\anu course\datasets\bmarketing.csv",
)
df = pd.read_csv(DATA_PATH)

In [6]:
# Encode the target column: 'yes' -> 1, 'no' -> 0.
# The dtype guard makes this cell idempotent: once 'y' is numeric, mapping it
# again through the string dictionary would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    encoded_y = df['y'].map({'yes': 1, 'no': 0})
    # Any value other than 'yes'/'no' (including missing) maps to NaN; fail
    # loudly here instead of passing a corrupted target downstream.
    if encoded_y.isna().any():
        unexpected = sorted(df.loc[encoded_y.isna(), 'y'].astype(str).unique())
        raise ValueError(f"Unexpected labels in column 'y': {unexpected}")
    df['y'] = encoded_y.astype('int64')

In [7]:
# Show the dtype of every column (bare expression, rendered as the cell output).
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                   int64
dtype: object

In [11]:
# Separate the target column 'y' from the feature columns.
y = df['y']
x = df.drop(columns=['y'])

In [13]:
# Hold out 33% of the rows as the test split; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [15]:
# Names of the int64 / float64 feature columns, in their original column order.
numerical_features = list(
    x.select_dtypes(include=['int64', 'float64']).columns
)
print(numerical_features)

['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']


In [17]:
# Names of the object-dtype feature columns, in their original column order.
categorical_features = list(
    x.select_dtypes(include=['object']).columns
)
print(categorical_features)

['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']


In [18]:
# Numeric branch: fill missing values with the column mean, then standardise.
numerical_cols = Pipeline([
    ("Simple Imputer", SimpleImputer(strategy='mean')),
    ("Scaling", StandardScaler()),
])

In [21]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' means categories not seen during fit
# do not raise at transform time.
categorical_cols = Pipeline([
    ("Simple Imputer", SimpleImputer(strategy='most_frequent')),
    ("ohe", OneHotEncoder(handle_unknown='ignore')),
])

In [23]:
# Route each group of columns through its own sub-pipeline:
# numerical_features -> numerical_cols, categorical_features -> categorical_cols.
preprocessing = ColumnTransformer([
    ("numerical", numerical_cols, numerical_features),
    ("categorical", categorical_cols, categorical_features),
])

In [25]:
import pickle

from sklearn.base import is_classifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


# Load the candidate models (a mapping of display name -> estimator) from the
# 'sup_class_models' pickle file in the current working directory.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load a file that comes from a trusted source.
with open('sup_class_models', 'rb') as f:
    sup_class_models = pickle.load(f)

# Hyperparameter grids, keyed by the same names as sup_class_models.
# The "classifier__" prefix addresses the "classifier" step of the pipeline
# built in the loop below. Models without an entry here are skipped.
param_grids = {
    "Decision Tree": {
        "classifier__max_depth": [None, 5, 10, 20],
        "classifier__min_samples_split": [2, 5, 10],
    },
    "Random Forest": {
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [None, 5, 10],
    },
    "K Nearest Neighbor": {
        "classifier__n_neighbors": [3, 5, 7, 9],
    },
    "MLP Classifier": {
        "classifier__hidden_layer_sizes": [(50,), (100,)],
        "classifier__alpha": [0.0001, 0.001],
        "classifier__max_iter": [200],
    },
    "XG_Boost": {
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [3, 5],
        "classifier__learning_rate": [0.05, 0.1],
    },
    "Naive Bayes": {
        # Empty grid: the model is still cross-validated once with the
        # parameters it was pickled with.
    }
}

best_models = {}   # model name -> best pipeline found by the grid search
best_scores = {}   # model name -> mean cross-validated accuracy of that pipeline

for model_name, base_model in sup_class_models.items():
    # model_name is the display name, base_model the estimator instance.

    # Only classifiers can be scored with accuracy; skip anything else.
    if not is_classifier(base_model):
        print(f"{model_name}: skipped (regression model)")
        continue

    if model_name not in param_grids:
        print(f"{model_name}: no param_grid defined, skipped")
        continue

    print(f"\n Tuning {model_name} with GridSearchCV...")

    # Preprocessing lives inside the pipeline, so it is fitted as part of each
    # candidate rather than once on the whole training set.
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessing),
        ("classifier", base_model),
    ])

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grids[model_name],
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
        verbose=1
    )

    grid.fit(x_train, y_train)

    best_models[model_name] = grid.best_estimator_
    best_scores[model_name] = grid.best_score_

    print("  Best params:", grid.best_params_)
    print("  Best CV accuracy:", grid.best_score_)

# Compare best scores
print("\n Best models by CV accuracy:")
for name, score in best_scores.items():
    print(f"{name}: {score*100:.2f}%")

# max() on an empty dict raises an uninformative ValueError; give a clear
# message when every model was skipped or the pickle held no models.
if not best_scores:
    raise ValueError(
        "No model was tuned: every entry of sup_class_models was skipped "
        "(not a classifier, or no matching key in param_grids)."
    )

# Pick the winner by cross-validated accuracy, then evaluate it once on the
# held-out test split.
best_model_name = max(best_scores, key=best_scores.get)
final_model = best_models[best_model_name]


y_pred = final_model.predict(x_test)
print(f"\n Best model: {best_model_name}")
print("Test accuracy:", accuracy_score(y_test, y_pred)*100, "%")

Logistic Regression: no param_grid defined, skipped

 Tuning Decision Tree with GridSearchCV...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
  Best params: {'classifier__max_depth': 5, 'classifier__min_samples_split': 2}
  Best CV accuracy: 0.914477260373256

 Tuning Random Forest with GridSearchCV...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
  Best params: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
  Best CV accuracy: 0.9129190070664975
svm_rbf: no param_grid defined, skipped

 Tuning K Nearest Neighbor with GridSearchCV...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
  Best params: {'classifier__n_neighbors': 9}
  Best CV accuracy: 0.9036419641239354

 Tuning MLP Classifier with GridSearchCV...
Fitting 5 folds for each of 4 candidates, totalling 20 fits




  Best params: {'classifier__alpha': 0.0001, 'classifier__hidden_layer_sizes': (50,), 'classifier__max_iter': 200}
  Best CV accuracy: 0.904439210001812

 Tuning XG_Boost with GridSearchCV...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
  Best params: {'classifier__learning_rate': 0.05, 'classifier__max_depth': 5, 'classifier__n_estimators': 100}
  Best CV accuracy: 0.9173400978438122

 Tuning Naive Bayes with GridSearchCV...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
  Best params: {}
  Best CV accuracy: 0.7881500271788366

 Best models by CV accuracy:
Decision Tree: 91.45%
Random Forest: 91.29%
K Nearest Neighbor: 90.36%
MLP Classifier: 90.44%
XG_Boost: 91.73%
Naive Bayes: 78.82%

 Best model: XG_Boost
Test accuracy: 91.79724858382991 %


In [27]:
# Baseline: the shared preprocessing step followed by a LogisticRegression
# created with its default arguments.
lor_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", LogisticRegression()),
])

In [29]:
# Display the logistic-regression pipeline as the cell output.
lor_pipeline

In [31]:
# Baseline: the shared preprocessing step followed by an SVC created with its
# default arguments. Despite the "svr" in the variable name, the model is the
# SVC classifier.
svr_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", SVC()),
])

In [32]:
# Display the SVC pipeline as the cell output.
svr_pipeline

In [35]:
# Baseline: the shared preprocessing step followed by an MLPClassifier.
# random_state is fixed (same seed as the train/test split) because the MLP's
# weight initialisation and batch shuffling are random; without a seed the
# reported accuracy changes from run to run.
mlp_pipeline = Pipeline(
    steps = [("preprocessing",preprocessing),
            ("model",MLPClassifier(random_state=42))]
)

In [37]:
# Display the MLP pipeline as the cell output.
mlp_pipeline

In [39]:
# Baseline: the shared preprocessing step followed by a KNeighborsClassifier
# created with its default arguments.
knn_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", KNeighborsClassifier()),
])

In [41]:
# Display the k-nearest-neighbours pipeline as the cell output.
knn_pipeline

In [43]:
# Baseline: the shared preprocessing step followed by an XGBClassifier created
# with its default arguments.
xgb_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", XGBClassifier()),
])

In [45]:
# Display the XGBoost pipeline as the cell output.
xgb_pipeline

In [47]:
# Registry of baseline pipelines, keyed by the label printed during evaluation.
# Insertion order is the order in which the models are fitted and reported.
models = dict([
    ("logistic regression", lor_pipeline),
    ("Support Vector Machine", svr_pipeline),
    ("MultiLayer Perceptron", mlp_pipeline),
    ("K Nearest Neighbors", knn_pipeline),
    ("XG boost", xgb_pipeline),
])

In [49]:
# Empty container for per-model evaluation results.
results = dict()

In [51]:
# Fit every baseline pipeline on the training split and score it on the test
# split. Each accuracy (a fraction in [0, 1]) is stored in `results` under the
# model's label so the scores can be compared after the loop; previously
# `results` was created but never filled.
# Note: `y_pred` is overwritten on every iteration.
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name}: {acc*100:.2f}%")

logistic regression: 91.05%
Support Vector Machine: 91.15%




MultiLayer Perceptron: 89.32%
K Nearest Neighbors: 90.02%
XG boost: 91.36%
