In [19]:
# import necessary modules

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# prepare column transformers

def create_column_transformer(numeric_features, categorical_features):
    """Build the unfitted preprocessing step used by the model pipelines.

    Parameters
    ----------
    numeric_features : list
        Columns routed to a ``StandardScaler``.
    categorical_features : list
        Columns routed to a ``OneHotEncoder`` created with
        ``handle_unknown='ignore'`` and ``sparse_output=False``.

    Returns
    -------
    ColumnTransformer
        Transformer with a 'num' and a 'cat' branch; columns named in
        neither list are passed through.
    """
    column_steps = [
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ]
    # remainder='passthrough' keeps any column that is in neither list.
    return ColumnTransformer(transformers=column_steps, remainder='passthrough')


# function to evaluate the models

def evaluate_classifiers(X, y, numeric_features, categorical_features, random_state=42):
    """Fit nine baseline classifiers and report each one's hold-out F1 score.

    The data is split once (70 % train / 30 % test, stratified on ``y``).
    Each classifier is then wrapped in its own pipeline of
    column preprocessing -> SMOTE -> classifier, fitted on the training
    part and scored on the test part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the columns named in the two feature lists.
    y : array-like
        Target labels; scored with ``f1_score(..., average='binary')``.
    numeric_features : list
        Columns to standard-scale.
    categorical_features : list
        Columns to one-hot encode.
    random_state : int, default 42
        Seed for the train/test split, for SMOTE and for every classifier
        below that is given one, so repeated runs produce the same scores.

    Returns
    -------
    dict
        Maps classifier display name to its F1 score on the test split.
        Each score is also printed as it is computed.
    """
    # The split is independent of the classifier, so compute it once.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y
    )

    classifiers = {
        'Logistic Regression': LogisticRegression(random_state=random_state),
        # GaussianNB and KNeighborsClassifier have no random_state parameter.
        'Naive Bayes': GaussianNB(),
        # SVC's random_state only affects probability estimates, which are off by default.
        'Support Vector Classifier': SVC(),
        'K-Nearest Neighbors': KNeighborsClassifier(),
        'Decision Tree': DecisionTreeClassifier(random_state=random_state),
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'AdaBoost': AdaBoostClassifier(random_state=random_state),
        'Gradient Boosting': GradientBoostingClassifier(random_state=random_state),
        'XGBoost': XGBClassifier(random_state=random_state),
    }

    results = {}

    for name, classifier in classifiers.items():
        pipeline = ImbPipeline(steps=[
            # A fresh preprocessor per pipeline: a pipeline fits the step
            # objects it is given, so one shared instance would be refit
            # (and its fitted state shared) by every pipeline in this loop.
            ('preprocessor', create_column_transformer(numeric_features, categorical_features)),
            ('smote', SMOTE(random_state=random_state)),  # Handle class imbalance using SMOTE
            ('classifier', classifier)
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        f1 = f1_score(y_test, y_pred, average='binary')
        results[name] = f1
        print(f'{name}: F1 Score = {f1:.4f}')

    return results

# main:
# Load the dataframe from the CSV file.

df = pd.read_csv('train.csv')

# Features are every column except the target column 'y'.
X = df.drop(columns=['y'])

# Encode the target labels as integers and keep them as a pandas Series.
le = LabelEncoder()
y = pd.Series(le.fit_transform(df['y']))

# Split the feature columns by dtype: int64/float64 vs. object.
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Evaluate all classifiers
results = evaluate_classifiers(X, y, numeric_features, categorical_features)

# Display results
print("\n------Final F1 Scores for All Classifiers:-------")
for model_name, f1 in results.items():
    print(f'{model_name}: {f1:.4f}')
print("-------"*10)

Logistic Regression: F1 Score = 0.5393
Naive Bayes: F1 Score = 0.4478
Support Vector Classifier: F1 Score = 0.5641
K-Nearest Neighbors: F1 Score = 0.5101
Decision Tree: F1 Score = 0.4690
Random Forest: F1 Score = 0.5576




AdaBoost: F1 Score = 0.5443
Gradient Boosting: F1 Score = 0.5810
XGBoost: F1 Score = 0.5671

------Final F1 Scores for All Classifiers:-------
Logistic Regression: 0.5393
Naive Bayes: 0.4478
Support Vector Classifier: 0.5641
K-Nearest Neighbors: 0.5101
Decision Tree: 0.4690
Random Forest: 0.5576
AdaBoost: 0.5443
Gradient Boosting: 0.5810
XGBoost: 0.5671
----------------------------------------------------------------------


In [23]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Step 1: Define the ColumnTransformer
def create_column_transformer(numeric_features, categorical_features):
    """Return an unfitted ``ColumnTransformer`` for the given column lists.

    ``numeric_features`` go through a ``StandardScaler``;
    ``categorical_features`` go through a ``OneHotEncoder`` created with
    ``handle_unknown='ignore'`` and ``sparse_output=False``. Columns named
    in neither list are passed through.
    """
    scaler = StandardScaler()
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    branches = [
        ('num', scaler, numeric_features),
        ('cat', encoder, categorical_features),
    ]
    # remainder='passthrough' keeps any column that is in neither list.
    return ColumnTransformer(transformers=branches, remainder='passthrough')

# Step 2: Define Optimized Hyperparameter Grids
# Search space for the Gradient Boosting run. Every key uses the
# `<step>__<param>` form with the `classifier` prefix, matching the pipeline
# step named 'classifier' that the search is run over.
# Full grid size: 3 * 2 * 2 * 2 * 2 * 2 = 96 combinations.
param_grid_gb = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [3, 4],
    'classifier__min_samples_split': [5, 10],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__subsample': [0.8, 1.0]
}

# Search space for the XGBoost run, using the same `classifier__` prefix.
# Full grid size: 2 ** 6 = 64 combinations.
param_grid_xgb = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [3, 4],
    'classifier__min_child_weight': [1, 3],
    'classifier__gamma': [0, 0.1],
    'classifier__colsample_bytree': [0.8, 1.0]
}

# Step 3: Function to Tune Hyperparameters
def tune_hyperparameters(X, y, numeric_features, categorical_features, param_grid, model, model_name, random_state=42):
    """Run a randomised hyper-parameter search for one classifier.

    The classifier is placed in a pipeline of
    column preprocessing -> SMOTE -> classifier. The search runs on a
    70 % stratified training split (20 sampled candidates, 3-fold CV,
    F1 scoring); the winning pipeline is then scored on the remaining 30 %.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the columns named in the two feature lists.
    y : array-like
        Target labels; scored with ``f1_score(..., average='binary')``.
    numeric_features : list
        Columns to standard-scale.
    categorical_features : list
        Columns to one-hot encode.
    param_grid : dict
        Candidate values keyed by ``classifier__<param>``.
    model : estimator
        Unfitted classifier used as the pipeline's 'classifier' step. Its own
        seed is whatever the caller configured on it.
    model_name : str
        Label used in the printed summary.
    random_state : int, default 42
        Seed for the train/test split, for SMOTE and for the candidate
        sampling of the search, so repeated runs are comparable.

    Returns
    -------
    tuple
        ``(best_params, f1)``: the parameter set selected by cross-validation
        and the F1 score of the refit best pipeline on the 30 % hold-out
        split. ``f1`` is the hold-out score, not the cross-validation score.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y
    )

    pipeline = ImbPipeline(steps=[
        ('preprocessor', create_column_transformer(numeric_features, categorical_features)),
        # Seeded so the synthetic minority samples are the same on every run.
        ('smote', SMOTE(random_state=random_state)),  # Handle class imbalance using SMOTE
        ('classifier', model)
    ])

    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_grid,
        n_iter=20,  # Reduced number of iterations
        scoring=make_scorer(f1_score),
        cv=3,  # Reduced number of cross-validation folds
        verbose=2,
        random_state=random_state,
        n_jobs=-1  # Utilize all available CPU cores
    )

    # Only the training split is used for the search; the test split stays unseen.
    random_search.fit(X_train, y_train)

    best_params = random_search.best_params_
    best_model = random_search.best_estimator_

    y_pred = best_model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='binary')

    print(f'Best F1 Score for {model_name}: {f1:.4f}')
    print(f'Best Parameters for {model_name}: {best_params}')

    return best_params, f1


# Step 4: Prepare the Data and Perform Hyperparameter Tuning
# main:
# Load the dataframe from the CSV file.

# LabelEncoder is used below but is missing from this cell's import list.
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('train.csv')

# Features are every column except the target column 'y'.
X = df.drop(columns=['y'])
y = df['y']
# Encode the target labels as integers and keep them as a pandas Series.
le = LabelEncoder()
y = le.fit_transform(y)
y = pd.Series(y)

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Both estimators are seeded so the search results can be reproduced.

# Tune Gradient Boosting
best_params_gb, best_f1_gb = tune_hyperparameters(X, y, numeric_features, categorical_features, param_grid_gb, GradientBoostingClassifier(random_state=42), "Gradient Boosting")

# Tune XGBoost
best_params_xgb, best_f1_xgb = tune_hyperparameters(X, y, numeric_features, categorical_features, param_grid_xgb, XGBClassifier(random_state=42), "XGBoost")


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END classifier__learning_rate=0.1, classifier__max_depth=4, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=100, classifier__subsample=1.0; total time=  34.5s
[CV] END classifier__learning_rate=0.1, classifier__max_depth=4, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=100, classifier__subsample=1.0; total time=  34.9s
[CV] END classifier__learning_rate=0.1, classifier__max_depth=4, classifier__min_samples_leaf=1, classifier__min_samples_split=10, classifier__n_estimators=200, classifier__subsample=0.8; total time=  54.9s
[CV] END classifier__learning_rate=0.1, classifier__max_depth=4, classifier__min_samples_leaf=1, classifier__min_samples_split=10, classifier__n_estimators=200, classifier__subsample=0.8; total time=  55.3s
[CV] END classifier__learning_rate=0.1, classifier__max_depth=4, classifier__min_samples_leaf=1, classifier__min_

In [24]:
# Show the Gradient Boosting tuning result: (selected parameters, hold-out F1).
best_params_gb, best_f1_gb

({'classifier__subsample': 1.0,
  'classifier__n_estimators': 200,
  'classifier__min_samples_split': 10,
  'classifier__min_samples_leaf': 2,
  'classifier__max_depth': 3,
  'classifier__learning_rate': 0.1},
 np.float64(0.5968271334792122))

In [25]:
# Show the XGBoost tuning result: (selected parameters, hold-out F1).
best_params_xgb, best_f1_xgb

({'classifier__n_estimators': 200,
  'classifier__min_child_weight': 1,
  'classifier__max_depth': 4,
  'classifier__learning_rate': 0.1,
  'classifier__gamma': 0.1,
  'classifier__colsample_bytree': 1.0},
 np.float64(0.5947786606129398))

In [28]:
import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Step 1: Define the ColumnTransformer
def create_column_transformer(numeric_features, categorical_features):
    """Assemble the unfitted column preprocessing used ahead of the classifier.

    Parameters
    ----------
    numeric_features : list
        Columns handed to a ``StandardScaler``.
    categorical_features : list
        Columns handed to a ``OneHotEncoder`` created with
        ``handle_unknown='ignore'`` and ``sparse_output=False``.

    Returns
    -------
    ColumnTransformer
        Transformer with a 'num' and a 'cat' branch; columns named in
        neither list are passed through.
    """
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ],
        # Keep any column that is in neither list.
        remainder='passthrough',
    )

# Step 2: Define the best hyperparameters for the models
# Hard-coded copy of the XGBoost parameters shown in the earlier tuning
# output. Assigning here replaces any `best_params_xgb` left in the kernel
# by the tuning cell. Keys keep the `classifier__` step prefix.
best_params_xgb = {
    'classifier__n_estimators': 200,
    'classifier__min_child_weight': 1,
    'classifier__max_depth': 4,
    'classifier__learning_rate': 0.1,
    'classifier__gamma': 0.1,
    'classifier__colsample_bytree': 1.0
}

# Hard-coded copy of the Gradient Boosting parameters shown in the earlier
# tuning output; likewise replaces any existing `best_params_gb`.
best_params_gb = {
    'classifier__subsample': 1.0,
    'classifier__n_estimators': 200,
    'classifier__min_samples_split': 10,
    'classifier__min_samples_leaf': 2,
    'classifier__max_depth': 3,
    'classifier__learning_rate': 0.1
}

# Step 3: Prepare the data
# main:
# Load the dataframe from the CSV file.

# LabelEncoder is used below but is missing from this cell's import list.
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('train.csv')

# Features are every column except the target column 'y'.
X = df.drop(columns=['y'])
y = df['y']
# Encode the target labels as integers and keep them as a pandas Series.
le = LabelEncoder()
y = le.fit_transform(y)
y = pd.Series(y)

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Step 4: Train and save the XGBoost model
preprocessor = create_column_transformer(numeric_features, categorical_features)

# Strip the 'classifier__' step prefix so the tuned values can be passed to
# the estimator constructor. maxsplit=1 keeps the rest of the key intact even
# if a parameter name itself contains '__'.
xgb_params = {k.split('__', 1)[1]: v for k, v in best_params_xgb.items()}

xgb_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # Handle class imbalance using SMOTE (seeded for reproducibility)
    ('classifier', XGBClassifier(random_state=42, **xgb_params))
])

xgb_pipeline.fit(X_train, y_train)
xgb_f1 = f1_score(y_test, xgb_pipeline.predict(X_test))
print(f"XGBoost F1 Score: {xgb_f1:.4f}")

# Save the XGBoost model
joblib.dump(xgb_pipeline, 'xgb_model.pkl')

# Step 5: Train the Gradient Boosting model
gb_params = {k.split('__', 1)[1]: v for k, v in best_params_gb.items()}

gb_pipeline = ImbPipeline(steps=[
    # A separate preprocessor instance: reusing `preprocessor` would refit the
    # object that is already part of the fitted xgb_pipeline.
    ('preprocessor', create_column_transformer(numeric_features, categorical_features)),
    ('smote', SMOTE(random_state=42)),  # Handle class imbalance using SMOTE (seeded for reproducibility)
    ('classifier', GradientBoostingClassifier(random_state=42, **gb_params))
])

gb_pipeline.fit(X_train, y_train)
gb_f1 = f1_score(y_test, gb_pipeline.predict(X_test))
print(f"Gradient Boosting F1 Score: {gb_f1:.4f}")

# Saving the Gradient Boosting model is currently disabled.
# joblib.dump(gb_pipeline, 'gb_model.pkl')


XGBoost F1 Score: 0.6016
Gradient Boosting F1 Score: 0.5990


In [31]:
# Display the hold-out feature frame from the 30 % test split.
# NOTE(review): this renders the whole frame; `X_test.head()` would keep the output short.
X_test

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome
37735,40,blue-collar,divorced,secondary,cellular,14,may,449,1,unknown
44332,35,unemployed,single,tertiary,cellular,29,jul,200,4,failure
4432,38,self-employed,married,secondary,unknown,20,may,775,1,unknown
38725,35,blue-collar,married,secondary,cellular,15,may,1313,7,failure
38581,44,services,single,secondary,cellular,15,may,550,2,failure
...,...,...,...,...,...,...,...,...,...,...
15913,40,services,married,secondary,cellular,22,jul,87,3,unknown
20622,38,management,married,tertiary,cellular,12,aug,1092,5,unknown
41684,50,blue-collar,married,primary,telephone,2,oct,382,2,unknown
32749,38,technician,single,tertiary,cellular,17,apr,222,1,unknown


In [30]:
# Summary statistics (count, mean, std, quartiles, min/max) of the `dur` column.
df.dur.describe()

count    45211.000000
mean       258.163080
std        257.527812
min          0.000000
25%        103.000000
50%        180.000000
75%        319.000000
max       4918.000000
Name: dur, dtype: float64

In [34]:
# Encoded target at position 41684, which is the index label of one of the
# X_test rows shown above. `y` was rebuilt with pd.Series(...) from an array,
# so it has a default 0..n-1 index and position equals label here.
y.iloc[41684]

np.int64(0)

In [35]:
# Display the integer-encoded target Series produced by LabelEncoder.
y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Length: 45211, dtype: int64