# Email Spam Detection 

In this perticular project we will use different approaches for determining which approach works best for prediction

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Let's load the dataset and get familier with the data

In [3]:
df = pd.read_csv("data/emails.csv")

In [4]:
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns")

There are 5172 rows and 3002 columns


In [5]:
df.head(2)

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0


In [6]:
df.tail(2)

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1
5171,Email 5172,22,24,5,1,6,5,148,8,2,...,0,0,0,0,0,0,0,0,0,0


From above we get a idea that the emails are classifiec as spam or not spam by viewing the frequency of most common words in the dataset

In [7]:
df.isna().values.any()

False

There are no null values in the dataset

In [8]:
df.duplicated().values.any()

False

There are no duplicates in the dataset

## Sklearn Approach

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [10]:
X = df.drop(["Email No.", "Prediction"], axis=1)
y = df.Prediction

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [12]:
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [13]:
lreg = LogisticRegression(verbose=1, max_iter=10000)

In [14]:
lreg.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.9s finished


In [15]:
pred = lreg.predict(X_train)

In [16]:
lreg.score(X_test, y_test)

0.9632850241545894

Without using the normalization the accuracy score for testing data is 96.32 %

## Let's try the same for normalized data

In [17]:
mean = X.mean()
std = X.std()
Z = (X - mean) / std

In [18]:
X_norm_train, X_norm_test, y_norm_train, y_norm_test = train_test_split(Z, y, test_size=0.4, random_state=42)

In [19]:
X_norm_valid, X_norm_test, y_norm_vlid, y_norm_test = train_test_split(X_norm_test, y_norm_test, test_size=0.5, random_state=42)

In [20]:
l_norm_reg = LogisticRegression(max_iter=1000).fit(X_norm_train, y_norm_train)

In [21]:
l_norm_reg.score(X_norm_test, y_norm_test)

0.9623188405797102

There is not much of a difference

## LGBMClassifier Approach

In [22]:
from lightgbm import LGBMClassifier



In [23]:
from optuna.samplers import TPESampler
import optuna

In [24]:
X_norm_test.head()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
3525,-0.309967,-0.334375,-0.293866,-0.343303,-0.453947,-0.261167,-0.314218,-0.339916,-0.003496,-0.342298,...,-0.029187,-0.04752,-0.062938,-0.091129,-0.17212,-0.044193,-0.047325,-0.05628,-0.329017,-0.070964
2552,-0.224824,-0.439257,-0.293866,-0.343303,-0.240296,-0.421685,-0.474083,-0.339916,-0.003496,-0.446022,...,-0.029187,-0.04752,-0.062938,-0.091129,-0.17212,-0.044193,-0.047325,-0.05628,-0.329017,-0.070964
2576,0.626601,0.609557,-0.010201,-0.177904,1.468915,0.220386,0.679225,0.587202,-0.290528,-0.031125,...,-0.029187,-0.04752,-0.062938,-0.091129,-0.17212,-0.044193,-0.047325,-0.05628,0.750041,-0.070964
1684,0.030603,-0.439257,-0.22295,-0.508702,-0.453947,0.059868,-0.348475,0.123643,-0.147012,-0.342298,...,-0.029187,-0.04752,-0.062938,-0.091129,-0.17212,-0.044193,-0.047325,-0.05628,0.030669,-0.070964
4238,-0.565395,-0.64902,-0.293866,-0.508702,-0.667599,-0.421685,-0.576853,-0.339916,-0.290528,-0.497884,...,-0.029187,-0.04752,-0.062938,-0.091129,-0.17212,-0.044193,-0.047325,-0.05628,-0.329017,-0.070964


In [25]:
# def objective(trial, X_train, y_train, X_test, y_test):
#     # Define parameters to be optimized for the LGBMClassifier
#     param = {
#         "objective": "cross_entropy",
#         "metric": "logloss",
#         "verbosity": -1,
#         "boosting_type": "gbdt",
#         "random_state": 42,
#         "num_class": 1,
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05),
#         "n_estimators": trial.suggest_int("n_estimators", 400, 600),
#         "lambda_l1": trial.suggest_float("lambda_l1", 0.005, 0.015),
#         "lambda_l2": trial.suggest_float("lambda_l2", 0.02, 0.06),
#         "max_depth": trial.suggest_int("max_depth", 6, 14),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
#         "subsample": trial.suggest_float("subsample", 0.8, 1.0),
#         "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
#     }

#     # Create an instance of LGBMClassifier with the suggested parameters
#     lgbm_classifier = LGBMClassifier(**param)
    
#     # Fit the classifier on the training data
#     lgbm_classifier.fit(X_train, y_train)

#     # Evaluate the classifier on the test data
#     score = lgbm_classifier.score(X_test, y_test)

#     return score

# # Split the data into training and test sets
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust the test_size as needed

# # Set up the sampler for Optuna optimization
# sampler = optuna.samplers.TPESampler(seed=42)  # Using Tree-structured Parzen Estimator sampler for optimization

# # Create a study object for Optuna optimization
# study = optuna.create_study(direction="maximize", sampler=sampler)

# # Run the optimization process
# study.optimize(lambda trial: objective(trial, X_train, y_train, X_valid, y_valid), n_trials=50)

# # Get the best parameters after optimization
# best_params = study.best_params

# print('='*50)
# print(best_params)

In [26]:
best_params = {
    "objective": "cross_entropy",
    "metric": "logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "random_state": 42,
    "num_class": 1,
    'learning_rate': 0.035368515574774204, 'n_estimators': 440, 'lambda_l1': 0.014100400375907813, 'lambda_l2': 0.03119201863456424, 'max_depth': 11, 'colsample_bytree': 0.592848613334175, 'subsample': 0.9463995426676174, 'min_child_samples': 44
}

In [27]:
lgbm_classifier = LGBMClassifier(**best_params)
lgbm_classifier.fit(X_train, y_train)
y_pred = lgbm_classifier.predict(X_test)
accuracy_score(y_pred, y_test)

0.9729468599033816

### With LGBMClassifier after tuning some hyperparameters we get a whooping score of 98.11 % for testing dataset

## XGBoost Approach

### XGBoost Without Optimization

In [56]:
import xgboost as xgb

In [29]:
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_valid, y_valid)
dtest = xgb.DMatrix(X_test, y_test)

In [30]:
param = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "smaple_type": "uniform",
    "normalize_type": "tree",
    "tree_method": "hist",
    # "num_class": 1,
    # "rate_drop": 0.1,
    # "skip_drop": 0.5,
    # "monotone_constraints": (-1, 1),
}

num_rounds = 100
bst = xgb.train(param, dtrain, num_rounds, evals=[(dtrain, "train"), (dvalid, "valid")], early_stopping_rounds=10, verbose_eval=5)

[0]	train-logloss:0.44110	valid-logloss:0.45209


Parameters: { "normalize_type", "smaple_type" } are not used.



[5]	train-logloss:0.19708	valid-logloss:0.21688
[10]	train-logloss:0.12292	valid-logloss:0.14624
[15]	train-logloss:0.08399	valid-logloss:0.11489
[20]	train-logloss:0.06412	valid-logloss:0.09818
[25]	train-logloss:0.04759	valid-logloss:0.08578
[30]	train-logloss:0.03938	valid-logloss:0.08065
[35]	train-logloss:0.03180	valid-logloss:0.07649
[40]	train-logloss:0.02473	valid-logloss:0.07148
[45]	train-logloss:0.02029	valid-logloss:0.06901
[50]	train-logloss:0.01639	valid-logloss:0.06600
[55]	train-logloss:0.01389	valid-logloss:0.06515
[60]	train-logloss:0.01203	valid-logloss:0.06502
[65]	train-logloss:0.01017	valid-logloss:0.06397
[70]	train-logloss:0.00881	valid-logloss:0.06451
[75]	train-logloss:0.00795	valid-logloss:0.06416
[76]	train-logloss:0.00778	valid-logloss:0.06426


In [31]:
pred = bst.predict(dtest)

In [32]:
accuracy_score(pred > 0.5, y_test)

0.9719806763285024

### Hyperparameter tuning for xgboost

In [33]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

In [34]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 40,
        'seed': 0
    }

In [35]:
def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_valid, y_valid)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    print(pred)
    accuracy = accuracy_score(y_test, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [36]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 50,
                        trials = trials)

  0%|                                                                           | 0/50 [00:00<?, ?trial/s, best loss=?]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
  2%|▉                                               | 1/50 [00:01<01:05,  1.33s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
  4%|█▉                                              | 2/50 [00:02<01:05,  1.36s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
  6%|██▉                                             | 3/50 [00:04<01:04,  1.37s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
  8%|███▊                                            | 4/50 [00:05<01:03,  1.37s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 10%|████▊                                           | 5/50 [00:06<01:02,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 12%|█████▊                                          | 6/50 [00:08<01:01,  1.39s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 14%|██████▋                                         | 7/50 [00:09<00:59,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 16%|███████▋                                        | 8/50 [00:10<00:57,  1.37s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 18%|████████▋                                       | 9/50 [00:12<00:56,  1.37s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 20%|█████████▍                                     | 10/50 [00:13<00:54,  1.36s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 22%|██████████▎                                    | 11/50 [00:15<00:52,  1.36s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 24%|███████████▎                                   | 12/50 [00:16<00:51,  1.35s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 26%|████████████▏                                  | 13/50 [00:17<00:50,  1.36s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 28%|█████████████▏                                 | 14/50 [00:19<00:48,  1.36s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 30%|██████████████                                 | 15/50 [00:20<00:47,  1.36s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 32%|███████████████                                | 16/50 [00:21<00:46,  1.37s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 34%|███████████████▉                               | 17/50 [00:23<00:44,  1.36s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 36%|████████████████▉                              | 18/50 [00:24<00:43,  1.35s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 38%|█████████████████▊                             | 19/50 [00:25<00:42,  1.36s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 40%|██████████████████▊                            | 20/50 [00:27<00:40,  1.36s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 42%|███████████████████▋                           | 21/50 [00:28<00:39,  1.37s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 44%|████████████████████▋                          | 22/50 [00:30<00:38,  1.37s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 46%|█████████████████████▌                         | 23/50 [00:31<00:37,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 48%|██████████████████████▌                        | 24/50 [00:32<00:35,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 50%|███████████████████████▌                       | 25/50 [00:34<00:34,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 52%|████████████████████████▍                      | 26/50 [00:35<00:33,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 54%|█████████████████████████▍                     | 27/50 [00:36<00:31,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 56%|██████████████████████████▎                    | 28/50 [00:38<00:30,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 58%|███████████████████████████▎                   | 29/50 [00:39<00:28,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 60%|████████████████████████████▏                  | 30/50 [00:41<00:27,  1.39s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 62%|█████████████████████████████▏                 | 31/50 [00:42<00:26,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 64%|██████████████████████████████                 | 32/50 [00:43<00:24,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 66%|███████████████████████████████                | 33/50 [00:45<00:23,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 68%|███████████████████████████████▉               | 34/50 [00:46<00:22,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 70%|████████████████████████████████▉              | 35/50 [00:48<00:20,  1.38s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 72%|█████████████████████████████████▊             | 36/50 [00:49<00:19,  1.40s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 74%|██████████████████████████████████▊            | 37/50 [00:50<00:18,  1.40s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 76%|███████████████████████████████████▋           | 38/50 [00:52<00:16,  1.40s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 78%|████████████████████████████████████▋          | 39/50 [00:53<00:15,  1.41s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 80%|█████████████████████████████████████▌         | 40/50 [00:55<00:14,  1.41s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 82%|██████████████████████████████████████▌        | 41/50 [00:56<00:12,  1.40s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 84%|███████████████████████████████████████▍       | 42/50 [00:57<00:11,  1.39s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 86%|████████████████████████████████████████▍      | 43/50 [00:59<00:09,  1.39s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 88%|█████████████████████████████████████████▎     | 44/50 [01:00<00:08,  1.39s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 90%|██████████████████████████████████████████▎    | 45/50 [01:01<00:06,  1.39s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 92%|███████████████████████████████████████████▏   | 46/50 [01:03<00:05,  1.40s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 94%|████████████████████████████████████████████▏  | 47/50 [01:04<00:04,  1.39s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 96%|█████████████████████████████████████████████  | 48/50 [01:06<00:02,  1.41s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
 98%|██████████████████████████████████████████████ | 49/50 [01:07<00:01,  1.40s/trial, best loss: -0.7053140096618358]





[0 0 0 ... 0 0 0]                                                                                                      
SCORE:                                                                                                                 
0.7053140096618358                                                                                                     
100%|███████████████████████████████████████████████| 50/50 [01:09<00:00,  1.38s/trial, best loss: -0.7053140096618358]


In [37]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.8008924215918807, 'gamma': 2.563221813792972, 'max_depth': 4.0, 'min_child_weight': 7.0, 'reg_alpha': 85.0, 'reg_lambda': 0.8738926214057564}


In [41]:
clf=xgb.XGBClassifier(
                    n_estimators =40, max_depth = 4, gamma = 2.563221813792972,
                    reg_alpha = 85,min_child_weight=7,
                    colsample_bytree=0.8008924215918807, reg_lambda=0.8738926214057564)

In [42]:
evaluation = [( X_train, y_train), ( X_valid, y_valid)]
clf.fit(X_train, y_train,
        eval_set=evaluation, eval_metric="auc",
        early_stopping_rounds=10,verbose=False)



In [45]:
pred = clf.predict(X_train)
print(pred)
accuracy = accuracy_score(y_train, pred)

[1 0 0 ... 0 1 1]


In [46]:
accuracy

0.8878504672897196

In [61]:
from hyperopt import hp, tpe, fmin, Trials, space_eval
from sklearn.model_selection import cross_val_score

# Define the search space for hyperparameters
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1.0)),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'gamma': hp.uniform('gamma', 0, 0.5),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1.0)
}

# Define objective function to minimize (in this case, negative accuracy)
def objective(params):
    # Ensure integer parameters are cast appropriately
    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])
    params['min_child_weight'] = int(params['min_child_weight'])
    
    clf = xgb.XGBClassifier(**params)
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    return -score

# Perform Bayesian Optimization
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=trials)

# Get the best parameters
best_params = space_eval(space, best)
print("Best parameters found:", best_params)


100%|███████████████████████████████████████████████| 50/50 [11:56<00:00, 14.33s/trial, best loss: -0.9700306477585581]
Best parameters found: {'colsample_bytree': 0.582496277503466, 'gamma': 0.03565320599595295, 'learning_rate': 0.1681465962028312, 'max_depth': 6.0, 'min_child_weight': 4.0, 'n_estimators': 144.0, 'reg_alpha': 0.5259197828627977, 'reg_lambda': 0.5644318616687096, 'subsample': 0.8588968222862713}


In [73]:
clf=xgb.XGBClassifier(
                    n_estimators =200, max_depth = 6, gamma = 0.03565320599595295,
                    learning_rate = 0.1681465962028312, subsample = 0.8588968222862713,
                    reg_alpha = 0.5259197828627977,min_child_weight=4,
                    colsample_bytree=0.582496277503466, reg_lambda=0.5259197828627977)

In [74]:
evaluation = [( X_train, y_train), ( X_valid, y_valid)]
clf.fit(X_train, y_train,
        eval_set=evaluation, eval_metric="auc",
        early_stopping_rounds=10,verbose=False)



In [75]:
pred = clf.predict(X_test)
print(pred)
accuracy = accuracy_score(y_test, pred)

[1 0 0 ... 1 1 1]


In [76]:
accuracy

0.970048309178744