In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

In [3]:
# 1. Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
column_names = [f'feature_{i}' for i in range(57)] + ['target']
df = pd.read_csv(url, header=None, names=column_names)

In [6]:
print(column_names)
#print(df.head(10))

['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'target']


In [7]:
print(df.head(10))

   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0       0.00       0.64       0.64        0.0       0.32       0.00   
1       0.21       0.28       0.50        0.0       0.14       0.28   
2       0.06       0.00       0.71        0.0       1.23       0.19   
3       0.00       0.00       0.00        0.0       0.63       0.00   
4       0.00       0.00       0.00        0.0       0.63       0.00   
5       0.00       0.00       0.00        0.0       1.85       0.00   
6       0.00       0.00       0.00        0.0       1.92       0.00   
7       0.00       0.00       0.00        0.0       1.88       0.00   
8       0.15       0.00       0.46        0.0       0.61       0.00   
9       0.06       0.12       0.77        0.0       0.19       0.32   

   feature_6  feature_7  feature_8  feature_9  ...  feature_48  feature_49  \
0       0.00       0.00       0.00       0.00  ...        0.00       0.000   
1       0.21       0.07       0.00       0.94  ...        0.00

In [8]:
# 2. Features and target
x = df.drop('target', axis=1)
y = df['target']
print(x)
print(y)

      feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0          0.00       0.64       0.64        0.0       0.32       0.00   
1          0.21       0.28       0.50        0.0       0.14       0.28   
2          0.06       0.00       0.71        0.0       1.23       0.19   
3          0.00       0.00       0.00        0.0       0.63       0.00   
4          0.00       0.00       0.00        0.0       0.63       0.00   
...         ...        ...        ...        ...        ...        ...   
4596       0.31       0.00       0.62        0.0       0.00       0.31   
4597       0.00       0.00       0.00        0.0       0.00       0.00   
4598       0.30       0.00       0.30        0.0       0.00       0.00   
4599       0.96       0.00       0.00        0.0       0.32       0.00   
4600       0.00       0.00       0.65        0.0       0.00       0.00   

      feature_6  feature_7  feature_8  feature_9  ...  feature_47  feature_48  \
0          0.00       0.00    

In [9]:
# 3. Handle missing values (if any)
imputer = SimpleImputer(strategy='mean')
x_imputed = imputer.fit_transform(x)


In [10]:
x_imputed

array([[0.000e+00, 6.400e-01, 6.400e-01, ..., 3.756e+00, 6.100e+01,
        2.780e+02],
       [2.100e-01, 2.800e-01, 5.000e-01, ..., 5.114e+00, 1.010e+02,
        1.028e+03],
       [6.000e-02, 0.000e+00, 7.100e-01, ..., 9.821e+00, 4.850e+02,
        2.259e+03],
       ...,
       [3.000e-01, 0.000e+00, 3.000e-01, ..., 1.404e+00, 6.000e+00,
        1.180e+02],
       [9.600e-01, 0.000e+00, 0.000e+00, ..., 1.147e+00, 5.000e+00,
        7.800e+01],
       [0.000e+00, 0.000e+00, 6.500e-01, ..., 1.250e+00, 5.000e+00,
        4.000e+01]])

In [11]:
# 4. Feature scaling
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_imputed)

In [12]:
x_scaled

array([[-3.42433707e-01,  3.30884903e-01,  7.12858774e-01, ...,
        -4.52472762e-02,  4.52979198e-02, -8.72413388e-03],
       [ 3.45359395e-01,  5.19091945e-02,  4.35129540e-01, ...,
        -2.44326749e-03,  2.50562832e-01,  1.22832407e+00],
       [-1.45921392e-01, -1.65071912e-01,  8.51723390e-01, ...,
         1.45920848e-01,  2.22110599e+00,  3.25873251e+00],
       ...,
       [ 6.40127868e-01, -1.65071912e-01,  3.83734930e-02, ...,
        -1.19382054e-01, -2.36941335e-01, -2.72627750e-01],
       [ 2.80176333e+00, -1.65071912e-01, -5.56760578e-01, ...,
        -1.27482666e-01, -2.42072958e-01, -3.38603654e-01],
       [-3.42433707e-01, -1.65071912e-01,  7.32696576e-01, ...,
        -1.24236117e-01, -2.42072958e-01, -4.01280763e-01]])

In [13]:
# 5. Train-test split
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, test_size=0.2, random_state=42
)

In [15]:
# 6. Train XGBoost Classifier
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(x_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [17]:
# 7. Predictions
y_pred = model.predict(x_test)

In [19]:
# 8. Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred)*100)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 95.87404994571118

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96       531
           1       0.97      0.94      0.95       390

    accuracy                           0.96       921
   macro avg       0.96      0.96      0.96       921
weighted avg       0.96      0.96      0.96       921

