In [1]:
#One
# Import tools we need
from sklearn.model_selection import KFold  # Tool to split data into parts (folds) for testing
import numpy as np                         # Tool for working with numbers and lists (arrays)

# Create our data
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])  # A list of 10 numbers (our data)

# Set up the splitting
kf = KFold(n_splits=10)  # Tells the tool to split data into 5 equal parts (folds)

# Loop through each split
for train_index, test_index in kf.split(data):  # Splits data 5 times, gives us indexes for train and test
    print("Train:", data[train_index], "Test:", data[test_index])  # Shows training and testing parts

Train: [ 2  3  4  5  6  7  8  9 10] Test: [1]
Train: [ 1  3  4  5  6  7  8  9 10] Test: [2]
Train: [ 1  2  4  5  6  7  8  9 10] Test: [3]
Train: [ 1  2  3  5  6  7  8  9 10] Test: [4]
Train: [ 1  2  3  4  6  7  8  9 10] Test: [5]
Train: [ 1  2  3  4  5  7  8  9 10] Test: [6]
Train: [ 1  2  3  4  5  6  8  9 10] Test: [7]
Train: [ 1  2  3  4  5  6  7  9 10] Test: [8]
Train: [ 1  2  3  4  5  6  7  8 10] Test: [9]
Train: [1 2 3 4 5 6 7 8 9] Test: [10]


# Stratified K-Fold Cross Validation (Advanced Version)

In [2]:
# Import tools we need
from sklearn.model_selection import StratifiedKFold  # Tool to split data into balanced parts
import numpy as np                           # Tool for working with numbers and lists

# Create our data
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])      # A list of 10 numbers (our data)
labels = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])     # Labels: 0 = cat, 1 = dog (5 cats, 5 dogs)

# Set up the splitting
skf = StratifiedKFold(n_splits=5)  # Splits data into 5 parts, keeping cat/dog balance

# Loop through each split
for train_index, test_index in skf.split(data, labels):  # Splits data 5 times, using data and labels
    print("Train:", data[train_index], "Test:", data[test_index])  # Shows training and testing numbers
    print("Train labels:", labels[train_index], "Test labels:", labels[test_index])  # Shows training and testing labels

Train: [ 2  3  4  5  7  8  9 10] Test: [1 6]
Train labels: [0 0 0 0 1 1 1 1] Test labels: [0 1]
Train: [ 1  3  4  5  6  8  9 10] Test: [2 7]
Train labels: [0 0 0 0 1 1 1 1] Test labels: [0 1]
Train: [ 1  2  4  5  6  7  9 10] Test: [3 8]
Train labels: [0 0 0 0 1 1 1 1] Test labels: [0 1]
Train: [ 1  2  3  5  6  7  8 10] Test: [4 9]
Train labels: [0 0 0 0 1 1 1 1] Test labels: [0 1]
Train: [1 2 3 4 6 7 8 9] Test: [ 5 10]
Train labels: [0 0 0 0 1 1 1 1] Test labels: [0 1]


## xgboost

In [3]:
! pip install xgboost



In [4]:
# Step 1: Import libraries
from xgboost import XGBClassifier          # XGBoost model
from sklearn.model_selection import KFold  # For K-Fold splitting
from sklearn.metrics import accuracy_score # To check how good our guesses are
import numpy as np                         # For math and arrays

# Step 2: Create fake data (like a toy example)
# Features (X): 2 columns (e.g., toy size, toy price)
X = np.array([[1, 2], [2, 3], [3, 1], [4, 5], [5, 4], [2, 1], [1, 3], [3, 4], [4, 2], [5, 3]])
# Labels (y): 0 or 1 (e.g., 0 = no buy, 1 = buy)
y = np.array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1])

# Step 3: Set up K-Fold
k = 5  # Number of splits
kf = KFold(n_splits=k, shuffle=True, random_state=42)  # Split into 5 parts, mix data first

# Step 4: Create lists to store results
accuracies = []  # To save accuracy for each fold

# Step 5: Run K-Fold with XGBoost
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    # Split data into train and test for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Create XGBoost model
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    
    # Train the model on training data
    model.fit(X_train, y_train)
    
    # Make predictions on test data
    y_pred = model.predict(X_test)
    
    # Calculate accuracy (how many guesses were right)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    
    # Print result for this fold
    print(f"Fold {fold + 1}: Accuracy = {accuracy:.2f}")

# Step 6: Show average result
average_accuracy = np.mean(accuracies)
print(f"\nAverage Accuracy across {k} folds: {average_accuracy:.2f}")

Fold 1: Accuracy = 0.50
Fold 2: Accuracy = 0.00
Fold 3: Accuracy = 0.50
Fold 4: Accuracy = 0.00
Fold 5: Accuracy = 0.50

Average Accuracy across 5 folds: 0.30


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [5]:
# Import Libraries
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Generating synthetic data & splitting into train-test
X, y = make_classification(n_samples=1000, n_features=20, n_informative=2, n_redundant=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [7]:
# XGBoost Classifier
clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf.fit(X_train, y_train)


In [8]:
clf.score(X_test,y_test)

0.9133333333333333

In [9]:
# Perform k-Fold Cross-Validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)
kf_scores = cross_val_score(clf, X, y, cv=kf)
kf_scores

array([0.915, 0.925, 0.91 , 0.9  , 0.885])

In [10]:
# Perform Stratified k-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
skf_scores = cross_val_score(clf, X, y, cv=skf)
skf_scores

array([0.945, 0.92 , 0.89 , 0.885, 0.9  ])