In [15]:
# Install packages if needed
!pip install xgboost scikit-learn pandas numpy




In [16]:
# Perform 5-Fold Cross-Validation and measure model accuracy and time
import xgboost as xgb
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import os


In [17]:
datasets = {
    "100": "df_100.csv",
    "1000": "df_1k.csv",
    "10000": "df_10k.csv",
    "100000": "df_100k.csv",
    "1000000": "df_1m.csv"
    # "10000000": "df_10m.csv"  # Uncomment if you upload df_10m.csv
}


In [None]:
# Perform 5-Fold Cross-Validation and measure model accuracy and time
# Model creation with random_state for reproducibility
# Empty list to store results
results = []

# Train XGBoost with 5-fold CV on each dataset
for size, file in datasets.items():
    print(f"Processing {size} rows...")

    # Load data
    df = pd.read_csv(file)
    X = df.drop('outcome', axis=1)
    y = df['outcome']

    # Define model
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    kfold = StratifiedKFold(n_splits=5)

    # Time training
    start = time.time()
    scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    end = time.time()

    # Calculate results
    accuracy = np.mean(scores)
    elapsed = end - start

    # Save results
    results.append(["Python XGBoost + 5fold", size, accuracy, elapsed])


In [19]:
# Create DataFrame
res_df = pd.DataFrame(results, columns=["Method used", "Dataset Size", "Accuracy", "Time Taken"])

# Save to CSV
res_df.to_csv("python_results.csv", index=False)

# Show results
print(res_df)


              Method used Dataset Size  Accuracy  Time Taken
0  Python XGBoost + 5fold          100  0.860000    0.335250
1  Python XGBoost + 5fold         1000  0.946000    2.237376
2  Python XGBoost + 5fold        10000  0.973300    9.927746
3  Python XGBoost + 5fold       100000  0.986940    5.572298
4  Python XGBoost + 5fold      1000000  0.991701   43.797805
