In [22]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# Load datasets
train_data = pd.read_csv('/kaggle/input/506comp/train.csv')
test_data = pd.read_csv('/kaggle/input/506comp/test.csv')

In [23]:
# Feature extraction: Process date and time features
train_data['timestamp'] = pd.to_datetime(train_data['trans_date']).astype('int64') // 10**9
test_data['timestamp'] = pd.to_datetime(test_data['trans_date']).astype('int64') // 10**9

train_data['transaction_hour'] = pd.to_datetime(train_data['trans_time']).dt.hour
test_data['transaction_hour'] = pd.to_datetime(test_data['trans_time']).dt.hour

train_data['transaction_minute'] = pd.to_datetime(train_data['trans_time']).dt.minute
test_data['transaction_minute'] = pd.to_datetime(test_data['trans_time']).dt.minute

train_data['customer_age'] = pd.to_datetime('today').year - pd.to_datetime(train_data['dob']).dt.year
test_data['customer_age'] = pd.to_datetime('today').year - pd.to_datetime(test_data['dob']).dt.year

# Remove unnecessary columns
train_data.drop(columns=['trans_time', 'dob'], inplace=True, errors='ignore')
test_data.drop(columns=['trans_time', 'dob'], inplace=True, errors='ignore')

In [27]:
 #Prepare training and test sets
X_train = train_data.drop(columns=['is_fraud', 'id', 'trans_num'])
y_train = train_data['is_fraud']
X_test = test_data.drop(columns=['id', 'trans_num'])

# Ensure consistent column alignment between training and test data
X_test = X_test[X_train.columns]

# Encode categorical variables
categorical_columns = X_train.select_dtypes(include=['object']).columns
encoders = {}

for column in categorical_columns:
    encoder = LabelEncoder()
    X_train[column] = encoder.fit_transform(X_train[column].astype(str))
    X_test[column] = X_test[column].astype(str).apply(lambda x: x if x in encoder.classes_ else 'Unknown')
    encoder.classes_ = np.append(encoder.classes_, 'Unknown')
    X_test[column] = encoder.transform(X_test[column])
    encoders[column] = encoder

# Implement Stratified K-Fold Cross-Validation
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
out_of_fold_predictions = np.zeros(X_train.shape[0])
final_test_predictions = np.zeros(X_test.shape[0])

In [28]:
# Configure the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=2,  # Adjusted for imbalanced data
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=10,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

In [30]:
# Train the model using cross-validation
for fold_number, (train_indices, validation_indices) in enumerate(folds.split(X_train, y_train)):
    print(f"Training fold {fold_number + 1}...")
    X_train_fold, X_validation_fold = X_train.iloc[train_indices], X_train.iloc[validation_indices]
    y_train_fold, y_validation_fold = y_train.iloc[train_indices], y_train.iloc[validation_indices]

    xgb_model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_validation_fold, y_validation_fold)],
        early_stopping_rounds=50,
        verbose=100
    )

    # Generate predictions for the validation set and test data
    out_of_fold_predictions[validation_indices] = xgb_model.predict_proba(X_validation_fold)[:, 1]
    final_test_predictions += xgb_model.predict_proba(X_test)[:, 1] / folds.n_splits

# Evaluate performance on validation data
binary_predictions = (out_of_fold_predictions > 0.5).astype(int)
validation_f1_score = f1_score(y_train, binary_predictions)
print(f"Cross-Validated F1 Score: {validation_f1_score:.4f}")

# Determine feature importance
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("Top 10 Features:")
print(feature_importance.head(10))

Training fold 1...
[0]	validation_0-logloss:0.37675
[100]	validation_0-logloss:0.02630
[200]	validation_0-logloss:0.01913
[300]	validation_0-logloss:0.01636
[400]	validation_0-logloss:0.01542
[500]	validation_0-logloss:0.01500
[600]	validation_0-logloss:0.01480
[651]	validation_0-logloss:0.01479
Training fold 2...
[0]	validation_0-logloss:0.37669
[100]	validation_0-logloss:0.02647
[200]	validation_0-logloss:0.01910
[300]	validation_0-logloss:0.01725
[400]	validation_0-logloss:0.01665
[500]	validation_0-logloss:0.01620
[579]	validation_0-logloss:0.01619
Training fold 3...
[0]	validation_0-logloss:0.37695
[100]	validation_0-logloss:0.02657
[200]	validation_0-logloss:0.01958
[300]	validation_0-logloss:0.01744
[400]	validation_0-logloss:0.01663
[500]	validation_0-logloss:0.01631
[600]	validation_0-logloss:0.01611
[654]	validation_0-logloss:0.01615
Training fold 4...
[0]	validation_0-logloss:0.37675
[100]	validation_0-logloss:0.02658
[200]	validation_0-logloss:0.01909
[300]	validation_0-log

In [32]:
# Generate submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'is_fraud': (final_test_predictions > 0.5).astype(int)
})
submission.to_csv('submission_final.csv', index=False)
print("Submission file saved as 'submission_final.csv'")


Submission file saved as 'submission_final.csv'
