In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBClassifier

# Load your dataset
train = pd.read_csv('D:/Purdue-MSBAIM/Module3/CCAC Competition/DIWBB_Training.csv')
test = pd.read_csv('D:/Purdue-MSBAIM/Module3/CCAC Competition/DIWBB_Test.csv')



  train = pd.read_csv('D:/Purdue-MSBAIM/Module3/CCAC Competition/DIWBB_Training.csv')


In [2]:
# Convert datetime columns
train['CustomerFirstWBBActionDate'] = pd.to_datetime(train['CustomerFirstWBBActionDate'])
train['Year'] = train['CustomerFirstWBBActionDate'].dt.year
train['Month'] = train['CustomerFirstWBBActionDate'].dt.month
train['Day'] = train['CustomerFirstWBBActionDate'].dt.day
train['DayOfWeek'] = train['CustomerFirstWBBActionDate'].dt.dayofweek

# Drop columns that are less relevant or have a lot of unique values
train.drop(columns=['RecordID', 'CustomerID'], inplace=True)

# Define categorical and numerical columns
categorical_cols = [col for col in train.columns if train[col].dtype == "object" and col != 'ActivityType']
numerical_cols = [col for col in train.columns if train[col].dtype in ['int64', 'float64']]

# Preprocessing for numerical and categorical data
numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train['ActivityType'])

# Define the model
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Create the preprocessing and modeling pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])

# Split data into features and target
X = train.drop('ActivityType', axis=1)
y = y_encoded

# Split data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# Hyperparameter tuning setup
param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1],
    'model__max_depth': [3, 5, 7]
    # Add other parameters here
}

# Grid search with cross-validation
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best hyperparameters: ", grid_search.best_params_)

# Model evaluation using cross-validation
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='accuracy')
print("Cross-validated scores: ", cv_scores)

Best hyperparameters:  {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200}
Cross-validated scores:  [0.98736083 0.98695434 0.98724106 0.98817289 0.98707381]


In [3]:
from sklearn.metrics import classification_report, confusion_matrix

# Ensure to use the best estimator from the grid search for predictions
best_model = grid_search.best_estimator_

# Predict on the training set
y_pred_train = best_model.predict(X_train)

# Predict on the validation set
y_pred_valid = best_model.predict(X_valid)

# Evaluate the model on the training set
print("Classification Report (Training Set):")
print(classification_report(y_train, y_pred_train))

print("Confusion Matrix (Training Set):")
print(confusion_matrix(y_train, y_pred_train))

# Evaluate the model on the validation set
print("\nClassification Report (Validation Set):")
print(classification_report(y_valid, y_pred_valid))

print("Confusion Matrix (Validation Set):")
print(confusion_matrix(y_valid, y_pred_valid))

Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.64      0.16      0.26       526
           1       1.00      1.00      1.00    153001
           2       0.94      0.17      0.29        87
           3       0.92      0.97      0.95     10449
           4       0.91      0.11      0.19       703
           5       0.71      0.87      0.78      2646

    accuracy                           0.99    167412
   macro avg       0.85      0.55      0.58    167412
weighted avg       0.99      0.99      0.99    167412

Confusion Matrix (Training Set):
[[    85      0      0    137      0    304]
 [     0 153001      0      0      0      0]
 [     1      0     15     29      1     41]
 [     7      0      0  10115      2    325]
 [     0      0      0    354     75    274]
 [    39      0      1    303      4   2299]]

Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       

In [4]:
# Preprocess the test dataset
test['CustomerFirstWBBActionDate'] = pd.to_datetime(test['CustomerFirstWBBActionDate'])
test['Year'] = test['CustomerFirstWBBActionDate'].dt.year
test['Month'] = test['CustomerFirstWBBActionDate'].dt.month
test['Day'] = test['CustomerFirstWBBActionDate'].dt.day
test['DayOfWeek'] = test['CustomerFirstWBBActionDate'].dt.dayofweek  # Add this line
test.drop(columns=['RecordID', 'CustomerID', 'CustomerFirstWBBActionDate'], inplace=True)

# Now apply the preprocessing and model pipeline to the test set
# No need to manually apply the preprocessor as the pipeline handles it
test_predictions_encoded = grid_search.best_estimator_.predict(test)

# Inverse transform the predictions to get original labels
test_predictions = label_encoder.inverse_transform(test_predictions_encoded)

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame({'ActivityType_Predictions': test_predictions})

# Save the predictions to a CSV file
predictions_file_path = 'D:/Purdue-MSBAIM/Module3/CCAC Competition/test_predictions_XGB_new.csv'
predictions_df.to_csv(predictions_file_path, index=False)

print(f"Predictions saved to {predictions_file_path}")

Predictions saved to D:/Purdue-MSBAIM/Module3/CCAC Competition/test_predictions_XGB_new.csv
