In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from catboost import CatBoostClassifier
# import training_preprocess



In [2]:
# Load the training data through the project's preprocessing helper.
# `train_to_df` lives in the local `data_reprocessing` module (not shown here);
# it is handed the CSV path and its return value is used as the working frame.
from data_reprocessing import train_to_df

train_file = "balanced_train.csv"
dataset = train_to_df(train_file)

# Preview the first rows as a quick sanity check on the preprocessing result
dataset.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,minute,second
0,50512,15,1,19,245,0,8,21,16,35
1,80510,3,1,25,489,0,9,5,24,16
2,1768,1,1,19,439,0,8,13,46,1
3,30587,15,1,31,386,0,6,18,19,13
4,73487,3,1,8,153,0,8,3,44,35


In [3]:
# Separate the label column from the predictor columns
y = dataset['is_attributed']
X = dataset.drop(columns='is_attributed')

# Hold out 20% of the rows for validation.
# `stratify=y` keeps the label proportions the same in both partitions, and
# the fixed `random_state` makes the partition repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the resulting partition sizes
print("Data split into training and validation sets.")
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")


Data split into training and validation sets.
Training set shape: (1355291, 9)
Validation set shape: (338823, 9)


In [4]:
# Hyperparameters for the CatBoost classifier, gathered in one place
catboost_params = {
    'iterations': 1000,      # upper bound on the number of boosting rounds
    'learning_rate': 0.1,
    'depth': 6,
    'eval_metric': 'AUC',    # metric reported on the eval set while training
    'random_seed': 42,
    'verbose': 100,          # emit a progress line every 100 iterations
}
catboost_model = CatBoostClassifier(**catboost_params)

print("Starting model training...")

# Fit on the training partition and monitor the held-out validation partition;
# `early_stopping_rounds=50` is passed through to CatBoost's overfitting detector.
# NOTE(review): in the recorded run the best iteration was the final one (999),
# so training ended at the iteration cap rather than by early stopping —
# consider whether a higher `iterations` value is warranted.
catboost_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)

print("Model training completed.")

# Persist the fitted model to disk in CatBoost's JSON format
catboost_model.save_model('catboost_balanced_model.json', format='json')

print("Model saved as 'catboost_balanced_model.json'.")


Starting model training...
0:	test: 0.9153186	best: 0.9153186 (0)	total: 213ms	remaining: 3m 33s
100:	test: 0.9641275	best: 0.9641275 (100)	total: 5.01s	remaining: 44.6s
200:	test: 0.9671435	best: 0.9671435 (200)	total: 9.71s	remaining: 38.6s
300:	test: 0.9684705	best: 0.9684705 (300)	total: 14.4s	remaining: 33.5s
400:	test: 0.9692446	best: 0.9692446 (400)	total: 19.1s	remaining: 28.5s
500:	test: 0.9698688	best: 0.9698688 (500)	total: 23.6s	remaining: 23.5s
600:	test: 0.9702690	best: 0.9702690 (600)	total: 28.2s	remaining: 18.7s
700:	test: 0.9705101	best: 0.9705101 (699)	total: 32.8s	remaining: 14s
800:	test: 0.9708061	best: 0.9708061 (800)	total: 37.4s	remaining: 9.28s
900:	test: 0.9709735	best: 0.9709735 (900)	total: 41.9s	remaining: 4.6s
999:	test: 0.9711054	best: 0.9711054 (999)	total: 46.5s	remaining: 0us

bestTest = 0.9711054185
bestIteration = 999

Model training completed.
Model saved as 'catboost_balanced_model.json'.
