In [1]:
import pandas as pd
import time
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Read the training set, the test set and the sample submission in one pass.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/Cascade_cup/train_age_dataset.csv',
        '/kaggle/input/Cascade_cup/test_age_dataset.csv',
        '/kaggle/input/Cascade_cup/sample_submission.csv',
    )
)

In [3]:
# One-hot encode the categorical columns in both frames.
train = pd.get_dummies(train, columns=['tier', 'gender'])
test = pd.get_dummies(test, columns=['tier', 'gender'])

# get_dummies only creates columns for the categories present in the frame it
# is given. If a tier/gender value occurs in train but not in test, the
# corresponding dummy column would be missing from test and selecting the
# training feature columns from it would raise a KeyError. Add any such
# column as all-zeros ("category never observed").
for col in train.columns:
    if col.startswith(('tier_', 'gender_')) and col not in test.columns:
        test[col] = 0

# Shift the target by -1 so the class labels start at 0.
# REMEMBER: add 1 back to the predicted classes before writing the submission.
train['age_group'] = train['age_group'] - 1

In [4]:
# Columns that must never be used as model inputs:
#   - 'age_group': the target
#   - 'gender_2' : presumably dropped as redundant with the other gender dummy (TODO confirm)
#   - 'kfold'    : fold id added by the CV-split cell; excluding it here keeps
#                  this cell correct even when it is re-run after that cell
non_feature_cols = ['age_group', 'gender_2', 'kfold']
feature_cols = [col for col in train.columns.tolist() if col not in non_feature_cols]
target_cols = ['age_group']

In [5]:
# KFOLD
# Assign every training row to one of 5 stratified folds (stored in 'kfold').
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# skf.split yields *positional* row indices, so the fold ids are collected in
# a positional array rather than written through label-based .loc; this stays
# correct even if `train` does not have a default RangeIndex, and avoids the
# intermediate float column that then had to be cast back to int.
fold_ids = np.full(len(train), -1, dtype=int)
for i, (_, val) in enumerate(skf.split(train[target_cols], train[target_cols])):
    fold_ids[val] = i
train['kfold'] = fold_ids

In [6]:
# A MODEL WITH 2 XGB CLASSIFIERS
class DoubleXGB():
    """Soft-voting ensemble that averages the probabilities of two XGBoost classifiers.

    Parameters
    ----------
    params1, params2 : dict or None
        Optional keyword arguments for the first / second ``xgb.XGBClassifier``.
        They are merged over the default ``{'tree_method': 'gpu_hist'}``.
        With the defaults both members are configured identically (as in the
        original implementation), so averaging them adds nothing; pass
        different settings (e.g. seeds together with ``subsample`` /
        ``colsample_bytree``, or different depths) to get two distinct models.

    NOTE(review): recent XGBoost releases deprecate ``tree_method='gpu_hist'``
    in favour of ``tree_method='hist', device='cuda'`` - confirm against the
    installed version.
    """

    def __init__(self, params1=None, params2=None):
        defaults = {'tree_method': 'gpu_hist'}
        self.mod1 = xgb.XGBClassifier(**{**defaults, **(params1 or {})})
        self.mod2 = xgb.XGBClassifier(**{**defaults, **(params2 or {})})

    def fit(self, x, y):
        """Fit both member models on the same data and return ``self``."""
        self.mod1.fit(x, y)
        self.mod2.fit(x, y)
        # Returning self allows chaining (`DoubleXGB().fit(x, y).predict(x)`).
        return self

    def predict_proba(self, x):
        """Return the element-wise mean of the two models' class probabilities."""
        pred1 = self.mod1.predict_proba(x)
        pred2 = self.mod2.predict_proba(x)

        return (pred1 + pred2) / 2

    def predict(self, x):
        """Return the index of the most probable class for every row of ``x``."""
        # Reuse predict_proba so the averaging logic lives in one place.
        return np.argmax(self.predict_proba(x), axis=1)
        

In [7]:
def run_training():
    """Train one default ``xgb.XGBClassifier`` per CV fold.

    Reads the notebook-level globals ``train`` (which must already contain the
    ``kfold`` column), ``test``, ``feature_cols`` and ``target_cols``.
    Prints accuracy and weighted F1 on the training and validation parts of
    every fold, plus the wall-clock time per fold.

    Returns
    -------
    oof : np.ndarray, shape (len(train), n_classes)
        Out-of-fold class probabilities: each row is predicted by the model
        of the fold in which that row was held out.
    pred : np.ndarray, shape (len(test), n_classes)
        Test-set class probabilities averaged over all fold models.
    """
    # Derive the sizes from the data instead of hard-coding 5 folds / 4
    # classes, so the function stays in sync with the split and the target.
    n_folds = train['kfold'].nunique()
    n_classes = train[target_cols[0]].nunique()

    oof = np.zeros((train.shape[0], n_classes))
    pred = np.zeros((test.shape[0], n_classes))

    # The test matrix does not depend on the fold: build it once.
    xtest = test[feature_cols].values

    for fold in range(n_folds):

        print(f"\nStarting FOLD: {fold}")
        start = time.time()

        # Boolean mask as a plain array so it can index both the frame and `oof`.
        val_mask = (train['kfold'] == fold).to_numpy()
        trn = train.loc[~val_mask, :]
        val = train.loc[val_mask, :]

        # 1-D targets: what both the classifier and the metrics expect.
        xtr, ytr = trn[feature_cols].values, trn[target_cols[0]].values
        xval, yval = val[feature_cols].values, val[target_cols[0]].values

        model = xgb.XGBClassifier()
        print("Fittting model ...")
        model.fit(xtr, ytr)

        print("Calculating training preds ...")
        train_preds = model.predict(xtr)
        print("Calculating validation preds ...")
        # Score the validation rows once; the class labels are the argmax of
        # the probabilities (targets are the 0-based class indices).
        val_proba = model.predict_proba(xval)
        val_preds = val_proba.argmax(axis=1)
        print("Training Accuracy Score - ", accuracy_score(ytr, train_preds))
        print("Training F1 Score - ", f1_score(ytr, train_preds, average='weighted'))

        print("Validation Accuracy Score - ", accuracy_score(yval, val_preds))
        print("Validation F1 Score - ", f1_score(yval, val_preds, average='weighted'))

        # Every training row is held out in exactly one fold.
        oof[val_mask, :] = val_proba

        # Test preds: running mean over the fold models.
        pred += model.predict_proba(xtest) / n_folds

        print(f"FOLD {fold} completed in {time.time()-start} seconds")

    return oof, pred

In [8]:
# Run the cross-validated training; returns the out-of-fold probabilities for
# the training rows and the fold-averaged probabilities for the test rows.
oof_xgb, pred_xgb = run_training()


Starting FOLD: 0
Fittting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.8241119301663764
Training F1 Score -  0.818573935066147
Validation Accuracy Score -  0.7963712976599574
Validation F1 Score -  0.7900617637782085
FOLD 0 completed in 311.6262857913971 seconds

Starting FOLD: 1
Fittting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.8215550458832884
Training F1 Score -  0.8159966763113596
Validation Accuracy Score -  0.7957678775977745
Validation F1 Score -  0.7897380371074988
FOLD 1 completed in 306.42846727371216 seconds

Starting FOLD: 2
Fittting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.822672857719981
Training F1 Score -  0.817189915415303
Validation Accuracy Score -  0.7960521605727435
Validation F1 Score -  0.7896293727218838
FOLD 2 completed in 299.2382860183716 seconds

Starting FOLD: 3
Fittting model 

In [9]:
def find_oof_score(oof):
    """Print accuracy and weighted F1 of out-of-fold predictions.

    ``oof`` is a 2-D array of per-class scores; the predicted class of a row
    is the column with the highest score. The reference labels are read from
    the notebook-level ``train[target_cols]``.
    """
    y_true = train[target_cols]
    y_hat = oof.argmax(axis=1)

    acc = accuracy_score(y_true, y_hat)
    print("OOF Accuracy Score - ", acc)

    weighted_f1 = f1_score(y_true, y_hat, average='weighted')
    print("OOF F1 Score - ", weighted_f1)

In [10]:
# Print accuracy and weighted F1 of the out-of-fold predictions.
find_oof_score(oof_xgb)

OOF Accuracy Score -  0.7962125442595991
OOF F1 Score -  0.7899487480513191


In [11]:
# Sanity check: one row per test sample, one probability column per class.
pred_xgb.shape

(54320, 4)

In [12]:
# Most probable class for every test row; the +1 undoes the -1 shift that was
# applied to age_group before training.
final_preds = np.argmax(pred_xgb, axis=1) + 1

# Single-column frame holding the predicted labels.
pred_csv = pd.DataFrame({'prediction': final_preds.reshape(-1)})
pred_csv

Unnamed: 0,prediction
0,1
1,1
2,1
3,2
4,1
...,...
54315,1
54316,1
54317,4
54318,1


In [13]:
# Number of test rows assigned to each predicted label.
pred_csv.value_counts()

prediction
1             34422
2              8454
3              8327
4              3117
dtype: int64

In [14]:
# Write the predictions to submission.csv without the row index.
pred_csv.to_csv('submission.csv', index=False)

In [15]:
# Persist the out-of-fold and test probability matrices as .npy files
# (presumably for later blending with other models - TODO confirm).
np.save('oof_xgb.npy', oof_xgb)
np.save('pred_xgb.npy', pred_xgb)