In [1]:
import pandas as pd
import time
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

In [10]:
# Read the three competition CSVs in one pass. The order of the paths matches
# the order of the names on the left: training set, test set, sample submission.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/Cascade_cup/train_age_dataset.csv',
        '/kaggle/input/Cascade_cup/test_age_dataset.csv',
        '/kaggle/input/Cascade_cup/sample_submission.csv',
    )
)

In [11]:
# One-hot encode the two categorical columns. train and test are encoded
# independently, so a category level that occurs in only one frame produces a
# dummy column in that frame only.
train = pd.get_dummies(train, columns=['tier','gender'])
test = pd.get_dummies(test, columns=['tier','gender'])

# Give test an all-zero column for every tier_*/gender_* dummy that exists only
# in train, so that selecting the training feature columns from test cannot
# fail with a KeyError. Nothing happens when both frames already agree.
missing_dummy_cols = [
    col for col in train.columns
    if str(col).startswith(('tier_', 'gender_')) and col not in test.columns
]
if missing_dummy_cols:
    test = test.reindex(columns=list(test.columns) + missing_dummy_cols, fill_value=0)

# REMEMBER TO ADD 1 IN THE END
# The target is shifted down by one here (presumably so the class labels start
# at 0 for the classifier - confirm); predictions must be shifted back by +1
# before they are submitted.
train['age_group'] = train['age_group']-1

In [12]:
# Columns that must never reach the model:
#   - 'age_group' is the target,
#   - 'kfold' is the fold-assignment helper column that a later cell adds to
#     `train`; excluding it here keeps this cell safe to re-run after the folds
#     have been assigned (it is absent from `test` and would leak fold ids),
#   - the remaining names are features dropped on purpose (rationale is not
#     recorded in this notebook - presumably feature-selection experiments).
excluded_cols = {
    'age_group', 'kfold',
    'gender_2', 'tier_1', 'tier_3',
    'num_of_hashtags_per_action', 'avgDuration', 'avgCompletion',
}
feature_cols = [col for col in train.columns.tolist() if col not in excluded_cols]
target_cols = ['age_group']

In [13]:
# KFOLD
# Assign every training row to one of 5 stratified folds (stratified on the
# target, shuffled with a fixed seed so the assignment is reproducible).
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# split() yields *positional* row indices, so the fold ids are written into a
# plain NumPy array rather than through label-based `train.loc`. This stays
# correct even if `train` does not have a default 0..n-1 index, and the column
# is integer from the start instead of passing through float/NaN.
fold_ids = np.full(len(train), -1, dtype=int)
for i, (trn, val) in enumerate(skf.split(train[target_cols], train[target_cols])):
    fold_ids[val] = i

# Every row must land in exactly one validation fold.
assert (fold_ids >= 0).all(), "some training rows were not assigned to a fold"
train['kfold'] = fold_ids

In [14]:
def run_training(n_folds=5, n_classes=4, model_params=None):
    """Train one XGBoost classifier per fold and collect OOF / test probabilities.

    Reads the notebook-level ``train``, ``test``, ``feature_cols`` and
    ``target_cols``. ``train`` must already carry an integer ``kfold`` column
    whose values are ``0 .. n_folds - 1``.

    Parameters
    ----------
    n_folds : int, default 5
        Number of folds to iterate over; also the divisor used to average the
        test predictions.
    n_classes : int, default 4
        Number of columns returned by ``predict_proba``.
    model_params : dict or None, default None
        Keyword arguments for ``xgb.XGBClassifier``. ``None`` uses
        ``n_estimators=500, reg_alpha=0.7``.

    Returns
    -------
    oof : numpy.ndarray of shape (len(train), n_classes)
        For each training row, the class probabilities predicted by the model
        of the fold in which that row was held out.
    pred : numpy.ndarray of shape (len(test), n_classes)
        Test-set class probabilities averaged over the fold models.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")
    if model_params is None:
        model_params = {'n_estimators': 500, 'reg_alpha': 0.7}

    oof = np.zeros((train.shape[0], n_classes))
    pred = np.zeros((test.shape[0], n_classes))

    # The test matrix and the fold ids do not change between folds: build once.
    xtest = test[feature_cols].values
    fold_of_row = train['kfold'].values

    for fold in range(n_folds):

        print(f"\nStarting FOLD: {fold}")
        start = time.time()

        # Boolean NumPy mask: usable both for DataFrame.loc and for `oof`.
        val_mask = fold_of_row == fold
        trn = train.loc[~val_mask, :]
        val = train.loc[val_mask, :]

        # Targets are flattened to 1-D so fit() and the metrics all receive
        # the same shape.
        xtr, ytr = trn[feature_cols].values, trn[target_cols].values.ravel()
        xval, yval = val[feature_cols].values, val[target_cols].values.ravel()

        model = xgb.XGBClassifier(**model_params)
        print("Fitting model ...")
        model.fit(xtr, ytr)

        print("Calculating training preds ...")
        train_preds = model.predict(xtr)
        print("Calculating validation preds ...")
        val_preds = model.predict(xval)
        print("Training Accuracy Score - ", accuracy_score(ytr, train_preds ))
        print("Training F1 Score - ", f1_score(ytr, train_preds, average='weighted'))

        print("Validation Accuracy Score - ", accuracy_score(yval, val_preds))
        print("Validation F1 Score - ", f1_score(yval, val_preds, average='weighted'))

        # Each row is held out in exactly one fold, so this fills its OOF row once.
        oof[val_mask, :] = model.predict_proba(xval)

        # Test preds: accumulate the mean over all fold models.
        pred += model.predict_proba(xtest) / n_folds

        print(f"FOLD {fold} completed in {time.time()-start} seconds")

    return oof, pred

In [15]:
oof_xgb, pred_xgb = run_training()   
# Experiment log. Each entry lists the model settings followed by scores in
# percent. "OOF" is the out-of-fold score; "LB" is presumably the competition
# leaderboard score. The metric behind the first, unlabeled number is not
# recorded here - TODO confirm.
# NOTE(review): run_training sets reg_alpha=0.7 and leaves reg_lambda at its
# default, while the entry below with reg_alpha=0.7 records reg_lambda=0 -
# confirm which configuration the saved predictions come from.
#100 trees - 79.00
#150 trees - 79.41
#200 trees - 79.79
#300 trees - unregularized               - 80.17
#300 trees - reg_alpha=0.1, reg_lambda=2 - 80.23
#300 trees - reg_alpha=0.2, reg_lambda=5 - 80.24  -- OOF - 80.26
#300 trees - reg_alpha=1.5, reg_lambda=5 - 80.20
#400 trees - unregularized               - 80.33
#500 trees - unregularized               - 80.44
#500 trees - reg_alpha=0.5, reg_lambda=5 - 80.53  -- OOF - 80.56 -- LB 80.89 -- overfit to 1st fold
#500 trees - reg_alpha=0.8, reg_lambda=5 - 80.44  -- OOF - 80.53 -- LB 80.97
#500 trees - reg_alpha=0.7, reg_lambda=0 - 80.47 -- OOF - 80.54 -- LB - 81.01 ----> without two features - 81.05
#500 trees - reg_alpha=1.0, reg_lambda=10 - 80.32 -- OOF - 80.15 -- LB - 80.91


Starting FOLD: 0
Fitting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.9065356519160013
Training F1 Score -  0.9028607210941959
Validation Accuracy Score -  0.8127556864670267
Validation F1 Score -  0.8054234769847355
FOLD 0 completed in 699.2805681228638 seconds

Starting FOLD: 1
Fitting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.9058222812010197
Training F1 Score -  0.9020828609940348
Validation Accuracy Score -  0.8144534446080838
Validation F1 Score -  0.806790732315977
FOLD 1 completed in 703.4214100837708 seconds

Starting FOLD: 2
Fitting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.9071239727743657
Training F1 Score -  0.9034735854958297
Validation Accuracy Score -  0.8144720020455126
Validation F1 Score -  0.8068212974898973
FOLD 2 completed in 713.6055400371552 seconds

Starting FOLD: 3
Fitting model ...

In [16]:
def find_oof_score(oof):
    """Print accuracy and weighted F1 of the out-of-fold predictions.

    ``oof`` holds one row of per-class scores for every training row. The
    predicted class of a row is the index of its largest score, and it is
    compared against the notebook-level ``train[target_cols]``. Nothing is
    returned; both scores are only printed.
    """
    y_true = train[target_cols]
    y_hat = oof.argmax(axis=1)

    oof_accuracy = accuracy_score(y_true, y_hat)
    print("OOF Accuracy Score - ", oof_accuracy)

    oof_f1 = f1_score(y_true, y_hat, average='weighted')
    print("OOF F1 Score - ", oof_f1)

In [17]:
# NOTE(review): 80.26 is the OOF figure recorded in the experiment log for the
# 300-tree (reg_alpha=0.2, reg_lambda=5) run; it is presumably stale for the
# current model settings - re-check after each run.
find_oof_score(oof_xgb) # 80.26

OOF Accuracy Score -  0.8139225203885639
OOF F1 Score -  0.8063599626965134


In [18]:
# Sanity check: the fold-averaged test probability matrix should be
# (number of test rows, number of classes).
pred_xgb.shape

(54320, 4)

In [19]:
# Pick the most probable class per test row. The target was shifted by -1
# before training, so adding 1 restores the original label values.
final_preds = 1 + pred_xgb.argmax(axis=1)

# Single-column frame that is written out as the submission file.
pred_csv = pd.DataFrame({'prediction': final_preds.ravel()})
pred_csv

Unnamed: 0,prediction
0,1
1,1
2,1
3,3
4,1
...,...
54315,1
54316,1
54317,4
54318,1


In [20]:
# Count how many test rows were assigned to each predicted class
# (quick check of the predicted class distribution).
pred_csv.value_counts()

prediction
1             34890
2              8212
3              7815
4              3403
dtype: int64

In [21]:
# Write the predictions to disk without the row index.
# NOTE(review): the file contains only the `prediction` column - confirm this
# matches the layout of the sample submission.
pred_csv.to_csv('featureremoved.csv', index=False)

In [22]:
# Persist the raw probability matrices (out-of-fold and fold-averaged test) as
# .npy files, presumably for later reuse such as blending - confirm.
np.save('oof_xgb.npy', oof_xgb)
np.save('pred_xgb.npy', pred_xgb)