In [1]:
import pandas as pd
import time
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils import class_weight

In [2]:
train = pd.read_csv('/kaggle/input/Cascade_cup/train_age_dataset.csv')
test = pd.read_csv('/kaggle/input/Cascade_cup/test_age_dataset.csv')
sample = pd.read_csv('/kaggle/input/Cascade_cup/sample_submission.csv')

In [3]:
train = pd.get_dummies(train, columns=['tier','gender'])
test = pd.get_dummies(test, columns=['tier','gender'])

# REMEMBER TO ADD 1 IN THE END

train['age_group'] = train['age_group']-1

In [4]:
# ,'tier_1','tier_3','num_of_hashtags_per_action','kfold'

In [5]:
train['zero creations'] = np.where(train.creations == 0, 1, 0)

In [6]:
test['zero creations'] = np.where(test.creations == 0, 1, 0)

In [7]:
feature_cols = [col for col in train.columns.tolist() if col not in ['age_group','gender_2','tier_1','tier_3','num_of_hashtags_per_action','creations','kfold','avgDuration','avgCompletion']]
target_cols = ['age_group']

In [8]:
# KFOLD
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

for i, (trn, val) in enumerate(skf.split(train[target_cols], train[target_cols])):
    train.loc[val, 'kfold'] = i
train['kfold'] = train['kfold'].astype(int)

In [9]:
class_weights = list(class_weight.compute_class_weight('balanced',
                                             np.unique(train['age_group']),
                                             train['age_group']))

1         1
2         0
3         0
4         0
         ..
488872    0
488873    3
488874    3
488875    0
488876    0
Name: age_group, Length: 488877, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [10]:
def run_training():
    oof = np.zeros((train.shape[0], 4))
    pred = np.zeros((test.shape[0], 4))
    
    for fold in range(5):
        
        print(f"\nStarting FOLD: {fold}")
        start = time.time()
        

        trn_idx = train['kfold'] != fold
        val_idx = train['kfold'] == fold
        trn = train.loc[trn_idx, :]
        val = train.loc[val_idx, :]

        xtr, ytr = trn[feature_cols].values, trn[target_cols].values
        xval, yval = val[feature_cols].values, val[target_cols].values
        xtest = test[feature_cols].values

        model = xgb.XGBClassifier(n_estimators = 500, reg_alpha=0.7)
        print("Fitting model ...")
        model.fit(xtr, ytr.reshape(-1,))
        
        print("Calculating training preds ...")
        train_preds = model.predict(xtr)
        print("Calculating validation preds ...")
        val_preds = model.predict(xval)
        print("Training Accuracy Score - ", accuracy_score(ytr, train_preds ))
        print("Training F1 Score - ", f1_score(ytr, train_preds, average='weighted'))
        
        print("Validation Accuracy Score - ", accuracy_score(yval, val_preds))
        print("Validation F1 Score - ", f1_score(yval, val_preds, average='weighted'))
        
        oof[val_idx, :] += model.predict_proba(xval)
        
        # Test preds
        pred += model.predict_proba(xtest)/5
        
        print(f"FOLD {fold} completed in {time.time()-start} seconds")
        
    return oof, pred

In [11]:
oof_xgb, pred_xgb = run_training()   
#100 trees - 79.00                                  
#150 trees - 79.41                                 
#200 trees - 79.79                                  
#300 trees - unregularized               - 80.17                   
#300 trees - reg_alpha=0.1, reg_lambda=2 - 80.23   
#300 trees - reg_alpha=0.2, reg_lambda=5 - 80.24  -- OOF - 80.26 
#300 trees - reg_alpha=1.5, reg_lambda=5 - 80.20   
#400 trees - unregularized               - 80.33
#500 trees - unregularized               - 80.44
#500 tress - reg_alpha=0.5, reg_lambda=5 - 80.53  -- OOF - 80.56 -- might be overfit 
#500 tress - reg_alpha=0.8, reg_lambda=5 - 80.44  -- OOF - 80.53
#500 trees - reg_alpha=0.7               - 80.47  -- OOF - 80.54 -- overfit
#500 trees - reg_alpha=0.7, reg_lambda=15 - 80.39 -- OOF - 80.44 -- overfit LB-80.86 
#500 trees - reg_alpha=5, max_depth=7    - 80.52  -- OOF - 80.57 -- overfit LB-80.915 


Starting FOLD: 0
Fitting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.9047279347278581
Training F1 Score -  0.9009538167099721
Validation Accuracy Score -  0.8141363933889707
Validation F1 Score -  0.8063147035796259
FOLD 0 completed in 678.6991500854492 seconds

Starting FOLD: 1
Fitting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.9039915520543287
Training F1 Score -  0.9002007581154589
Validation Accuracy Score -  0.8148114056619211
Validation F1 Score -  0.8072486591103194
FOLD 1 completed in 668.931955575943 seconds

Starting FOLD: 2
Fitting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.9045159574740094
Training F1 Score -  0.9007246495892937
Validation Accuracy Score -  0.815392482740987
Validation F1 Score -  0.807447003277219
FOLD 2 completed in 693.6492130756378 seconds

Starting FOLD: 3
Fitting model ...
C

In [12]:
def find_oof_score(oof):
    predictions = oof.argmax(axis=1)
    print("OOF Accuracy Score - ", accuracy_score(train[target_cols], predictions))
    print("OOF F1 Score - ", f1_score(train[target_cols], predictions, average='weighted'))

In [13]:
find_oof_score(oof_xgb) # 80.26

OOF Accuracy Score -  0.8144318509563755
OOF F1 Score -  0.8066411146358526


In [14]:
pred_xgb.shape

(54320, 4)

In [15]:
final_preds = pred_xgb.argmax(axis=1)+1
pred_csv = pd.DataFrame(final_preds.reshape(-1), columns=['prediction'] )
pred_csv

Unnamed: 0,prediction
0,1
1,1
2,1
3,3
4,1
...,...
54315,1
54316,1
54317,4
54318,1


In [16]:
pred_csv.value_counts()

prediction
1             34951
2              8175
3              7817
4              3377
dtype: int64

In [17]:
pred_csv.to_csv('newsub7.csv', index=False)

In [18]:
np.save('oof_xgb_8066_8100.npy', oof_xgb)
np.save('pred_xgb_8066_8100.npy', pred_xgb)