In [1]:
import pandas as pd
import time
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Read the three competition CSVs (train set, test set, sample submission)
# in one pass; the paths are the Kaggle input locations.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/Cascade_cup/train_age_dataset.csv',
        '/kaggle/input/Cascade_cup/test_age_dataset.csv',
        '/kaggle/input/Cascade_cup/sample_submission.csv',
    )
)

In [3]:
# Keep an untouched copy of every training row, then restrict the modelling
# frame to rows whose 'creations' value is non-zero and renumber it from 0.
# NOTE: re-running this cell after `train` has been filtered/encoded would
# overwrite `full_train` with the already-modified frame.
full_train = train.copy()
train = train.loc[train['creations'].ne(0)].reset_index(drop=True)

In [4]:
# Preview the filtered training frame (the rich display truncates rows and columns).
train

Unnamed: 0.1,Unnamed: 0,userId,tier,gender,following_rate,followers_avg_age,following_avg_age,max_repetitive_punc,num_of_hashtags_per_action,emoji_count_per_action,...,content_views,num_of_comments,weekends_trails_watched_per_day,weekdays_trails_watched_per_day,slot1_trails_watched_per_day,slot2_trails_watched_per_day,slot3_trails_watched_per_day,slot4_trails_watched_per_day,avgt2,age_group
0,405231,51100441,2,2,0.000000,0.000,0.000000,0,0.0,0.0,...,0.093220,0.000000,0.012712,0.018644,0.000000,0.084746,0.000000,0.033898,82.500000,2
1,18167,32824244,1,1,0.000000,0.000,0.000000,0,0.0,0.0,...,0.078740,0.000000,0.000000,0.015748,0.070866,0.000000,0.007874,0.000000,128.200000,1
2,18705,33815341,1,2,0.000000,0.000,0.000000,0,0.0,0.0,...,0.047244,0.000000,0.031496,0.000000,0.000000,0.062992,0.000000,0.000000,184.781250,3
3,498266,7900978,2,1,0.013245,0.000,0.000000,0,0.0,0.0,...,1.096026,0.000000,0.000000,0.215232,0.195364,0.149007,0.125828,0.605960,155.562500,4
4,25008,36540948,1,1,0.007937,0.000,0.000000,0,0.0,0.0,...,0.047619,0.000000,0.000000,0.009524,0.000000,0.000000,0.000000,0.047619,345.333333,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240546,538317,37562999,2,1,0.008000,1.000,3.000000,0,0.0,0.0,...,0.080000,0.000000,0.032000,0.003200,0.000000,0.064000,0.008000,0.008000,266.666667,3
240547,26772,34997641,1,1,0.103175,1.875,2.333333,0,0.0,0.0,...,0.404762,0.000000,0.115079,0.066667,0.000000,0.261905,0.269841,0.031746,211.411765,3
240548,535266,38983571,2,2,0.000000,0.000,0.000000,0,0.0,0.0,...,0.064000,0.000000,0.012000,0.006400,0.000000,0.008000,0.000000,0.048000,195.000000,1
240549,36706,53264639,1,1,0.017241,0.000,0.000000,0,0.0,0.0,...,2.043103,0.017241,0.301724,0.339655,0.068966,0.922414,0.137931,1.172414,188.742857,4


In [5]:
# One-hot encode the categorical columns. The dtype is pinned to 'uint8'
# (the pandas 1.x default) because pandas 2.x emits bool dummies, which turn
# `DataFrame.values` on a mixed float/bool selection into an object array.
train = pd.get_dummies(train, columns=['tier', 'gender'], dtype='uint8')
test = pd.get_dummies(test, columns=['tier', 'gender'], dtype='uint8')

# The two frames are encoded independently, so a category level that occurs
# only in train would leave test without the matching dummy column and make
# a later `test[feature_cols]` selection fail. Add such columns as all-zero
# (i.e. "this level never occurs in test").
for _dummy_col in train.columns:
    if _dummy_col.startswith(('tier_', 'gender_')) and _dummy_col not in test.columns:
        test[_dummy_col] = 0

# Make the labels zero-based for training.
# REMEMBER TO ADD 1 back to any predicted class index before scoring against
# the original labels or writing a submission.
train['age_group'] = train['age_group'] - 1

In [6]:
# Columns that must never be fed to the model:
#   * 'age_group' - the target;
#   * 'kfold'     - the fold id added by the K-fold cell; listing it keeps this
#                   cell idempotent (re-running it after the folds exist would
#                   otherwise turn the fold id into a feature and break
#                   `test[feature_cols]`, since test has no such column);
#   * the remaining names - a hand-picked set of columns left out of the model.
# NOTE(review): identifier-like columns visible in the train preview
# ('Unnamed: 0', 'userId') are NOT excluded and are therefore used as
# features - confirm that this is intended.
_excluded_cols = {
    'age_group', 'kfold',
    'gender_2', 'tier_1', 'tier_3',
    'num_of_hashtags_per_action', 'avgDuration', 'avgCompletion',
}
feature_cols = [col for col in train.columns.tolist() if col not in _excluded_cols]
target_cols = ['age_group']

In [7]:
# KFOLD
# Assign every training row to one of 5 folds, stratified on the target and
# shuffled with a fixed seed so the split is reproducible.
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# `split` yields *positional* row numbers. Writing them into a NumPy array by
# position (instead of `train.loc[val, ...]`, which treats them as index
# labels) stays correct even if `train` does not carry a 0..n-1 index, and
# avoids the intermediate float/NaN column.
_fold_ids = np.full(len(train), -1, dtype=int)
for i, (trn, val) in enumerate(skf.split(train[target_cols], train[target_cols])):
    _fold_ids[val] = i
assert (_fold_ids >= 0).all(), "every row must be assigned to a validation fold"
train['kfold'] = _fold_ids

In [8]:
def run_training(n_estimators=500, reg_alpha=0.7, **xgb_params):
    """Train one XGBoost classifier per CV fold and collect class probabilities.

    Reads the notebook-level ``train`` and ``test`` frames together with
    ``feature_cols`` and ``target_cols``. ``train`` must already contain the
    integer ``kfold`` column and the (zero-based) target column.

    Parameters
    ----------
    n_estimators : int, default 500
        Forwarded to ``xgb.XGBClassifier``.
    reg_alpha : float, default 0.7
        Forwarded to ``xgb.XGBClassifier``.
    **xgb_params
        Any further keyword arguments for ``xgb.XGBClassifier``
        (e.g. ``reg_lambda``). With no extra arguments the model is built
        exactly as before.

    Returns
    -------
    oof : numpy.ndarray, shape (len(train), n_classes)
        For each training row, the probabilities predicted by the model of
        the fold in which that row was held out.
    pred : numpy.ndarray, shape (len(test), n_classes)
        Test-set probabilities averaged over all fold models.
    """
    # Derive the fold and class counts from the data instead of hard-coding
    # 5 and 4, so the averaging below cannot drift out of sync with the
    # K-fold split.
    fold_ids = np.sort(train['kfold'].unique())
    n_folds = len(fold_ids)
    n_classes = train[target_cols[0]].nunique()

    oof = np.zeros((train.shape[0], n_classes))
    pred = np.zeros((test.shape[0], n_classes))

    # The test matrix is the same for every fold: build it once.
    xtest = test[feature_cols].values

    for fold in fold_ids:

        print(f"\nStarting FOLD: {fold}")
        start = time.time()

        # Boolean row mask of the held-out fold (plain ndarray so it can index
        # both the DataFrame and the `oof` array positionally).
        val_mask = (train['kfold'] == fold).values
        trn = train.loc[~val_mask, :]
        val = train.loc[val_mask, :]

        xtr, ytr = trn[feature_cols].values, trn[target_cols].values.reshape(-1,)
        xval, yval = val[feature_cols].values, val[target_cols].values.reshape(-1,)

        model = xgb.XGBClassifier(n_estimators=n_estimators, reg_alpha=reg_alpha, **xgb_params)
        print("Fitting model ...")
        model.fit(xtr, ytr)

        print("Calculating training preds ...")
        train_preds = model.predict(xtr)
        print("Calculating validation preds ...")
        # Run validation inference once: the hard labels are the arg-max of
        # the same probabilities that are stored as out-of-fold predictions.
        val_proba = model.predict_proba(xval)
        val_preds = model.classes_[val_proba.argmax(axis=1)]
        print("Training Accuracy Score - ", accuracy_score(ytr, train_preds ))
        print("Training F1 Score - ", f1_score(ytr, train_preds, average='weighted'))

        print("Validation Accuracy Score - ", accuracy_score(yval, val_preds))
        print("Validation F1 Score - ", f1_score(yval, val_preds, average='weighted'))

        oof[val_mask, :] += val_proba

        # Test preds: each fold contributes an equal share of the average.
        pred += model.predict_proba(xtest) / n_folds

        print(f"FOLD {fold} completed in {time.time()-start} seconds")

    return oof, pred

In [9]:
oof_xgb, pred_xgb = run_training()   
# Experiment log kept by the author (scores in %). The first, unlabeled score
# on each line is presumably a validation score - TODO confirm. "OOF" and "LB"
# are labeled explicitly where recorded.
# NOTE(review): one entry below lists reg_lambda=0, but the call above passes
# no reg_lambda, so the library default applies - confirm which configuration
# produced the saved predictions.
#100 trees - 79.00
#150 trees - 79.41
#200 trees - 79.79
#300 trees - unregularized               - 80.17
#300 trees - reg_alpha=0.1, reg_lambda=2 - 80.23
#300 trees - reg_alpha=0.2, reg_lambda=5 - 80.24  -- OOF - 80.26
#300 trees - reg_alpha=1.5, reg_lambda=5 - 80.20
#400 trees - unregularized               - 80.33
#500 trees - unregularized               - 80.44
#500 trees - reg_alpha=0.5, reg_lambda=5 - 80.53  -- OOF - 80.56 -- LB 80.89 -- overfit to 1st fold
#500 trees - reg_alpha=0.8, reg_lambda=5 - 80.44  -- OOF - 80.53 -- LB 80.97
#500 trees - reg_alpha=0.7, reg_lambda=0 - 80.47 -- OOF - 80.54 -- LB - 81.01 ----> without two features - 81.05
#500 trees - reg_alpha=1.0, reg_lambda=10 - 80.32 -- OOF - 80.15 -- LB - 80.91


Starting FOLD: 0
Fitting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.8087144044897111
Training F1 Score -  0.8026784905338614
Validation Accuracy Score -  0.6197751034067053
Validation F1 Score -  0.6068390544876259
FOLD 0 completed in 367.08520913124084 seconds

Starting FOLD: 1
Fitting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.8115058641349816
Training F1 Score -  0.8057614929633286
Validation Accuracy Score -  0.6216171274163376
Validation F1 Score -  0.6094541584687254
FOLD 1 completed in 368.16090416908264 seconds

Starting FOLD: 2
Fitting model ...
Calculating training preds ...
Calculating validation preds ...
Training Accuracy Score -  0.8105757089185777
Training F1 Score -  0.8047027409971795
Validation Accuracy Score -  0.6255664103097069
Validation F1 Score -  0.6132913642086564
FOLD 2 completed in 366.4449577331543 seconds

Starting FOLD: 3
Fitting model 

In [11]:
# Expand the out-of-fold matrix back to every row of the unfiltered training
# frame: rows with non-zero 'creations' receive the model's probabilities,
# rows with zero 'creations' get probability 1.0 in column 0.
nz_idx = full_train['creations'] != 0
z_idx = ~nz_idx

full_oof = np.zeros((len(full_train), 4))
full_oof[nz_idx.values] = oof_xgb
full_oof[z_idx.values, 0] = 1.0

In [18]:
def find_oof_score(oof):
    """Print accuracy and weighted F1 for a matrix of out-of-fold class scores.

    The arg-max column of each row of ``oof``, plus one, is taken as the
    predicted label and compared against ``full_train[target_cols]``.
    Nothing is returned; both metrics are printed.
    """
    y_true = full_train[target_cols]
    y_hat = np.argmax(oof, axis=1) + 1  # +1: column index -> 1-based label

    print("OOF Accuracy Score - ", accuracy_score(y_true, y_hat))
    print("OOF F1 Score - ", f1_score(y_true, y_hat, average='weighted'))

In [19]:
find_oof_score(full_oof) # 80.26 was the OOF noted for an earlier configuration; the run recorded in this notebook printed ~0.814 accuracy

OOF Accuracy Score -  0.8139286569014292
OOF F1 Score -  0.8063223170728732


In [20]:
# Sanity check: one row per test sample, one column per class.
pred_xgb.shape

(54320, 4)

In [21]:
# Most probable class per test row; +1 turns the zero-based column index back
# into the 1-based label used in the data.
final_preds = np.argmax(pred_xgb, axis=1) + 1
pred_csv = pd.DataFrame({'prediction': final_preds.reshape(-1)})
pred_csv

Unnamed: 0,prediction
0,3
1,2
2,4
3,2
4,4
...,...
54315,3
54316,1
54317,4
54318,2


In [22]:
# Distribution of predicted classes straight from the model (before any override).
pred_csv.value_counts()

prediction
3             20682
4             13734
1             10051
2              9853
dtype: int64

In [24]:
# Test rows whose 'creations' value is zero are forced to class 1, mirroring
# how zero-'creations' training rows were excluded from model fitting.
ze_cr = test['creations'].eq(0)
pred_csv.loc[ze_cr, 'prediction'] = 1

In [25]:
# Distribution of predicted classes after the zero-'creations' override.
pred_csv.value_counts()

prediction
1             34900
2              8210
3              7810
4              3400
dtype: int64

In [26]:
# Write the predictions to disk as a single 'prediction' column without the index.
# NOTE(review): `sample` (the sample submission) is loaded but not consulted here - confirm the expected file format.
pred_csv.to_csv('trainedonnonzero.csv', index=False)

In [27]:
# Sanity check of the array shapes about to be saved.
full_oof.shape, pred_xgb.shape

((488877, 4), (54320, 4))

In [28]:
# Persist the probability matrices for later reuse.
# NOTE(review): full_oof has its zero-'creations' rows overwritten with a hard
# probability of 1.0 in column 0, whereas pred_xgb is saved without the
# equivalent override (only pred_csv received it) - confirm that downstream
# consumers expect this asymmetry.
np.save('oof_xgb.npy', full_oof)
np.save('pred_xgb.npy', pred_xgb)