In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cascade-cup-debye/Cascade_cup/sample_submission.csv
/kaggle/input/cascade-cup-debye/Cascade_cup/test_age_dataset.csv
/kaggle/input/cascade-cup-debye/Cascade_cup/train_age_dataset.csv


In [2]:
import pandas as pd
import time
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [3]:
df_train = pd.read_csv("../input/Cascade_cup/train_age_dataset.csv")
df_test = pd.read_csv("../input/Cascade_cup/test_age_dataset.csv")

In [4]:
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [5]:
cat_columns = ['tier','gender']

In [6]:
encoded_train = ohe.fit_transform(df_train[cat_columns])
encoded_test = ohe.transform(df_test[cat_columns])
encoded_df_train = pd.DataFrame(encoded_train,columns=ohe.get_feature_names(cat_columns))
encoded_df_train.index = df_train.index
encoded_df_test = pd.DataFrame(encoded_test,columns=ohe.get_feature_names(cat_columns))
encoded_df_test.index = df_test.index
train = pd.concat([df_train.drop(cat_columns,axis=1),encoded_df_train],axis=1)
test = pd.concat([df_test.drop(cat_columns,axis=1),encoded_df_test],axis=1)

In [7]:
feature_cols = [col for col in train.columns.tolist() if col not in ['Unnamed: 0','userId','age_group','tier_3','gender_1','avgComments','avgDuration','avgCompletion','punctuations_per_action','emoji_count_per_action','num_of_hashtags_per_action']]
target_cols = ['age_group']

In [8]:
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

for i, (trn, val) in enumerate(skf.split(train[target_cols], train[target_cols])):
    train.loc[val, 'kfold'] = i
train['kfold'] = train['kfold'].astype(int)

In [9]:
# A MODEL WITH 2 XGB CLASSIFIERS
class DoubleXGB():
    def __init__(self):
        self.mod1 = xgb.XGBClassifier(reg_alpha=10 ,random_state=42, 
                                  tree_method='gpu_hist', max_depth=9, 
                                  n_estimators=1000, learning_rate=0.02)
        self.mod2 = xgb.XGBClassifier(reg_alpha=10 ,random_state=48, 
                                  tree_method='gpu_hist', max_depth=9, 
                                  n_estimators=1000, learning_rate=0.02)
        
    def fit(self, x, y):
        self.mod1.fit(x, y)
        self.mod2.fit(x, y)
        
    def predict_proba(self, x):
        pred1 = self.mod1.predict_proba(x)
        pred2 = self.mod2.predict_proba(x)
        
        return (pred1+pred2)/2
    
    def predict(self, x):
        pred1 = self.mod1.predict_proba(x)
        pred2 = self.mod2.predict_proba(x)
        
        pred = (pred1+pred2)/2
        
        pred_classes = np.argmax(pred, axis=1)
        return pred_classes

In [10]:
def run_training():
    oof = np.zeros((train.shape[0], 4))
    pred = np.zeros((test.shape[0], 4))
    
    for fold in range(5):

        print(f"\nStarting FOLD: {fold}")
        start = time.time()
        

        trn_idx = train['kfold'] != fold
        val_idx = train['kfold'] == fold
        trn = train.loc[trn_idx, :]
        val = train.loc[val_idx, :]

        xtr, ytr = trn[feature_cols].values, trn[target_cols].values
        xval, yval = val[feature_cols].values, val[target_cols].values
        xtest = test[feature_cols].values

        model = DoubleXGB()
        
        model.fit(xtr, ytr.reshape(-1,))
        print("Training Accuracy Score - ", accuracy_score(ytr, model.predict(xtr)))
        print("Training F1 Score - ", f1_score(ytr, model.predict(xtr), average='weighted'))
        
        print("Validation Accuracy Score - ", accuracy_score(yval, model.predict(xval)))
        print("Validation F1 Score - ", f1_score(yval, model.predict(xval), average='weighted'))
        
        oof[val_idx, :] += model.predict_proba(xval)
        
        # Test preds
        pred += model.predict_proba(xtest)/5
        
        print(f"FOLD {fold} completed in {time.time()-start} seconds")
        
    return oof, pred

In [11]:
oof, pred = run_training()


Starting FOLD: 0
Training Accuracy Score -  0.0548042577237082
Training F1 Score -  0.07313845450317133
Validation Accuracy Score -  0.0686262477499591
Validation F1 Score -  0.09096046047656725
FOLD 0 completed in 185.84431219100952 seconds

Starting FOLD: 1
Training Accuracy Score -  0.05407810258731121
Training F1 Score -  0.07226104796422816
Validation Accuracy Score -  0.06753190967108492
Validation F1 Score -  0.08901472466349014
FOLD 1 completed in 181.6905014514923 seconds

Starting FOLD: 2
Training Accuracy Score -  0.05458678298755823
Training F1 Score -  0.07311564393275612
Validation Accuracy Score -  0.06837126054717464
Validation F1 Score -  0.09027097769725702
FOLD 2 completed in 185.16882991790771 seconds

Starting FOLD: 3
Training Accuracy Score -  0.053886198485305625
Training F1 Score -  0.07217003884145973
Validation Accuracy Score -  0.0676144208642291
Validation F1 Score -  0.08909477058640639
FOLD 3 completed in 187.36898159980774 seconds

Starting FOLD: 4
Train

In [12]:
def find_oof_score(oof):
    predictions = oof.argmax(axis=1)+1
    print("OOF Accuracy Score - ", accuracy_score(train[target_cols], predictions))
    print("OOF F1 Score - ", f1_score(train[target_cols], predictions, average='weighted'))

In [13]:
find_oof_score(oof)

OOF Accuracy Score -  0.7489716227190071
OOF F1 Score -  0.74824771829688


In [14]:
final_preds = pred.argmax(axis=1) + 1

In [15]:
pred_csv = pd.DataFrame(final_preds.reshape(-1), columns=['prediction'])
pred_csv.value_counts()

prediction
1             33981
2              8752
3              6609
4              4978
dtype: int64

In [16]:
pred_csv.to_csv('Submission5.csv', index=False)

In [17]:
np.save('oof_xgb.npy', oof)
np.save('pred_xgb.npy', pred)