In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import itertools

from sklearn.model_selection import train_test_split,StratifiedShuffleSplit,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score,precision_recall_curve,auc,roc_auc_score,\
roc_curve,recall_score,classification_report ,f1_score,precision_score

In [2]:
# Read the four raw CSV tables (submissions for train/test, problem metadata,
# user metadata) from the working directory, in that order.
train_df, test_df, problem_data_df, user_data_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train_submissions.csv",
        "test_submissions.csv",
        "problem_data.csv",
        "user_data.csv",
    )
)

## Problem_Data

In [3]:
def encode_tags(df):
    """Add one 0/1 indicator column per tag found in the comma-separated ``tags`` column.

    Missing ``tags`` values are replaced by the literal ``"UNKNOWN"`` (which therefore
    gets its own ``UNKNOWN_tag`` column). For every distinct tag ``t`` a column
    ``"<t>_tag"`` is added, holding 1 where the row's tag list contains ``t`` and 0
    otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tags`` column of comma-separated strings (or missing values).
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``tags`` filled and the indicator columns appended
        in sorted tag order.
    """
    # Assign back instead of using inplace=True on a column selection, which does
    # not reliably write through to the parent frame.
    df['tags'] = df['tags'].fillna("UNKNOWN")
    tag_lists = df['tags'].str.split(',')

    # Sort the distinct tags: iterating a set of strings directly gives a different
    # column order from one interpreter run to the next.
    unique_tags = sorted(set(itertools.chain.from_iterable(tag_lists)))

    # Fill each indicator column in one pass rather than writing single cells
    # through .loc inside an iterrows() loop.
    for tag in unique_tags:
        df[tag + "_tag"] = [int(tag in row_tags) for row_tags in tag_lists]

    return df

In [4]:
# Add the per-tag indicator columns to the problem table.
problem_data_df=encode_tags(problem_data_df)

### Handling missing values

#### Level types "I", "J", "K", "L", "M" and "N" never have a points value, so their points are assumed to be 0

In [5]:
# For level types I-N, overwrite points with 0; every other row keeps its
# current points value (including missing ones).
problem_data_df['points'] = problem_data_df['points'].mask(
    problem_data_df['level_type'].isin(['I','J','K','L','M','N']),
    0,
)

### For the remaining level types, missing points are replaced with the mean points of the same level type

In [6]:
# Mean points of each level_type, broadcast back onto the frame's own row index.
# transform() keeps that index, so the assignment below aligns on every pandas
# version; groupby(...).apply(...) returns a group-keyed MultiIndex on pandas >= 2.0
# and can no longer be assigned straight back to a column.
level_mean_points = problem_data_df.groupby('level_type')['points'].transform('mean')
problem_data_df['points'] = (
    problem_data_df['points']
    .fillna(level_mean_points)
    # Rows with a missing level_type belong to no group: their points are left
    # missing here, exactly as a groupby that drops missing keys would leave them.
    .where(problem_data_df['level_type'].notna())
)

#### Missing values in the level_type column are filled with a new category "unknown", and the points for those rows are set to 0

In [7]:
# Assign the filled columns back: fillna(inplace=True) on a column selection acts
# on an intermediate object and does not update the frame under pandas Copy-on-Write.
problem_data_df['level_type'] = problem_data_df['level_type'].fillna('unknown')
problem_data_df['points'] = problem_data_df['points'].fillna(0)

In [8]:
# Display the (rows, columns) shape of the problem table after cleaning.
problem_data_df.shape

(6544, 41)

## User_Data

In [9]:
# Ratio of problems solved to submissions made, per user.
user_data_df['conversion_rate'] = user_data_df['problem_solved'].div(
    user_data_df['submission_count']
)

In [10]:
# Convert the two epoch-second columns into datetime columns named "<prefix>_date".
for time_field in ('last_online_time', 'registration_time'):
    user_data_df[time_field + '_date'] = pd.to_datetime(
        user_data_df[time_field + '_seconds'], unit='s'
    )

In [11]:
# Days between registration and last online time, rounded to a whole number.
seconds_active = (
    user_data_df['last_online_time_seconds'] - user_data_df['registration_time_seconds']
)
user_data_df['days_active'] = (seconds_active / (24 * 60 * 60)).round()

In [12]:
# Assign the filled column back: fillna(inplace=True) on a column selection acts
# on an intermediate object and does not update the frame under pandas Copy-on-Write.
user_data_df['country'] = user_data_df['country'].fillna('unknown')

In [13]:
# Display the (rows, columns) shape of the user table after feature engineering.
user_data_df.shape

(3571, 15)

### Train and Test Data

In [14]:
# Tag each submission with its origin so the combined frame can be split again later.
train_df['source'] = 'train'
test_df['source'] = 'test'

# The two frames do not share all columns (only the train frame is later read for
# 'attempts_range'). sort=False keeps the existing column order and avoids the
# "sorting because non-concatenation axis is not aligned" FutureWarning.
full_df = pd.concat([train_df, test_df], sort=False)

# Inner joins (the pd.merge default, spelled out here): submissions whose user_id
# or problem_id is absent from the lookup tables are dropped.
full_df = pd.merge(full_df, user_data_df, on=['user_id'], how='inner')
full_df = pd.merge(full_df, problem_data_df, on=['problem_id'], how='inner')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
def prepare_data(df):
    """Split the merged frame into train/validation features and targets plus a test frame.

    Rows are routed by the ``source`` column ('train' / 'test'). Identifier, timestamp,
    ``country`` and raw ``tags`` columns are dropped, and ``level_type`` / ``rank`` are
    replaced by their ``pd.get_dummies`` encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged frame containing ``source``, ``attempts_range`` and the columns listed
        in ``dropped_cols`` below. It is not modified.

    Returns
    -------
    X_train, X_valid, y_train, y_valid
        70/30 split of the train rows (``random_state=0``).
    df_scaled_test : pandas.DataFrame
        Test rows holding exactly the training feature columns, in training order,
        followed by the ``attempts_range`` column (kept so callers that drop it
        before predicting keep working). Despite the name, no scaling is applied.
    """
    categorical_cols = ['level_type','rank']
    dropped_cols = ["source","ID",'rank','level_type','country','user_id','problem_id','last_online_time_seconds','registration_time_seconds','last_online_time_date','registration_time_date','tags']

    # Filter once per split instead of re-evaluating the boolean mask for every use.
    train_part = df[df['source']=='train']
    test_part = df[df['source']=='test']

    y = train_part['attempts_range']
    X = pd.concat(
        (train_part.drop(dropped_cols + ['attempts_range'], axis=1),
         pd.get_dummies(train_part[categorical_cols])),
        axis=1,
    )

    X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.3 ,random_state=0)

    test_features = pd.concat(
        (test_part.drop(dropped_cols + ['attempts_range'], axis=1),
         pd.get_dummies(test_part[categorical_cols])),
        axis=1,
    )
    # The two splits are one-hot encoded separately, so a category present in only
    # one of them would give the test frame a different set or order of columns than
    # the model was trained on. Align to the training columns: categories missing
    # from test become all-zero columns, categories unseen in training are dropped.
    test_features = test_features.reindex(columns=X.columns, fill_value=0)
    df_scaled_test = pd.concat((test_features, test_part[['attempts_range']]), axis=1)

    print(f"Shape of train feature data is: {X_train.shape}")
    print(f"Shape of train target data is: {y_train.shape}")
    print(f"Shape of valid feature data is: {X_valid.shape}")
    print(f"Shape of valid target data is: {y_valid.shape}")
    print(f"Shape of test data is: {df_scaled_test.shape}")

    return X_train,X_valid,y_train,y_valid, df_scaled_test
    

In [24]:
# Build the train/validation split and the test frame from the merged data.
X_train,X_valid,y_train,y_valid, df_scaled_test=prepare_data(full_df)

Shape of train feature data is: (108706, 65)
Shape of train target data is: (108706,)
Shape of valid feature data is: (46589, 65)
Shape of valid target data is: (46589,)
Shape of test data is: (66555, 66)


In [25]:
def RadomsearchCV(X,y,model,param_grid,cv,n_jobs=1,random_state=None):
    """Run a randomized hyper-parameter search and return the best parameter set.

    Candidates are scored with weighted F1 (``scoring='f1_weighted'``).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; passed to the search as ``X.values``.
    y : pandas.Series or pandas.DataFrame
        Target; passed to the search as ``y.values.ravel()``.
    model : estimator
        Estimator to tune.
    param_grid : dict
        Mapping of parameter name to the values to sample from.
    cv : int or cross-validation splitter
        Forwarded to ``RandomizedSearchCV`` as ``cv``.
    n_jobs : int, default 1
        Number of parallel jobs for the search. This value is now honoured;
        previously it was ignored and ``-1`` was always used.
    random_state : int or None, default None
        Seed for the candidate sampling; pass an int for a repeatable search.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        scoring='f1_weighted',
        cv=cv,
        verbose=2,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    search.fit(X.values, y.values.ravel())

    return search.best_params_

### RandomForest

In [74]:
# Candidate hyper-parameter values sampled by the random-forest search.
param_grid = dict(
    n_estimators=[100, 150, 200, 300, 400],
    bootstrap=[True, False],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    max_depth=[7, 8, 9, 10],
)

In [76]:
# Random search over the random-forest grid: cv=5, and -1 is passed as n_jobs.
# `params` receives the best parameter dict found.
params=RadomsearchCV(X_train,y_train,RandomForestClassifier(),param_grid,5,-1)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent workers.
[Parallel(n_jobs=-1)]: Done  21 out of  50 | elapsed:  1.8min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:  1.9min remaining:    7.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished


In [77]:
# Fit the forest with the tuned parameters. A fixed random_state makes the fitted
# model, and therefore the validation metrics, repeatable across runs; a
# random_state in `params` would take precedence over this default.
model = RandomForestClassifier(**{'random_state': 0, **params})
model.fit(X_train,y_train)
Prediction=model.predict(X_valid)

In [79]:
# Validation report for the random-forest model fitted above. The labels name the
# model actually being evaluated (they previously read "XGB Regression").
print("__"*50,"\n")
print('The accuracy of the Random Forest classifier is',metrics.accuracy_score(y_valid,Prediction))
print("__"*50,"\n")
print('The F1 score of the Random Forest classifier is',metrics.f1_score(y_valid,Prediction,average='weighted'))
print("__"*50,"\n")
print(classification_report(y_valid,Prediction))
print("__"*50)

____________________________________________________________________________________________________ 

The accuracy of the XGB Regression is 0.5366717465496147
____________________________________________________________________________________________________ 

The F1 score of the XGB Regression is 0.39384513199576315
____________________________________________________________________________________________________ 

              precision    recall  f1-score   support

         1.0       0.54      0.98      0.70     24868
         2.0       0.35      0.04      0.07     14091
         3.0       0.00      0.00      0.00      4331
         4.0       0.00      0.00      0.00      1619
         5.0       0.00      0.00      0.00       750
         6.0       0.00      0.00      0.00       930

   micro avg       0.54      0.54      0.54     46589
   macro avg       0.15      0.17      0.13     46589
weighted avg       0.40      0.54      0.39     46589

_________________________________

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### XGBoost

In [26]:
#import xgboost as xgb
from xgboost import XGBClassifier

# Candidate hyper-parameter values sampled by the XGBoost search.
param_grid = dict(
    n_estimators=[150, 200, 300],
    min_child_weight=[3, 4],
    gamma=[0],
    subsample=[0.7, 0.8],
    colsample_bytree=[0.7, 0.8],
    max_depth=[7, 8],
    learning_rate=[0.05, 0.1],
)

In [27]:
# Random search over the XGBoost grid: cv=5, and -1 is passed as n_jobs.
# `params` receives the best parameter dict found.
params=RadomsearchCV(X_train,y_train,XGBClassifier(),param_grid,5,-1)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 80 concurrent workers.
[Parallel(n_jobs=-1)]: Done  21 out of  50 | elapsed:  8.3min remaining: 11.5min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 13.8min remaining:   53.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 14.2min finished


In [None]:
# Train XGBoost with the tuned parameters, then predict the validation labels.
model = XGBClassifier(**params).fit(X_train, y_train)
Prediction = model.predict(X_valid)

In [32]:
# Validation report for the XGBoost model fitted above. It is a classifier, so the
# labels say so (they previously read "XGB Regression").
print("__"*50,"\n")
print('The accuracy of the XGBoost classifier is',metrics.accuracy_score(y_valid,Prediction))
print("__"*50,"\n")
print('The F1 score of the XGBoost classifier is',metrics.f1_score(y_valid,Prediction,average='weighted'))
print("__"*50,"\n")
print(classification_report(y_valid,Prediction))
print("__"*50)


____________________________________________________________________________________________________ 

The accuracy of the XGB Regression is 0.5413509626735925
____________________________________________________________________________________________________ 

The F1 score of the XGB Regression is 0.4573480004884017
____________________________________________________________________________________________________ 

              precision    recall  f1-score   support

         1.0       0.58      0.90      0.70     24868
         2.0       0.37      0.20      0.26     14091
         3.0       0.23      0.01      0.01      4331
         4.0       0.25      0.00      0.01      1619
         5.0       0.25      0.00      0.01       750
         6.0       0.31      0.02      0.05       930

   micro avg       0.54      0.54      0.54     46589
   macro avg       0.33      0.19      0.17     46589
weighted avg       0.46      0.54      0.46     46589

__________________________________

In [35]:
# Predict on the test features (the placeholder label column is dropped first).
test_predictions = model.predict(df_scaled_test.drop('attempts_range',axis=1))

# df_scaled_test is built from the source == 'test' rows of full_df, i.e. after two
# inner merges, so its rows are not guaranteed to match test_df in order or count.
# Attach the predictions by ID rather than by position; a test row that did not
# survive the merges gets a missing prediction instead of someone else's.
# NOTE(review): assumes ID is unique per submission - confirm.
test_ids = full_df.loc[full_df['source']=='test', 'ID']
test_df['attempts_range'] = test_df['ID'].map(
    pd.Series(test_predictions, index=test_ids.values)
)

In [37]:
# Write the ID and predicted attempts_range columns to the submission CSV, without the index.
test_df[['ID','attempts_range']].to_csv("XGBoost_tuned_Submission.csv",index=False)