***Importing required libraries***

In [None]:
! pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import imblearn
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from numpy import sort
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

***Reading the csv files***

In [None]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so columns that mix numbers with markers such as
# "*" get one consistent dtype and no DtypeWarning is raised.
df = pd.read_csv("2022_Competition_Training.csv", low_memory=False)


  exec(code_obj, self.user_global_ns, self.user_ns)


***Handling Null values***

In [None]:
# Names of every object-dtype (string) column in the raw frame.
categorical_cols = df.select_dtypes(include='object').columns

In [None]:
df[categorical_cols].isna().sum()

cons_mobplus             11150
cms_ra_factor_type_cd     2461
cons_homstat             11159
sex_cd                       0
lang_spoken_cd           18936
rucc_category                0
cms_race_cd                  0
dtype: int64

In [None]:
# Replace missing categorical codes with a fixed fallback value per column.
fill_values = {
    'lang_spoken_cd': 'ENG',
    'cons_mobplus': 'U',
    'cms_ra_factor_type_cd': 'CN',
    'cons_homstat': 'U',
}
for column, fallback in fill_values.items():
    df[column] = df[column].fillna(fallback)

In [None]:
df.shape

(48300, 881)

In [None]:
# Keep only columns that are at least 95% populated. The threshold is derived
# from the row count (95% of the 48,300 training rows is 45,885, the value that
# used to be hard-coded) so the rule still holds if the file changes size.
MIN_NON_NULL_PCT = 95
df1 = df.dropna(thresh=len(df) * MIN_NON_NULL_PCT // 100, axis=1)

***Some more data cleaning***

In [None]:
# Remove every row that still contains a missing value, then discard `id`.
df1 = df1.dropna(axis=0).drop(columns=['id'])

In [None]:
# Rows where any of these three codes is the "*" placeholder are removed.
is_masked = (
    (df1.cms_race_cd == "*")
    | (df1.lang_spoken_cd == "*")
    | (df1.cms_ra_factor_type_cd == "*")
)
df1 = df1[~is_masked]
# With the placeholder gone cms_race_cd is cast to int, so it no longer
# appears among the object-dtype columns collected below.
df1 = df1.astype({"cms_race_cd": int})
categorical_cols = df1.select_dtypes(include=['object']).columns

***Splitting the dataset into features and labels***

In [None]:
# `hi_flag` is the label; every other remaining column is a candidate feature.
y1 = df1['hi_flag']
X1 = df1.drop(columns=['hi_flag'])

In [None]:
# Renumber rows 0..n-1 on both frames after the row drops above.
X1, y1 = X1.reset_index(drop=True), y1.reset_index(drop=True)

***Dealing with the categorical variables***

In [None]:
categorical_cols

Index(['cons_mobplus', 'cms_ra_factor_type_cd', 'cons_homstat', 'sex_cd',
       'lang_spoken_cd', 'rucc_category'],
      dtype='object')

In [None]:
# One-hot encode the categorical columns; all other columns pass through as-is.
X1 = pd.get_dummies(data=X1, columns=categorical_cols)

***Automatic feature selection***

In [None]:
from sklearn.feature_selection import SelectFromModel

# Score features with a shallow random forest and keep the ones that pass
# SelectFromModel's default threshold (no explicit threshold is given).
# NOTE(review): the selector is fitted on all rows before cross-validation, so
# the CV scores further down were computed on features chosen with the test
# folds in view — confirm this is acceptable.
selector_forest = RandomForestClassifier(
    n_estimators=175,
    max_depth=7,
    min_samples_leaf=25,
    n_jobs=-1,
    bootstrap=True,
    random_state=42,
)
sel = SelectFromModel(selector_forest)
sel.fit(X1, y1)
selected_feat = X1.columns[sel.get_support()]
len(selected_feat)

185

In [None]:
# Selected feature names as a plain, alphabetically sorted list.
listcols = sorted(selected_feat)
listcols

['atlas_age65andolderpct2010',
 'atlas_orchard_farms12',
 'atlas_pct_diabetes_adults13',
 'atlas_totalocchu',
 'atlas_totalpopacs',
 'atlas_totalpopest2016',
 'bh_agad_pmpm_ct',
 'bh_cdto_pmpm_ct',
 'bh_dema_pmpm_ct',
 'cci_cpd_pmpm_ct',
 'cci_score',
 'cms_disabled_ind',
 'cms_dual_eligible_ind',
 'cms_low_income_ind',
 'cms_orig_reas_entitle_cd',
 'cms_ra_factor_type_cd_CF',
 'cms_ra_factor_type_cd_CN',
 'cms_ra_factor_type_cd_CP',
 'cms_race_cd',
 'cmsd1_bld_pmpm_ct',
 'cmsd1_cir_pmpm_ct',
 'cmsd1_end_pmpm_ct',
 'cmsd1_eye_pmpm_ct',
 'cmsd1_gus_pmpm_ct',
 'cmsd1_inj_pmpm_ct',
 'cmsd1_men_pmpm_ct',
 'cmsd1_mus_pmpm_ct',
 'cmsd1_ner_pmpm_ct',
 'cmsd1_res_pmpm_ct',
 'cmsd1_sns_pmpm_ct',
 'cmsd1_vco_pmpm_ct',
 'cmsd2_bld_hemorrhagic_pmpm_ct',
 'cmsd2_cir_hypertensive_pmpm_ct',
 'cmsd2_cir_other_heart_disease_pmpm_ct',
 'cmsd2_end_dm_pmpm_ct',
 'cmsd2_end_metabolic_pmpm_ct',
 'cmsd2_end_obese_pmpm_ct',
 'cmsd2_gus_kidney_pmpm_ct',
 'cmsd2_men_mad_ind',
 'cmsd2_men_mad_pmpm_ct',
 'cmsd2_m

***Feature engineering***

In [None]:
# Restrict X1 to the selected features. The explicit copy makes X1 an
# independent frame, so the column assignments in the following cells do not
# trigger pandas' SettingWithCopyWarning.
X1 = X1[listcols].copy()

In [None]:
# Ratio of atlas_totalpopacs to atlas_totalocchu
# (presumably population per occupied housing unit — confirm against the data dictionary).
X1['avg_people_per_home'] = X1['atlas_totalpopacs'].div(X1['atlas_totalocchu'])

In [None]:
# Raw columns superseded by engineered features. Later cells add to this list
# and everything in it is dropped in one step at the end.
cols_drop = [
    'atlas_totalpopacs',
    'atlas_totalocchu',
]

In [None]:
import re

# Selected columns whose name starts with "bh" and has "ct" somewhere after it.
p = re.compile('bh.*ct')
l1 = list(filter(p.match, listcols))

In [None]:
# Queue the individual bh columns for removal.
cols_drop.extend(l1)

In [None]:
# Row-wise total over all matched bh columns.
X1['total_bh_claims_mnt'] = X1.loc[:, l1].sum(axis='columns')

In [None]:
# Columns starting with "cmsd1" (with "ct" later in the name) are summed per
# row into one feature; the originals are queued for removal.
p = re.compile('cmsd1.*ct')
l3 = list(filter(p.match, listcols))
cols_drop.extend(l3)
X1['cmsd1_claims_ct'] = X1.loc[:, l3].sum(axis='columns')

In [None]:
# Columns starting with "cmsd2" (with "ct" later in the name) are summed per
# row into one feature; the originals are queued for removal.
p = re.compile('cmsd2.*ct')
l4 = list(filter(p.match, listcols))
cols_drop.extend(l4)
X1['cmsd2_claims_ct'] = X1.loc[:, l4].sum(axis='columns')

In [None]:
# Columns containing "emails_" followed by a digit are queued for removal.
p = re.compile('.*emails_[0-9][01]?')
l5 = list(filter(p.match, listcols))
cols_drop.extend(l5)

In [None]:
# Columns containing "print_" followed by a digit are queued for removal.
p = re.compile('.*print_[0-9][01]?')
l6 = list(filter(p.match, listcols))
cols_drop.extend(l6)

In [None]:
# Columns containing "vat_" followed by a digit are queued for removal.
p = re.compile('.*vat_[0-9][01]?')
l7 = list(filter(p.match, listcols))
cols_drop.extend(l7)

In [None]:
# Columns containing "webstatement_" followed by a digit are queued for removal.
p = re.compile('.*webstatement_[0-9][01]?')
l11 = list(filter(p.match, listcols))
cols_drop.extend(l11)

In [None]:
# Columns starting with "med" (with "clm" later in the name) are averaged per
# row: row sum divided by the number of matched columns.
p = re.compile('med.*clm')
l8 = list(filter(p.match, listcols))
cols_drop.extend(l8)
X1['med_claims_avg_days'] = X1.loc[:, l8].sum(axis='columns') / len(l8)

In [None]:
# Columns starting with "rev" (with "ct" later in the name) are averaged per
# row: row sum divided by the number of matched columns.
p = re.compile('rev.*ct')
l9 = list(filter(p.match, listcols))
cols_drop.extend(l9)
X1['avg_rev_ct'] = X1.loc[:, l9].sum(axis='columns') / len(l9)

In [None]:
# Element-wise sum of the four rx tier count columns.
X1['rx_tier'] = (
    X1['rx_tier_1_pmpm_ct']
    .add(X1['rx_tier_2_pmpm_ct'])
    .add(X1['rx_tier_3_pmpm_ct'])
    .add(X1['rx_tier_4_pmpm_ct'])
)

In [None]:
# The individual tier columns are replaced by `rx_tier`, so queue them for removal.
cols_drop.extend([
    'rx_tier_1_pmpm_ct',
    'rx_tier_2_pmpm_ct',
    'rx_tier_3_pmpm_ct',
    'rx_tier_4_pmpm_ct',
])

In [None]:
# Remove every raw column queued in cols_drop, then display the final feature frame.
X1 = X1.drop(columns=cols_drop)
X1

Unnamed: 0,atlas_age65andolderpct2010,atlas_orchard_farms12,atlas_pct_diabetes_adults13,atlas_totalpopest2016,cci_cpd_pmpm_ct,cci_score,cms_disabled_ind,cms_dual_eligible_ind,cms_low_income_ind,cms_orig_reas_entitle_cd,...,total_physician_office_allowed_pmpm_cost,total_physician_office_ds_clm,total_physician_office_visit_ct_pmpm,avg_people_per_home,total_bh_claims_mnt,cmsd1_claims_ct,cmsd2_claims_ct,med_claims_avg_days,avg_rev_ct,rx_tier
0,17.13,11.0,15.9,36913.0,0.00,4.0,0,0,0,0.0,...,54.46,3,0.33,2.372964,0.00,0.82,0.56,357.666667,0.000000,2.32
1,25.76,58.0,13.4,182835.0,0.00,6.0,0,0,1,0.0,...,175.12,29,1.33,2.492978,0.83,7.75,6.85,188.000000,0.996667,4.57
2,12.76,48.0,11.1,396484.0,0.08,8.0,0,0,0,0.0,...,153.46,85,1.33,2.522333,0.08,7.22,3.71,226.666667,0.000000,2.90
3,13.32,9.0,9.9,269141.0,0.00,2.0,0,1,1,0.0,...,15.47,23,0.08,2.654108,0.00,2.05,1.23,173.333333,0.136667,1.33
4,16.34,14.0,11.7,17983.0,0.00,3.0,1,1,1,1.0,...,27.33,177,0.25,3.406780,0.00,2.40,2.46,246.666667,0.000000,9.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42015,13.13,2.0,12.7,26622.0,0.00,5.0,0,0,0,0.0,...,77.67,63,0.58,36.970752,0.00,2.72,2.44,225.666667,0.166667,1.49
42016,13.50,5.0,15.6,9251.0,0.00,3.0,1,0,0,1.0,...,43.84,44,0.51,1.371897,0.00,1.90,2.50,237.666667,0.053333,1.24
42017,8.14,44.0,8.5,4589928.0,0.00,2.0,1,0,0,1.0,...,59.36,48,0.58,2.886399,0.00,1.08,0.48,242.000000,0.000000,1.73
42018,23.30,23.0,12.8,205249.0,0.00,9.0,0,0,0,0.0,...,60.00,60,0.33,2.478897,0.08,2.47,1.96,204.333333,0.000000,3.48


***RF model***

In [None]:
# 10-fold stratified cross-validation of the random forest, scored by ROC AUC.
classifier1 = RandomForestClassifier(
    n_estimators=160,
    max_depth=20,
    min_samples_leaf=50,
    n_jobs=-1,
    bootstrap=True,
    random_state=42,
)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratified = []  # one ROC AUC value per fold
for train_index, test_index in skf.split(X1, y1):
    # split() yields positional indices, so select positionally on X and y alike.
    X_train = X1.iloc[train_index, :]
    y_train = y1.iloc[train_index]
    X_test = X1.iloc[test_index, :]
    y_test = y1.iloc[test_index]
    classifier1.fit(X_train, y_train)
    preds = classifier1.predict_proba(X_test)
    # Column 1 is the probability of the second class in classifier1.classes_.
    lst_accu_stratified.append(roc_auc_score(y_test, preds[:, 1]))
# np.mean is public NumPy API; numpy.lib.function_base is a private module
# path (made private in NumPy 2.0) and should not be imported from.
score1 = np.mean(lst_accu_stratified)

In [None]:
score1

0.7246851864731279

In [None]:
lst_accu_stratified

[0.7474804505135387,
 0.7463089402427638,
 0.7216123949579831,
 0.7112686741363212,
 0.7153653127917834,
 0.7361928104575164,
 0.7275049603174603,
 0.6907635416167973,
 0.7170699012477859,
 0.7332848784493277]

***XGBoost model***

In [None]:
# 10-fold stratified cross-validation of the XGBoost model, scored by ROC AUC.
classifier2 = XGBClassifier(
    n_estimators=180,
    max_depth=1,
    learning_rate=0.18,
    tree_method="hist",
    colsample_bytree=0.6,
    scale_pos_weight=5,
    n_jobs=-1,
)
# Fixed seed so the shuffled folds, and therefore the reported score, are
# reproducible — the same seed the random-forest cell uses.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratified = []  # one ROC AUC value per fold
for train_index, test_index in skf.split(X1, y1):
    # split() yields positional indices, so select positionally on X and y alike.
    X_train = X1.iloc[train_index, :]
    y_train = y1.iloc[train_index]
    X_test = X1.iloc[test_index, :]
    y_test = y1.iloc[test_index]
    classifier2.fit(X_train, y_train)
    preds = classifier2.predict_proba(X_test)
    lst_accu_stratified.append(roc_auc_score(y_test, preds[:, 1]))
# np.mean is public NumPy API; numpy.lib.function_base is a private module
# path (made private in NumPy 2.0) and should not be imported from.
score2 = np.mean(lst_accu_stratified)
score2

0.7318358533140241

In [None]:
lst_accu_stratified

[0.7407548436041084,
 0.7265931372549019,
 0.7485629668534081,
 0.7281177054154996,
 0.7744499883286647,
 0.7463702147525677,
 0.7101978291316526,
 0.6941205656164724,
 0.7076386078070394,
 0.7415526743759258]

***LightGBM model***

In [None]:
import lightgbm as lgb

# 10-fold stratified cross-validation of the LightGBM model, scored by ROC AUC.
classifier3 = lgb.LGBMClassifier(
    n_estimators=225,
    learning_rate=0.25,
    max_depth=1,
    min_child_samples=20,
    n_jobs=-1,
)
# Fixed seed so the shuffled folds, and therefore the reported score, are
# reproducible — the same seed the random-forest cell uses.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratified = []  # one ROC AUC value per fold
for train_index, test_index in skf.split(X1, y1):
    # split() yields positional indices, so select positionally on X and y alike.
    X_train = X1.iloc[train_index, :]
    y_train = y1.iloc[train_index]
    X_test = X1.iloc[test_index, :]
    y_test = y1.iloc[test_index]
    classifier3.fit(X_train, y_train)
    preds = classifier3.predict_proba(X_test)
    lst_accu_stratified.append(roc_auc_score(y_test, preds[:, 1]))
# np.mean is public NumPy API; numpy.lib.function_base is a private module
# path (made private in NumPy 2.0) and should not be imported from.
score3 = np.mean(lst_accu_stratified)
score3

0.7307429544476178

In [None]:
lst_accu_stratified

[0.7364772992530345,
 0.7439892623716153,
 0.753968253968254,
 0.7144286881419234,
 0.7346011321195144,
 0.7057233309990663,
 0.7316336951447245,
 0.7380056027773062,
 0.7331789740621295,
 0.7154233056386107]

***AdaBoost model***

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# 10-fold stratified cross-validation of the AdaBoost model, scored by ROC AUC.
classifier4 = AdaBoostClassifier(n_estimators=175, random_state=0)
# Fixed seed so the shuffled folds, and therefore the reported score, are
# reproducible — the same seed the random-forest cell uses.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratified = []  # one ROC AUC value per fold
for train_index, test_index in skf.split(X1, y1):
    # split() yields positional indices, so select positionally on X and y alike.
    X_train = X1.iloc[train_index, :]
    y_train = y1.iloc[train_index]
    X_test = X1.iloc[test_index, :]
    y_test = y1.iloc[test_index]
    classifier4.fit(X_train, y_train)
    preds = classifier4.predict_proba(X_test)
    lst_accu_stratified.append(roc_auc_score(y_test, preds[:, 1]))
# np.mean is public NumPy API; numpy.lib.function_base is a private module
# path (made private in NumPy 2.0) and should not be imported from.
score4 = np.mean(lst_accu_stratified)
score4

0.6998200071935041

***Ensemble***

In [None]:
models = [('Rf',classifier1),('Xgb',classifier2),('Lgb',classifier3),('adaboost',classifier4)]
scores = [score1, score2, score3, score4]

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the four models, weighted by the values in `scores`,
# evaluated with 10-fold stratified cross-validation (ROC AUC).
ensemble = VotingClassifier(estimators=models, voting='soft', weights=scores)
# Fixed seed so the shuffled folds, and therefore the reported score, are
# reproducible — the same seed the random-forest cell uses.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratified = []  # one ROC AUC value per fold
for train_index, test_index in skf.split(X1, y1):
    # split() yields positional indices, so select positionally on X and y alike.
    X_train = X1.iloc[train_index, :]
    y_train = y1.iloc[train_index]
    X_test = X1.iloc[test_index, :]
    y_test = y1.iloc[test_index]
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict_proba(X_test)
    lst_accu_stratified.append(roc_auc_score(y_test, preds[:, 1]))
# Stored under its own name so that `score4` keeps holding the AdaBoost score
# that the `scores` weight list is built from.
# np.mean is public NumPy API; numpy.lib.function_base is a private module path.
score_ensemble = np.mean(lst_accu_stratified)
score_ensemble

0.7327508346354643

In [None]:
lst_accu_stratified

[0.7323140055236753,
 0.72657049508403,
 0.7278642378667394,
 0.738777665754477,
 0.744958319751561,
 0.7265347694344716,
 0.7207421262827967,
 0.7356099287194783,
 0.7158055710024771,
 0.7583312269349376]

***Scoring on holdout set***

In [None]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so columns that mix numbers with the "*" marker
# (filtered out below for cms_race_cd) get one consistent dtype.
df_hold = pd.read_csv("2022_Competition_Holdout.csv", low_memory=False)

FileNotFoundError: ignored

In [None]:
df_hold.shape

In [None]:
# Apply the training-time cleaning and feature engineering to the holdout set.

# Same fallback values as used for the training frame.
for col, fallback in {
    'lang_spoken_cd': 'ENG',
    'cons_mobplus': 'U',
    'cms_ra_factor_type_cd': 'CN',
    'cons_homstat': 'U',
}.items():
    df_hold[col] = df_hold[col].fillna(fallback)

# NOTE(review): rows carrying the "*" placeholder are removed, so those ids
# receive no score in the submission — confirm this is acceptable.
df_hold = df_hold[df_hold.lang_spoken_cd != "*"]
df_hold = df_hold[df_hold.cms_ra_factor_type_cd != "*"]
df_hold = df_hold[df_hold.cms_race_cd != "*"]
df_hold = df_hold.astype({"cms_race_cd": int})
df_hold = pd.get_dummies(df_hold, columns=categorical_cols)

# A category that never occurs in the holdout file produces no one-hot column;
# add such columns as all zeros so the selection below cannot raise KeyError.
# Only one-hot columns are back-filled: a missing raw column still fails loudly.
dummy_prefixes = tuple(f"{c}_" for c in categorical_cols)
for col in listcols:
    if col not in df_hold.columns and col.startswith(dummy_prefixes):
        df_hold[col] = 0

# Build the column list locally rather than appending "id" to the shared
# `listcols`, so re-running cells does not corrupt the training feature list.
# The copy gives an independent frame for the assignments that follow.
holdout_cols = [c for c in listcols if c != "id"] + ["id"]
df_hold = df_hold[holdout_cols].copy()

# Engineered features, in the same order as for the training frame.
df_hold['avg_people_per_home'] = df_hold['atlas_totalpopacs'] / df_hold['atlas_totalocchu']
df_hold['total_bh_claims_mnt'] = df_hold[l1].sum(axis=1)
df_hold['cmsd1_claims_ct'] = df_hold[l3].sum(axis=1)
df_hold['cmsd2_claims_ct'] = df_hold[l4].sum(axis=1)
df_hold['med_claims_avg_days'] = df_hold[l8].sum(axis=1) / len(l8)
df_hold['avg_rev_ct'] = df_hold[l9].sum(axis=1) / len(l9)
df_hold['rx_tier'] = (
    df_hold['rx_tier_1_pmpm_ct']
    + df_hold['rx_tier_2_pmpm_ct']
    + df_hold['rx_tier_3_pmpm_ct']
    + df_hold['rx_tier_4_pmpm_ct']
)
df_hold = df_hold.drop(columns=cols_drop)
# NOTE(review): rows with any remaining missing value are removed here as
# well, so those ids are also absent from the submission — confirm.
df_hold = df_hold.dropna()

In [None]:
# Keep the ids for the submission file and hand only the model features on.
# The non-mutating drop leaves df_hold untouched, so this cell can be re-run
# without a KeyError on the already-removed "id" column.
ids = df_hold["id"]
df6 = df_hold.drop(columns="id")

In [None]:
# Refit on every training row before scoring: after the cross-validation loop
# the ensemble is only fitted on the training part of the last fold.
ensemble.fit(X1, y1)

# Probability of the second class for each holdout row. Stored under its own
# name so the random-forest CV score in `score1` is not overwritten.
holdout_scores = ensemble.predict_proba(df6)[:, 1]
# to_numpy() pairs ids and scores by position and gives the frame a fresh 0..n-1 index.
df_submit = pd.DataFrame({"id": ids.to_numpy(), "score": holdout_scores})
# Rank 1 is the highest score; ties share their average rank (pandas default).
df_submit['Rank'] = df_submit['score'].rank(ascending=False)
# NOTE(review): this is an absolute Colab path, and the row index is written
# as an unnamed first column — confirm both match the required submission format.
df_submit.to_csv("/content/2022CaseCompetition_Shubhi_Gupta_20221016.csv")

***Feature Importance***

In [None]:
def compute_feature_importance(VotingClassifier, weights):
    """Combine the feature importances of a fitted voting ensemble.

    Each fitted sub-estimator's ``feature_importances_`` vector is rescaled to
    sum to 1, multiplied by that estimator's weight, and the weighted vectors
    are added up. The rescaling keeps estimators that report raw counts
    (e.g. LightGBM's default split counts) from dominating estimators that
    report fractions; vectors that already sum to 1 are unchanged.

    Parameters
    ----------
    VotingClassifier : object
        A *fitted* ensemble instance exposing ``estimators_`` — not the class.
        The parameter name is kept for backward compatibility.
    weights : sequence of float
        One weight per entry of ``estimators_``, in the same order.

    Returns
    -------
    list of float
        One combined score per feature; an empty list if the ensemble has no
        fitted estimators.

    Raises
    ------
    IndexError
        If ``weights`` has fewer entries than there are estimators.
    """
    # Accumulate directly instead of going through a dict keyed by str(est):
    # two estimators with the same repr would collapse into a single entry.
    fe_scores = None
    for idx, est in enumerate(VotingClassifier.estimators_):
        imp_score = np.asarray(est.feature_importances_, dtype=float)
        total = imp_score.sum()
        if total > 0:
            # Put every estimator on the same 0..1 scale before weighting.
            imp_score = imp_score / total
        weighted = imp_score * weights[idx]
        fe_scores = weighted if fe_scores is None else fe_scores + weighted
    return [] if fe_scores is None else fe_scores.tolist()

In [None]:
# Weighted feature importances of the fitted ensemble, most important first.
dfim = pd.DataFrame()
dfim['Feature'] = X_train.columns
# Pass the fitted `ensemble` instance: the VotingClassifier class itself has
# no `estimators_` attribute, so passing the class raises AttributeError.
dfim['Feature Importance'] = compute_feature_importance(ensemble, scores)
dfim.sort_values('Feature Importance', ascending=False)

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `classifier` is not defined anywhere in this notebook, so the
# cell raised NameError. The random forest (`classifier1`) is assumed here —
# confirm which model's importances were meant to be plotted.
f_i = list(zip(X1.columns, classifier1.feature_importances_))
f_i.sort(key=lambda x: x[1], reverse=True)
f_i = f_i[0:19]  # keep the 19 most important features
f_i.sort(key=lambda x: x[1])  # ascending, so the largest bar is drawn at the top
plt.barh([x[0] for x in f_i], [x[1] for x in f_i])
plt.xlabel("Feature importance")

plt.show()

In [None]:
# Feature names taken from f_i, in the same order.
list2 = [pair[0] for pair in f_i]
# list1 holds the last name seen, or stays an empty list when f_i is empty.
list1 = list2[-1] if list2 else []

In [None]:
# Importance values taken from f_i, in the same order.
list4 = [pair[1] for pair in f_i]
# list3 holds the last value seen, or stays an empty list when f_i is empty.
list3 = list4[-1] if list4 else []