In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# this makes that each model using the same training set
training = np.random.choice([1,0],7345,p=[0.8,0.2])

In [3]:
model_params_df = pd.DataFrame(columns=['model','alpha','params_keys','params_values','train_acc_score',
                                        'train_roc_auc_score','train_precision_score','train_recall_score',
                                        'test_acc_score','test_roc_auc_score',
                                        'test_precision_score','test_recall_score'])

In [4]:
# the function will take an input of alpha and return the rolled-up data 
def decaying_agg(alpha,training):
    print('Rollup Master Data')
    master = pd.read_csv('master_data.csv',index_col=0)
    firmographic = pd.read_csv('UT_MSBA_FirstAdopterAnalysis_FirmographicData.csv')
    master = master[master['CompanyId'] != -999999]
    master.drop(columns=['USAGE_APP10_SYSCOUNT','USAGE_FEAT10_SYSCOUNT','ProductFamily_B_count','ProductFamily_D_Provider3_count'],inplace=True)
    master.reset_index(drop=True,inplace=True)
    master['YearMonth'] = master['YearMonth'].astype('str')
    master['recent'] = max(master['YearMonth'])
    master[master.columns[2:-1]] = StandardScaler().fit_transform(master[master.columns[2:-1]])
    master['YearMonth'] = pd.to_datetime(master['YearMonth'],format='%Y%m')
    master['recent'] = pd.to_datetime(master['recent'],format='%Y%m')
    master['exponential'] = ((master['YearMonth'] - master['recent'])//np.timedelta64(1, 'M'))
    company_ids = master['CompanyId'].value_counts().index
    denominator = 0
    for i in range(-44,1):
        denominator += alpha**i
    def decaying(alpha, company_id, df):
        row = [company_id]
        df_comp = df[df['CompanyId']==company_id]
        for col in df.columns[2:-2]:
            fin_val = 0
            for val, power in zip(df_comp[col].values, df_comp['exponential'].values):
                fin_val += val*(alpha**power)
            row.append(fin_val/denominator)
        return row
    weighted_master = pd.DataFrame(columns=master.columns)
    weighted_master.drop(columns=['YearMonth','recent', 'exponential'], inplace=True)
    for company_id in company_ids:
        weighted_master.loc[len(weighted_master.index)] = decaying(alpha, company_id, master)
    weighted_master['CompanyId'] = weighted_master['CompanyId'].astype('int64')
    weighted_master['training'] = training
    
    weighted_master = weighted_master.merge(firmographic,on='CompanyId')
    weighted_master = weighted_master.dropna(axis='rows')
    
    return weighted_master

In [37]:
# the function will take the inputs of alpha, rolles-up data, and the scoring of gridsearch 
#    and return nothing but will update model_params_df
def catboost_model_tuning(alpha,i,weighted_master,scoring):
    final_target = pd.read_csv('final_target_file.csv',index_col=0)
    final_target['early_adopter'] = final_target['early_adopter'].astype(int)
    print(final_target['early_adopter'].value_counts())
    final_master = pd.merge(final_target,weighted_master,left_on='CompanyID',right_on='CompanyId')    
    
    train = final_master[final_master['training']==1]
    test = final_master[final_master['training']==0]
    train.drop(columns='training',inplace=True)
    test.drop(columns='training',inplace=True)
    X_train = train.loc[:,train.columns[4:]]
    y_train = train['early_adopter']
    X_test = test.loc[:,test.columns[4:]]
    y_test = test['early_adopter']

    pos_weight = (len(y_train)-sum(y_train))/sum(y_train)
    
    catb_model = CatBoostClassifier(scale_pos_weight=pos_weight, cat_features=['country_name','industry_name'], logging_level='Silent')
    
    grid = {"eval_metric": ['F1','Recall'],
            "learning_rate": [0.01,0.05,0.1],
            "max_depth": [3, 5, 7],
            "n_estimators": [100,200,300]
            }
   
    clf = GridSearchCV(catb_model, grid, n_jobs=1, 
                   cv=StratifiedKFold(n_splits=5, shuffle=True), 
                   scoring=scoring,
                   verbose=2, refit=True)

    clf.fit(X_train, y_train)
    
    print('Best Params List')
    print(clf.best_params_)
    print('\n')
    
    print('Best ROC AUC Score')
    print(clf.best_score_)
    print('\n')
    
    pred_train = clf.predict(X_train)
    print('Train Accuracy Score: ', accuracy_score(y_train,pred_train))
    print('Train ROC_AUC Score: ', roc_auc_score(y_train,clf.predict_proba(X_train)[:,-1]))
    print('\n')
    
    
    pred_test = clf.predict(X_test)
    print('Test Accuracy Score: ', accuracy_score(y_test,pred_test))
    print('Test ROC_AUC Score: ', roc_auc_score(y_test,clf.predict_proba(X_test)[:,-1]))
    
    model_params_df.loc[i,'model'] = 'cat'
    model_params_df.loc[i,'alpha'] = alpha
    model_params_df.loc[i,'params_keys'] = list(clf.best_params_.keys())
    model_params_df.loc[i,'params_values'] = list(clf.best_params_.values())
    model_params_df.loc[i,'train_acc_score'] = accuracy_score(y_train,pred_train)
    model_params_df.loc[i,'train_roc_auc_score'] = roc_auc_score(y_train,clf.predict_proba(X_train)[:,-1])
    model_params_df.loc[i,'train_precision_score'] = precision_score(y_train,pred_train,pos_label=1)
    model_params_df.loc[i,'train_recall_score'] = recall_score(y_train,pred_train,pos_label=1)

    model_params_df.loc[i,'test_acc_score'] = accuracy_score(y_test,pred_test)
    model_params_df.loc[i,'test_roc_auc_score'] = roc_auc_score(y_test,clf.predict_proba(X_test)[:,-1])
    model_params_df.loc[i,'test_precision_score'] = precision_score(y_test,pred_test,pos_label=1)
    model_params_df.loc[i,'test_recall_score'] = recall_score(y_test,pred_test,pos_label=1)

In [15]:
# grid searching
i=0
alphas = [0.75,0.85,0.95,1.05,1.1]
for alpha in alphas:
    weighted_master = decaying_agg(alpha,training)
    catboost_model_tuning(alpha, i, weighted_master=weighted_master, scoring='f1')
    i += 1
    weighted_master = decaying_agg(alpha,training)
    catboost_model_tuning(alpha, i, weighted_master=weighted_master, scoring='recall')
    i += 1
model_params_df.to_csv('catboost_params.csv')

In [66]:
pd.read_csv('catboost_params.csv',index_col=0)

Unnamed: 0.1,Unnamed: 0,model,alpha,params_keys,params_values,train_acc_score,train_roc_auc_score,train_precision_score,train_recall_score,test_acc_score,test_roc_auc_score,test_precision_score,test_recall_score
0,0,cat,0.75,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.05, 3, 200]",0.695675,0.759523,0.482432,0.65625,0.636069,0.666195,0.401617,0.564394
1,1,cat,0.85,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.05, 3, 200]",0.686866,0.746319,0.471342,0.642463,0.62851,0.665608,0.39418,0.564394
0,0,cat,0.95,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.05, 3, 300]",0.739374,0.803485,0.541726,0.700823,0.645542,0.634727,0.39528,0.517375
1,1,cat,1.05,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.1, 3, 100]",0.674953,0.747697,0.461199,0.668801,0.614393,0.625804,0.367725,0.53668
2,2,cat,1.15,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.1, 3, 200]",0.750334,0.842422,0.553392,0.753888,0.628357,0.636991,0.371681,0.486486
3,3,cat,0.75,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.1, 3, 100]",0.702486,0.760639,0.492908,0.635865,0.643394,0.6466,0.391045,0.505792
4,4,cat,0.85,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.1, 3, 200]",0.761294,0.837581,0.571225,0.73376,0.643394,0.644644,0.392962,0.517375
5,5,cat,0.95,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.05, 3, 200]",0.703021,0.753059,0.49345,0.620311,0.644468,0.63664,0.390909,0.498069
6,6,cat,1.05,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.01, 3, 100]",0.585405,0.640928,0.376082,0.635865,0.55102,0.592133,0.331924,0.606178
7,7,cat,1.15,"['eval_metric', 'learning_rate', 'max_depth', ...","['F1', 0.01, 3, 100]",0.567763,0.636417,0.363257,0.63678,0.534909,0.586715,0.31875,0.590734


In [67]:
# choose the best model
weighted_master = decaying_agg(0.75,training)
final_target = pd.read_csv('final_target_file.csv',index_col=0)
final_target['early_adopter'] = final_target['early_adopter'].astype(int)

final_master = pd.merge(final_target,weighted_master,left_on='CompanyID',right_on='CompanyId')    

final_master.drop(columns='training',inplace=True)
y = final_master['early_adopter'].values
X = final_master[final_master.columns[4:]]
   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pos_weight = (len(y_train)-sum(y_train))/sum(y_train)

cat = CatBoostClassifier(scale_pos_weight=pos_weight,
                                cat_features=['country_name','industry_name'],
                                logging_level='Silent',
                                eval_metric='F1',
                                learning_rate=0.05,
                                max_depth=3,
                                n_estimators=200
                                )

cat.fit(X_train, y_train)

Rollup Master Data


<catboost.core.CatBoostClassifier at 0x2a36853d0>

In [68]:
# get feature importances from the best model
feature_importance=cat.get_feature_importance()
feature_names=cat.feature_names_
data={'feature_names':feature_names,'feature_importance':feature_importance}
fi_df = pd.DataFrame(data)
fi_df.sort_values('feature_importance',ascending=False)

Unnamed: 0,feature_names,feature_importance
39,ProductFamily_A_Entry_count,20.708750
35,ProductFamily_A_count,3.900266
50,revenue__total,3.652435
49,employees_total,3.524708
13,USAGE_FEAT4_SYSCOUNT,3.420518
...,...,...
43,ProductFamily_C_Mid_count,0.264431
32,USAGE_VIRT2_SYSCOUNT,0.205342
2,USAGE_APP3_SYSCOUNT,0.174846
65,total_budget_A_C_A_C,0.140050


In [69]:
# print confusion matrix
print(confusion_matrix(y_test,cat.predict(X_test)))

[[448 215]
 [123 149]]


In [72]:
cat.save_model('early_adopter_catboost_firmographic',format='cbm')

### dropping column **ProductFamily_A_Entry_count**

In [58]:
# dropping the top 1 feature and re-build the model
weighted_master = decaying_agg(0.75,training)
final_target = pd.read_csv('final_target_file.csv',index_col=0)
final_target['early_adopter'] = final_target['early_adopter'].astype(int)

final_master = pd.merge(final_target,weighted_master,left_on='CompanyID',right_on='CompanyId')    

final_master.drop(columns=['training','ProductFamily_A_Entry_count'],inplace=True)
y = final_master['early_adopter'].values
X = final_master[final_master.columns[4:]]
   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pos_weight = (len(y_train)-sum(y_train))/sum(y_train)

cat = CatBoostClassifier(scale_pos_weight=pos_weight,
                                cat_features=['country_name','industry_name'],
                                logging_level='Silent',
                                eval_metric='F1',
                                learning_rate=0.05,
                                max_depth=3,
                                n_estimators=200
                                )

cat.fit(X_train, y_train)

Rollup Master Data


In [54]:
# get the feature importances of the model without top 1 feature
feature_importance=cat.get_feature_importance()
feature_names=cat.feature_names_
data={'feature_names':feature_names,'feature_importance':feature_importance}
fi_df = pd.DataFrame(data)
fi_df.sort_values('feature_importance',ascending=False)

Unnamed: 0,feature_names,feature_importance
35,ProductFamily_A_count,11.388012
39,ProductFamily_A_Mid_count,5.880418
48,employees_total,4.157064
8,USAGE_APP9_SYSCOUNT,4.079323
67,country_name,4.011587
...,...,...
51,total_budget_A_A,0.294837
59,total_budget_A_A_A_D,0.172134
55,total_budget_A_A_A,0.114472
56,total_budget_A_A_A_A,0.076895


In [55]:
# get the confusion matrix
print(confusion_matrix(y_test,cat.predict(X_test)))

[[406 257]
 [112 160]]


### dropping columns **ProductFamily_A_Entry_count** and **ProductFamily_A_count**

In [61]:
weighted_master = decaying_agg(0.75,training)
final_target = pd.read_csv('final_target_file.csv',index_col=0)
final_target['early_adopter'] = final_target['early_adopter'].astype(int)

final_master = pd.merge(final_target,weighted_master,left_on='CompanyID',right_on='CompanyId')    

final_master.drop(columns=['training','ProductFamily_A_Entry_count','ProductFamily_A_count'],inplace=True)
y = final_master['early_adopter'].values
X = final_master[final_master.columns[4:]]
   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pos_weight = (len(y_train)-sum(y_train))/sum(y_train)

cat = CatBoostClassifier(scale_pos_weight=pos_weight,
                                cat_features=['country_name','industry_name'],
                                logging_level='Silent',
                                eval_metric='F1',
                                learning_rate=0.05,
                                max_depth=3,
                                n_estimators=200
                                )

cat.fit(X_train, y_train)

Rollup Master Data


<catboost.core.CatBoostClassifier at 0x14522f910>

In [62]:
feature_importance=cat.get_feature_importance()
feature_names=cat.feature_names_
data={'feature_names':feature_names,'feature_importance':feature_importance}
fi_df = pd.DataFrame(data)
fi_df.sort_values('feature_importance',ascending=False)

Unnamed: 0,feature_names,feature_importance
20,USAGE_FEAT12_SYSCOUNT,4.919297
48,revenue__total,4.656049
38,ProductFamily_A_Mid_count,4.297696
66,country_name,4.249360
47,employees_total,4.068230
...,...,...
30,USAGE_PROTOCOL9_SYSCOUNT,0.310644
50,total_budget_A_A,0.279988
65,total_budget_A_C_A_E,0.246756
53,total_budget_A_D,0.175087


In [63]:
print(confusion_matrix(y_test,cat.predict(X_test)))

[[384 279]
 [109 163]]
