In [367]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import catboost as ctb
from sklearn.metrics import log_loss
import numpy as np

In [368]:
train = pd.read_csv('train.csv', encoding='utf-8')
test = pd.read_csv('test.csv', encoding='utf-8')

In [369]:
train['Total_Amount_Sum'] = train.groupby('Opportunity_ID')['Total_Amount'].transform('sum')
test['Total_Amount_Sum'] = test.groupby('Opportunity_ID')['Total_Amount'].transform('sum')

train['ASP_sum_per_Opportunity'] = train.groupby('Opportunity_ID')['ASP'].transform('sum')
train['ASP_converted_sum_per_Opportunity'] = train.groupby('Opportunity_ID')['ASP_(converted)'].transform('sum')
train['Convertibility']=train['ASP_converted_sum_per_Opportunity'] / train['ASP_sum_per_Opportunity']
test['ASP_sum_per_Opportunity'] = test.groupby('Opportunity_ID')['ASP'].transform('sum')
test['ASP_converted_sum_per_Opportunity'] = test.groupby('Opportunity_ID')['ASP_(converted)'].transform('sum')
test['Convertibility']=test['ASP_converted_sum_per_Opportunity'] / test['ASP_sum_per_Opportunity']

train['Total_Amount_USD'] = train['Convertibility']*train['Total_Amount']
train['Total_Amount_Sum_USD'] = train['Convertibility']*train['Total_Amount_Sum']
train['Total_Taxable_Amount_USD'] = train['Convertibility']*train['Total_Taxable_Amount']
test['Total_Amount_USD'] = test['Convertibility']*test['Total_Amount']
test['Total_Amount_Sum_USD'] = test['Convertibility']*test['Total_Amount_Sum']
test['Total_Taxable_Amount_USD'] = test['Convertibility']*test['Total_Taxable_Amount']

In [370]:
train['Target'] = (train.Stage=='Closed Won').astype('int')

In [371]:
def calc_smooth_mean(df, by, on, m):
    # Compute the global mean
    mean = df[on].mean()

    # Compute the number of values and the mean of each group
    agg = df.groupby(by)[on].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + m * mean) / (counts + m)
    # Replace each value by the according smoothed mean
    return smooth

ter_smooth = calc_smooth_mean(train, 'Territory', 'Target', 1000)
bill_smooth = calc_smooth_mean(train, 'Billing_Country', 'Target', 300)
deliv_smooth = calc_smooth_mean(train, 'Delivery_Terms', 'Target', 300)
acc_smooth = calc_smooth_mean(train, 'Account_Type', 'Target', 1000)
bur_smooth = calc_smooth_mean(train, 'Bureaucratic_Code', 'Target', 1000)
opp_smooth = calc_smooth_mean(train, 'Opportunity_Type', 'Target', 1000)
source_smooth = calc_smooth_mean(train, 'Source ', 'Target', 1000)
reg_smooth = calc_smooth_mean(train, 'Region', 'Target', 2500)

In [372]:
def hasData(t):
    for x in t:
        if x != 'None':
            return 1
    return 0

In [373]:
train.columns

Index(['ID', 'Region', 'Territory', 'Pricing, Delivery_Terms_Quote_Appr',
       'Pricing, Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
       'Bureaucratic_Code_0_Approved', 'Submitted_for_Approval',
       'Bureaucratic_Code', 'Account_Created_Date', 'Source ',
       'Billing_Country', 'Account_Name', 'Opportunity_Name', 'Opportunity_ID',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Account_Type', 'Opportunity_Type', 'Quote_Type', 'Delivery_Terms',
       'Opportunity_Created_Date', 'Brand', 'Product_Type', 'Size',
       'Product_Category_B', 'Price', 'Currency', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currenc

In [374]:
def set_df(df):
    
    df.Opportunity_Created_Date = pd.to_datetime(df.Opportunity_Created_Date, errors='coerce')
    df.Planned_Delivery_Start_Date = pd.to_datetime(df.Planned_Delivery_Start_Date, errors='coerce')
    df.Planned_Delivery_End_Date = pd.to_datetime(df.Planned_Delivery_End_Date, errors='coerce')
    df.Account_Created_Date = pd.to_datetime(df.Account_Created_Date, errors='coerce')
    df.Last_Modified_Date = pd.to_datetime(df.Last_Modified_Date, errors='coerce')
    
    df['Year_Creation'] = df.Opportunity_Created_Date.dt.year
    df['Month_Creation'] = df.Opportunity_Created_Date.dt.month
    
    df['Year_Delivery'] = df.Planned_Delivery_Start_Date.dt.year
    df['Month_Delivery'] = df.Planned_Delivery_Start_Date.dt.month
    
    df['Days_Passed'] = df.Last_Modified_Date - df.Opportunity_Created_Date
    df.Days_Passed = df.Days_Passed.astype('timedelta64[D]')
    
    df['Wait_Time_Days'] = df.Planned_Delivery_Start_Date - df.Opportunity_Created_Date
    df.Wait_Time_Days = df.Wait_Time_Days.astype('timedelta64[D]')
    
    df['Delivery_Window'] = df.Planned_Delivery_End_Date - df.Planned_Delivery_Start_Date
    df.Delivery_Window = df.Delivery_Window.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Creation'] = df.Opportunity_Created_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Creation = df.Account_LifeSpan_at_Creation.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Modif'] = df.Last_Modified_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Modif = df.Account_LifeSpan_at_Modif.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Deliv'] = df.Planned_Delivery_Start_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Deliv = df.Account_LifeSpan_at_Deliv.astype('timedelta64[D]')
    
    df['Last_Activity_In_Time'] = ((df.Last_Modified_Date > df.Planned_Delivery_Start_Date)&(df.Last_Modified_Date<df.Planned_Delivery_End_Date)).astype('int')
    
    df['Quote_Type'] = (df.Quote_Type == 'Non Binding').astype('int')
    
    df['Delivered_Hot_Season'] = ((((df.Region=='EMEA')|(df.Region=='Americas')|(df.Region=='Japan')|\
                                   ((df.Region=='APAC')&(df.Territory!='Australia')))\
                                      &\
                                 ((df.Delivery_Quarter=='Q2')|(df.Delivery_Quarter=='Q3')))\
                                      |
                                 ((df.Territory=='Australia')&((df.Delivery_Quarter=='Q1')|(df.Delivery_Quarter=='Q4'))))
    
    df['Wait_Delivery_Cmp'] = (df.Delivery_Window / df.Wait_Time_Days).replace([np.inf, -np.inf], 0)
    df['Days_Left_Cmp'] =  df.Wait_Time_Days - df.Days_Passed
    df['Days_Wait_Cmp'] = (df.Delivery_Window / df.Days_Passed).replace([np.inf, -np.inf], 0)

    df['USD_Per_Day_Waited'] = (df.Total_Amount_Sum_USD/df.Wait_Time_Days).replace([np.inf, -np.inf], 0)
    df['USD_Per_Day_Passed'] = (df.Total_Amount_Sum_USD/df.Days_Passed).replace([np.inf, -np.inf], 0)
    df['USD_Per_Account_Day'] = (df.Total_Amount_Sum_USD/df.Account_LifeSpan_at_Creation).replace([np.inf, -np.inf], 0)
    
    df['Total_Products'] = df.groupby('Opportunity_ID')['Region'].transform('count')
    df['Multiple_Products'] = (df.Total_Products>1).astype('int')
    
    df['Product_Price_Mean'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('mean')
    df['Product_Price_Max'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('max')
    df['Product_Price_Std'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('std')
    df['Product_Price_Min'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('min')
    
    df['Total_TRF'] = df.groupby('Opportunity_ID')['TRF'].transform('sum')
    df['Price_Per_TRF'] = (df.Total_Amount_Sum_USD/df.Total_TRF).replace([np.inf, -np.inf], 0)
    
    #reg = pd.get_dummies(df['Region'],drop_first=False)
    #df = pd.concat([df,reg],axis=1)
    
    df.drop_duplicates(subset='Opportunity_ID', inplace=True)
    
    df['Territory'] = df['Territory'].map(ter_smooth)
    df['Billing_Country'] = df['Billing_Country'].map(bill_smooth)
    df['Delivery_Terms'] = df['Delivery_Terms'].map(deliv_smooth)
    df['Account_Type'] = df['Account_Type'].map(acc_smooth)
    df['Bureaucratic_Code'] = df['Bureaucratic_Code'].map(bur_smooth)
    df['Opportunity_Type'] = df['Opportunity_Type'].map(opp_smooth)
    df['Source '] = df['Source '].map(source_smooth)
    df['Region'] = df['Region'].map(reg_smooth)
    
    df['Quote_Type'] = (df.Quote_Type == 'Non Binding').astype('int')
    df['Product_Category_B'] = df.groupby('Opportunity_ID')['Product_Category_B'].transform(hasData)
    df['Price'] = df.groupby('Opportunity_ID')['Price'].transform(hasData)
    df['Size'] = df.groupby('Opportunity_ID')['Size'].transform(hasData)
    df['Product_Type'] = df.groupby('Opportunity_ID')['Product_Type'].transform(hasData)
    df['Brand'] = df.groupby('Opportunity_ID')['Brand'].transform(hasData)
    df['Currency'] = df.groupby('Opportunity_ID')['Currency'].transform(hasData)
    
    return df.drop(columns=['ID', 'Submitted_for_Approval',
       'Account_Created_Date', 'Account_Name', 'Opportunity_Name',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Opportunity_Created_Date', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currency', 'Total_Amount',
       'Total_Taxable_Amount_Currency', 'Total_Taxable_Amount',
       'Prod_Category_A', 'Total_Amount_Sum', 'ASP_sum_per_Opportunity',
       'ASP_converted_sum_per_Opportunity',
       'Total_Amount_USD'])

In [375]:
train_set = set_df(train)
train_set = train_set.drop(columns='Stage')
train_set

  res_values = method(rvalues)


Unnamed: 0,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Bureaucratic_Code,Source,Billing_Country,Opportunity_ID,...,USD_Per_Day_Passed,USD_Per_Account_Day,Total_Products,Multiple_Products,Product_Price_Mean,Product_Price_Max,Product_Price_Std,Product_Price_Min,Total_TRF,Price_Per_TRF
0,0.541219,0.599520,1,1,1,1,0.648820,0.572784,0.560551,0,...,31555.787302,34276.113793,1,0,5.964044e+06,5.964044e+06,,5.964044e+06,10,596404.380000
1,0.541219,0.599520,0,0,0,0,0.648820,0.572784,0.560551,1,...,1398.786667,313.521149,1,0,5.455268e+04,5.455268e+04,,5.455268e+04,0,0.000000
2,0.496400,0.597943,0,0,0,0,0.648820,0.493195,0.468980,2,...,283.329730,363.054545,1,0,8.386560e+04,8.386560e+04,,8.386560e+04,0,0.000000
3,0.496400,0.597943,1,0,1,0,0.327612,0.610706,0.468980,3,...,8835.573214,8590.140625,1,0,7.421882e+06,7.421882e+06,,7.421882e+06,14,530134.392857
4,0.496400,0.597943,1,0,1,0,0.327612,0.610706,0.468980,4,...,15901.419643,15459.713542,1,0,1.335719e+07,1.335719e+07,,1.335719e+07,25,534287.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16939,0.541219,0.559332,1,1,0,0,0.648820,0.569955,0.566369,12799,...,3818.132773,528.323023,1,0,4.543578e+05,4.543578e+05,,4.543578e+05,1,454357.800000
16940,0.496400,0.466772,1,0,0,0,0.327612,0.572784,0.566369,12800,...,68593.247588,39799.440299,2,1,1.066625e+07,1.075158e+07,120674.843277,1.058092e+07,40,533312.500000
16942,0.541219,0.599579,1,1,1,1,0.648820,0.493195,0.653383,12801,...,7369.706168,-8071.582946,3,1,1.130022e+05,1.168988e+05,6749.154643,1.052089e+05,0,0.000000
16945,0.496400,0.466772,1,1,1,1,0.648820,0.572784,0.468980,12802,...,48891.601667,51017.323478,1,0,2.346797e+06,2.346797e+06,,2.346797e+06,4,586699.220000


In [377]:
X = train_set.drop(columns=['Target', 'Opportunity_ID', 'Bureaucratic_Code_0_Approved', 
                           'Pricing, Delivery_Terms_Approved', 'Pricing, Delivery_Terms_Quote_Appr', 'Bureaucratic_Code_0_Approval', 
                           'Delivered_Hot_Season', 'Last_Activity_In_Time', 'Quote_Type', 'Total_Products', 'Multiple_Products', 'Size', 
                           'Total_TRF'])
y = train_set.Target

In [378]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

cat = ctb.CatBoostClassifier()

cat.fit(X_train, y_train)
log_loss(y_test, cat.predict_proba(X_test))

Learning rate set to 0.024864
0:	learn: 0.6781386	total: 16.1ms	remaining: 16s
1:	learn: 0.6657761	total: 27.4ms	remaining: 13.7s
2:	learn: 0.6523343	total: 38.7ms	remaining: 12.9s
3:	learn: 0.6400071	total: 47.3ms	remaining: 11.8s
4:	learn: 0.6272640	total: 53.7ms	remaining: 10.7s
5:	learn: 0.6159486	total: 64.2ms	remaining: 10.6s
6:	learn: 0.6049983	total: 70.8ms	remaining: 10s
7:	learn: 0.5943568	total: 79.5ms	remaining: 9.85s
8:	learn: 0.5848766	total: 85.8ms	remaining: 9.44s
9:	learn: 0.5767208	total: 93.3ms	remaining: 9.23s
10:	learn: 0.5680795	total: 99.7ms	remaining: 8.96s
11:	learn: 0.5603510	total: 106ms	remaining: 8.75s
12:	learn: 0.5526366	total: 114ms	remaining: 8.68s
13:	learn: 0.5458517	total: 121ms	remaining: 8.52s
14:	learn: 0.5393643	total: 129ms	remaining: 8.47s
15:	learn: 0.5326747	total: 137ms	remaining: 8.4s
16:	learn: 0.5260374	total: 144ms	remaining: 8.35s
17:	learn: 0.5201306	total: 151ms	remaining: 8.22s
18:	learn: 0.5152640	total: 159ms	remaining: 8.2s
19:	le

0.3098634814048971

In [379]:
test['Target'] = (test['Sales_Contract_No'] != 'None').astype('int')
test_set = set_df(test)

  res_values = method(rvalues)


In [380]:
test_pred = cat.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))
test_set['Pred'] = pd.DataFrame(test_pred)[1].to_list()
test_set.loc[:, ['Target', 'Pred']]

Unnamed: 0,Target,Pred
0,1,0.988936
3,1,0.814084
8,1,0.608677
9,1,0.153429
15,1,0.960075
...,...,...
2545,1,0.992762
2547,1,0.101361
2548,1,0.114697
2549,0,0.842603


In [381]:
log_loss(test_set.Target, test_set.Pred)

0.5038111905857252

In [382]:
f_imp = cat.feature_importances_
f = X_test.columns

for i in range (len(f_imp)):
    print("{: >10}\t\t{: >50}".format(f[i],f_imp[i]))

    Region		                                 2.435996816459352
 Territory		                                 3.161462742456927
Bureaucratic_Code		                                    5.679445203503
   Source 		                                1.4124440318579752
Billing_Country		                                3.3189476785351246
Account_Type		                                 1.156110520842403
Opportunity_Type		                                1.9435297675862258
Delivery_Terms		                                 1.998053584733049
     Brand		                                 3.006177499664949
Product_Type		                                3.1264796766437786
Product_Category_B		                                 2.677661910164011
     Price		                                0.9234071145310944
  Currency		                                1.0453755953152166
Convertibility		                                 4.115231777707362
Total_Amount_Sum_USD		                                1.3014529378866897
Total_T

In [383]:
params_CB = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200]
          #'ctr_border_count':[50,5,10,20,100,200]
            }

#con train

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

cat_tuning = ctb.CatBoostClassifier()

random_search_CB=RandomizedSearchCV(cat_tuning,param_distributions=params_CB,n_iter=50, scoring='neg_log_loss',n_jobs=-1,cv=5,verbose=3)

random_search_CB.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 10.7min finished


0:	learn: 0.6865290	total: 8.01ms	remaining: 8s
1:	learn: 0.6794384	total: 27.8ms	remaining: 13.9s
2:	learn: 0.6731763	total: 37.6ms	remaining: 12.5s
3:	learn: 0.6660529	total: 46.3ms	remaining: 11.5s
4:	learn: 0.6597674	total: 59.2ms	remaining: 11.8s
5:	learn: 0.6535995	total: 87.2ms	remaining: 14.5s
6:	learn: 0.6471924	total: 103ms	remaining: 14.6s
7:	learn: 0.6404858	total: 113ms	remaining: 14s
8:	learn: 0.6338783	total: 120ms	remaining: 13.2s
9:	learn: 0.6280966	total: 132ms	remaining: 13.1s
10:	learn: 0.6231010	total: 139ms	remaining: 12.5s
11:	learn: 0.6175211	total: 150ms	remaining: 12.4s
12:	learn: 0.6127525	total: 161ms	remaining: 12.2s
13:	learn: 0.6067762	total: 169ms	remaining: 11.9s
14:	learn: 0.6013748	total: 178ms	remaining: 11.7s
15:	learn: 0.5959250	total: 186ms	remaining: 11.4s
16:	learn: 0.5909866	total: 198ms	remaining: 11.5s
17:	learn: 0.5858134	total: 206ms	remaining: 11.3s
18:	learn: 0.5808716	total: 228ms	remaining: 11.8s
19:	learn: 0.5764780	total: 237ms	remain

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostClassifier object at 0x1209eba00>,
                   n_iter=50, n_jobs=-1,
                   param_distributions={'border_count': [32, 5, 10, 20, 50, 100,
                                                         200],
                                        'depth': [3, 1, 2, 6, 4, 5, 7, 8, 9,
                                                  10],
                                        'iterations': [250, 100, 500, 1000],
                                        'l2_leaf_reg': [3, 1, 5, 10, 100],
                                        'learning_rate': [0.03, 0.001, 0.01,
                                                          0.1, 0.2, 0.3]},
                   scoring='neg_log_loss', verbose=3)

In [343]:
log_loss(y_test, random_search_CB.predict_proba(X_test))

0.308037372769716

In [344]:
test_pred_CB = random_search_CB.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target', 'Pred']))
test_pred_CB

array([[0.01909773, 0.98090227],
       [0.23021899, 0.76978101],
       [0.49587199, 0.50412801],
       ...,
       [0.81043006, 0.18956994],
       [0.16396591, 0.83603409],
       [0.96448244, 0.03551756]])

In [345]:
test_set['Pred_Tuned'] = pd.DataFrame(test_pred_CB)[1].to_list()
test_set

Unnamed: 0,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Bureaucratic_Code,Source,Billing_Country,Opportunity_ID,...,Total_Products,Multiple_Products,Product_Price_Mean,Product_Price_Max,Product_Price_Std,Product_Price_Min,Total_TRF,Price_Per_TRF,Pred,Pred_Tuned
0,0.541219,0.591170,1,1,1,1,0.648820,0.493195,0.566369,10689,...,3,1,138528.944054,183509.311304,47457.687938,88931.435478,0,0.000000,0.985242,0.980902
3,0.541219,0.591170,1,1,1,1,0.648820,0.493195,0.566369,10690,...,5,1,171425.235575,275263.616386,61249.507259,127207.623904,1,857126.177873,0.788767,0.769781
8,0.496400,0.466772,1,1,0,0,0.648820,0.569955,0.468980,10691,...,1,0,21037.500000,21037.500000,,21037.500000,0,0.000000,0.346224,0.504128
9,0.496400,0.597943,1,1,1,0,0.544701,0.610706,0.468980,10692,...,6,1,361517.750000,456654.000000,85939.160738,228327.000000,6,361517.750000,0.140532,0.240545
15,0.496400,0.597943,0,0,0,0,0.648820,0.569955,0.468980,10693,...,1,0,5752.500000,5752.500000,,5752.500000,0,0.000000,0.963154,0.955252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2545,0.541219,0.561223,1,1,0,0,0.648820,0.544794,0.564714,12364,...,2,1,73875.000000,124740.000000,71933.972850,23010.000000,0,0.000000,0.989166,0.986362
2547,0.496400,0.502022,1,1,0,0,0.648820,0.572784,0.468980,12365,...,1,0,45054.900000,45054.900000,,45054.900000,0,0.000000,0.099091,0.171969
2548,0.496400,0.502022,1,1,1,1,0.648820,0.572784,0.468980,12366,...,1,0,100122.000000,100122.000000,,100122.000000,0,0.000000,0.115809,0.189570
2549,0.496400,0.466772,1,1,0,0,0.648820,0.572784,0.468980,12367,...,1,0,143220.000000,143220.000000,,143220.000000,0,0.000000,0.864607,0.836034


In [346]:
log_loss(test_set.Target, test_set.Pred_Tuned)

0.49843757903183733

In [164]:
final = test_set.loc[:, ['Opportunity_ID', 'Pred']]
final.columns = ['Opportunity_ID', 'Target']
final.loc[:, ['Opportunity_ID', 'Target']].to_csv('sub_cat.csv', index=False)

In [363]:
params_CB_full = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200]
          #'ctr_border_count':[50,5,10,20,100,200]
            }

#con train

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

cat_tuning_full = ctb.CatBoostClassifier()

random_search_CB_full=RandomizedSearchCV(cat_tuning_full,param_distributions=params_CB_full,n_iter=100, scoring='neg_log_loss',n_jobs=-1,cv=5,verbose=3)

random_search_CB_full.fit(X,y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 29.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 37.6min finished


0:	learn: 0.6870677	total: 6.46ms	remaining: 3.23s
1:	learn: 0.6804972	total: 12ms	remaining: 2.99s
2:	learn: 0.6748038	total: 17.2ms	remaining: 2.86s
3:	learn: 0.6687066	total: 22.4ms	remaining: 2.78s
4:	learn: 0.6625837	total: 27.6ms	remaining: 2.74s
5:	learn: 0.6571901	total: 32.7ms	remaining: 2.69s
6:	learn: 0.6514954	total: 37.9ms	remaining: 2.67s
7:	learn: 0.6461847	total: 43.2ms	remaining: 2.65s
8:	learn: 0.6411612	total: 48.5ms	remaining: 2.65s
9:	learn: 0.6360949	total: 53.7ms	remaining: 2.63s
10:	learn: 0.6309289	total: 58.8ms	remaining: 2.61s
11:	learn: 0.6263110	total: 64ms	remaining: 2.6s
12:	learn: 0.6215625	total: 69.8ms	remaining: 2.61s
13:	learn: 0.6171101	total: 75.9ms	remaining: 2.63s
14:	learn: 0.6130040	total: 83ms	remaining: 2.68s
15:	learn: 0.6084768	total: 90.3ms	remaining: 2.73s
16:	learn: 0.6037973	total: 103ms	remaining: 2.94s
17:	learn: 0.5990044	total: 112ms	remaining: 2.99s
18:	learn: 0.5947956	total: 117ms	remaining: 2.96s
19:	learn: 0.5907090	total: 122m

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostClassifier object at 0x11ecef670>,
                   n_iter=100, n_jobs=-1,
                   param_distributions={'border_count': [32, 5, 10, 20, 50, 100,
                                                         200],
                                        'depth': [3, 1, 2, 6, 4, 5, 7, 8, 9,
                                                  10],
                                        'iterations': [250, 100, 500, 1000],
                                        'l2_leaf_reg': [3, 1, 5, 10, 100],
                                        'learning_rate': [0.03, 0.001, 0.01,
                                                          0.1, 0.2, 0.3]},
                   scoring='neg_log_loss', verbose=3)

In [364]:
test_pred_CB_full = random_search_CB_full.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target', 'Pred', 'Pred_Tuned', 'Pred_Tuned_Full']))
test_pred_CB_full

array([[0.07362727, 0.92637273],
       [0.31156495, 0.68843505],
       [0.49853485, 0.50146515],
       ...,
       [0.83521278, 0.16478722],
       [0.34151319, 0.65848681],
       [0.93933175, 0.06066825]])

In [365]:
test_set['Pred_Tuned_Full'] = pd.DataFrame(test_pred_CB_full)[1].to_list()
test_set

Unnamed: 0,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Bureaucratic_Code,Source,Billing_Country,Opportunity_ID,...,Multiple_Products,Product_Price_Mean,Product_Price_Max,Product_Price_Std,Product_Price_Min,Total_TRF,Price_Per_TRF,Pred,Pred_Tuned,Pred_Tuned_Full
0,0.541219,0.591170,1,1,1,1,0.648820,0.493195,0.566369,10689,...,1,138528.944054,183509.311304,47457.687938,88931.435478,0,0.000000,0.985242,0.980902,0.926373
3,0.541219,0.591170,1,1,1,1,0.648820,0.493195,0.566369,10690,...,1,171425.235575,275263.616386,61249.507259,127207.623904,1,857126.177873,0.788767,0.769781,0.688435
8,0.496400,0.466772,1,1,0,0,0.648820,0.569955,0.468980,10691,...,0,21037.500000,21037.500000,,21037.500000,0,0.000000,0.346224,0.504128,0.501465
9,0.496400,0.597943,1,1,1,0,0.544701,0.610706,0.468980,10692,...,1,361517.750000,456654.000000,85939.160738,228327.000000,6,361517.750000,0.140532,0.240545,0.489299
15,0.496400,0.597943,0,0,0,0,0.648820,0.569955,0.468980,10693,...,0,5752.500000,5752.500000,,5752.500000,0,0.000000,0.963154,0.955252,0.921572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2545,0.541219,0.561223,1,1,0,0,0.648820,0.544794,0.564714,12364,...,1,73875.000000,124740.000000,71933.972850,23010.000000,0,0.000000,0.989166,0.986362,0.956505
2547,0.496400,0.502022,1,1,0,0,0.648820,0.572784,0.468980,12365,...,0,45054.900000,45054.900000,,45054.900000,0,0.000000,0.099091,0.171969,0.183580
2548,0.496400,0.502022,1,1,1,1,0.648820,0.572784,0.468980,12366,...,0,100122.000000,100122.000000,,100122.000000,0,0.000000,0.115809,0.189570,0.164787
2549,0.496400,0.466772,1,1,0,0,0.648820,0.572784,0.468980,12367,...,0,143220.000000,143220.000000,,143220.000000,0,0.000000,0.864607,0.836034,0.658487


In [366]:
log_loss(test_set.Target, test_set.Pred_Tuned_Full)

0.4580930073475851

In [362]:
subm = test_set.loc[:, ['Opportunity_ID', 'Pred_Tuned_Full']]
subm.columns = ['Opportunity_ID', 'Target']
subm.loc[:, ['Opportunity_ID', 'Target']].to_csv('sub_cat.csv', index=False)

In [359]:
random_search_CB_full.best_params_

{'learning_rate': 0.01,
 'l2_leaf_reg': 10,
 'iterations': 500,
 'depth': 6,
 'border_count': 10}

In [221]:
#features = ['Territory', 'Pricing, Delivery_Terms_Quote_Appr',
#       'Pricing, Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
#       'Bureaucratic_Code_0_Approved', 'Bureaucratic_Code', 'Billing_Country',
#       'Opportunity_ID', 'Account_Type', 'Opportunity_Type', 'Delivery_Terms',
#       'Convertibility', 'Total_Amount_Sum_USD', 'Total_Taxable_Amount_USD',
#       'Target', 'Days_Passed', 'Wait_Time_Days', 'Delivery_Window',
#       'Account_LifeSpan_at_Creation', 'Account_LifeSpan_at_Modif',
#       'Account_LifeSpan_at_Deliv', 'Last_Activity_In_Time',
#       'Delivered_Hot_Season', 'Wait_Delivery_Cmp', 'Days_Left_Cmp',
#       'Days_Wait_Cmp', 'USD_Per_Day_Waited', 'USD_Per_Day_Passed',
#       'USD_Per_Account_Day', 'Total_Products', 'Multiple_Products',
#       'Product_Price_Mean', 'Product_Price_Max', 'Product_Price_Std',
#       'Product_Price_Min', 'Total_TRF', 'Price_Per_TRF', 'APAC', 'Americas',
#       'EMEA', 'Japan', 'Middle East', 'Pred', 'Pred_Tuned',
#       'Pred_Tuned_Full']

best = {'learning_rate': 0.01,
 'l2_leaf_reg': 5,
 'iterations': 500,
 'depth': 9,
 'border_count': 20}

Index(['Territory', 'Pricing, Delivery_Terms_Quote_Appr',
       'Pricing, Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
       'Bureaucratic_Code_0_Approved', 'Bureaucratic_Code', 'Billing_Country',
       'Opportunity_ID', 'Account_Type', 'Opportunity_Type', 'Delivery_Terms',
       'Convertibility', 'Total_Amount_Sum_USD', 'Total_Taxable_Amount_USD',
       'Target', 'Days_Passed', 'Wait_Time_Days', 'Delivery_Window',
       'Account_LifeSpan_at_Creation', 'Account_LifeSpan_at_Modif',
       'Account_LifeSpan_at_Deliv', 'Last_Activity_In_Time',
       'Delivered_Hot_Season', 'Wait_Delivery_Cmp', 'Days_Left_Cmp',
       'Days_Wait_Cmp', 'USD_Per_Day_Waited', 'USD_Per_Day_Passed',
       'USD_Per_Account_Day', 'Total_Products', 'Multiple_Products',
       'Product_Price_Mean', 'Product_Price_Max', 'Product_Price_Std',
       'Product_Price_Min', 'Total_TRF', 'Price_Per_TRF', 'APAC', 'Americas',
       'EMEA', 'Japan', 'Middle East', 'Pred', 'Pred_Tuned',
       'Pred_Tuned