In [75]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import catboost as ctb
from sklearn.metrics import log_loss
import numpy as np
import category_encoders as ce

In [76]:
train = pd.read_csv('train.csv', encoding='utf-8')
test = pd.read_csv('test.csv', encoding='utf-8')

In [77]:
train['Total_Amount_Sum'] = train.groupby('Opportunity_ID')['Total_Amount'].transform('sum')
test['Total_Amount_Sum'] = test.groupby('Opportunity_ID')['Total_Amount'].transform('sum')

train['ASP_sum_per_Opportunity'] = train.groupby('Opportunity_ID')['ASP'].transform('sum')
train['ASP_converted_sum_per_Opportunity'] = train.groupby('Opportunity_ID')['ASP_(converted)'].transform('sum')
train['Convertibility']=train['ASP_converted_sum_per_Opportunity'] / train['ASP_sum_per_Opportunity']
test['ASP_sum_per_Opportunity'] = test.groupby('Opportunity_ID')['ASP'].transform('sum')
test['ASP_converted_sum_per_Opportunity'] = test.groupby('Opportunity_ID')['ASP_(converted)'].transform('sum')
test['Convertibility']=test['ASP_converted_sum_per_Opportunity'] / test['ASP_sum_per_Opportunity']

train['Total_Amount_USD'] = train['Convertibility']*train['Total_Amount']
train['Total_Amount_Sum_USD'] = train['Convertibility']*train['Total_Amount_Sum']
train['Total_Taxable_Amount_USD'] = train['Convertibility']*train['Total_Taxable_Amount']
test['Total_Amount_USD'] = test['Convertibility']*test['Total_Amount']
test['Total_Amount_Sum_USD'] = test['Convertibility']*test['Total_Amount_Sum']
test['Total_Taxable_Amount_USD'] = test['Convertibility']*test['Total_Taxable_Amount']

In [78]:
train['Target'] = (train.Stage=='Closed Won').astype('int')

In [79]:
features = ['Territory', 'Billing_Country', 'Delivery_Terms', 'Account_Type', 'Bureaucratic_Code', 
            'Opportunity_Type', 'Source ', 'Region']

codif_df = train.drop_duplicates(subset='Opportunity_ID')

encoder = ce.TargetEncoder(cols=features, smoothing = 0.85)
encoder.fit(codif_df[features],codif_df['Target'])

TargetEncoder(cols=['Territory', 'Billing_Country', 'Delivery_Terms',
                    'Account_Type', 'Bureaucratic_Code', 'Opportunity_Type',
                    'Source ', 'Region'],
              smoothing=0.85)

In [80]:
def hasData(t):
    for x in t:
        if x != 'None':
            return 1
    return 0

In [81]:
train.columns

Index(['ID', 'Region', 'Territory', 'Pricing, Delivery_Terms_Quote_Appr',
       'Pricing, Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
       'Bureaucratic_Code_0_Approved', 'Submitted_for_Approval',
       'Bureaucratic_Code', 'Account_Created_Date', 'Source ',
       'Billing_Country', 'Account_Name', 'Opportunity_Name', 'Opportunity_ID',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Account_Type', 'Opportunity_Type', 'Quote_Type', 'Delivery_Terms',
       'Opportunity_Created_Date', 'Brand', 'Product_Type', 'Size',
       'Product_Category_B', 'Price', 'Currency', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currenc

In [82]:
def set_df(df):
    
    df.Opportunity_Created_Date = pd.to_datetime(df.Opportunity_Created_Date, errors='coerce')
    df.Planned_Delivery_Start_Date = pd.to_datetime(df.Planned_Delivery_Start_Date, errors='coerce')
    df.Planned_Delivery_End_Date = pd.to_datetime(df.Planned_Delivery_End_Date, errors='coerce')
    df.Account_Created_Date = pd.to_datetime(df.Account_Created_Date, errors='coerce')
    df.Last_Modified_Date = pd.to_datetime(df.Last_Modified_Date, errors='coerce')
    
    df['Year_Creation'] = df.Opportunity_Created_Date.dt.year
    df['Month_Creation'] = df.Opportunity_Created_Date.dt.month
    
    df['Year_Delivery'] = df.Planned_Delivery_Start_Date.dt.year
    df['Month_Delivery'] = df.Planned_Delivery_Start_Date.dt.month
    
    df['Days_Passed'] = df.Last_Modified_Date - df.Opportunity_Created_Date
    df.Days_Passed = df.Days_Passed.astype('timedelta64[D]')
    
    df['Wait_Time_Days'] = df.Planned_Delivery_Start_Date - df.Opportunity_Created_Date
    df.Wait_Time_Days = df.Wait_Time_Days.astype('timedelta64[D]')
    
    df['Delivery_Window'] = df.Planned_Delivery_End_Date - df.Planned_Delivery_Start_Date
    df.Delivery_Window = df.Delivery_Window.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Creation'] = df.Opportunity_Created_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Creation = df.Account_LifeSpan_at_Creation.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Modif'] = df.Last_Modified_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Modif = df.Account_LifeSpan_at_Modif.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Deliv'] = df.Planned_Delivery_Start_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Deliv = df.Account_LifeSpan_at_Deliv.astype('timedelta64[D]')
    
    df['Quote_Type'] = (df.Quote_Type == 'Non Binding').astype('int')
    
    df['Wait_Delivery_Cmp'] = (df.Delivery_Window / df.Wait_Time_Days).replace([np.inf, -np.inf], 0)
    df['Days_Left_Cmp'] =  df.Wait_Time_Days - df.Days_Passed
    df['Days_Wait_Cmp'] = (df.Delivery_Window / df.Days_Passed).replace([np.inf, -np.inf], 0)

    df['USD_Per_Day_Waited'] = (df.Total_Amount_Sum_USD/df.Wait_Time_Days).replace([np.inf, -np.inf], 0)
    df['USD_Per_Day_Passed'] = (df.Total_Amount_Sum_USD/df.Days_Passed).replace([np.inf, -np.inf], 0)
    df['USD_Per_Account_Day'] = (df.Total_Amount_Sum_USD/df.Account_LifeSpan_at_Creation).replace([np.inf, -np.inf], 0)
    
    df['Total_Products'] = df.groupby('Opportunity_ID')['Region'].transform('count')
    df['Multiple_Products'] = (df.Total_Products>1).astype('int')
    
    df['Product_Price_Mean'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('mean')
    df['Product_Price_Max'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('max')
    df['Product_Price_Std'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('std')
    df['Product_Price_Min'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('min')
    
    df['Total_TRF'] = df.groupby('Opportunity_ID')['TRF'].transform('sum')
    df['Price_Per_TRF'] = (df.Total_Amount_Sum_USD/df.Total_TRF).replace([np.inf, -np.inf], 0)
    
    df.drop_duplicates(subset='Opportunity_ID', inplace=True)
    
    df = df.join(encoder.transform(df[features]).add_suffix('_target'))
    
    df['Product_Category_B'] = df.groupby('Opportunity_ID')['Product_Category_B'].transform(hasData)
    df['Price'] = df.groupby('Opportunity_ID')['Price'].transform(hasData)
    df['Size'] = df.groupby('Opportunity_ID')['Size'].transform(hasData)
    df['Product_Type'] = df.groupby('Opportunity_ID')['Product_Type'].transform(hasData)
    df['Brand'] = df.groupby('Opportunity_ID')['Brand'].transform(hasData)
    df['Currency'] = df.groupby('Opportunity_ID')['Currency'].transform(hasData)
    
    df['Or'] = ((df.Product_Category_B)|(df.Price)|(df.Size)|(df.Product_Type)|(df.Brand)|(df.Currency)).astype('int')
    
    return df.drop(columns=['ID', 'Submitted_for_Approval',
       'Account_Created_Date', 'Account_Name', 'Opportunity_Name',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Opportunity_Created_Date', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currency', 'Total_Amount',
       'Total_Taxable_Amount_Currency', 'Total_Taxable_Amount',
       'Prod_Category_A', 'Total_Amount_Sum', 'ASP_sum_per_Opportunity',
       'ASP_converted_sum_per_Opportunity',
       'Total_Amount_USD', 'Territory', 'Billing_Country', 'Delivery_Terms', 
       'Account_Type', 'Bureaucratic_Code', 'Opportunity_Type', 'Source ', 'Region', 
       'Product_Category_B', 'Price', 'Size', 'Product_Type', 'Brand', 'Currency'])

In [83]:
train_set = set_df(train)
train_set = train_set.drop(columns='Stage')
train_set

Unnamed: 0,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Opportunity_ID,Quote_Type,Convertibility,Total_Amount_Sum_USD,Total_Taxable_Amount_USD,Target,...,Price_Per_TRF,Territory_target,Billing_Country_target,Delivery_Terms_target,Account_Type_target,Bureaucratic_Code_target,Opportunity_Type_target,Source _target,Region_target,Or
0,1,1,1,1,0,1,1.131096,5.964044e+06,5.964044e+06,0,...,596404.380000,0.531334,0.477157,0.479354,0.484398,0.593111,0.517054,0.509776,0.462820,0
1,0,0,0,0,1,1,1.131094,5.455268e+04,5.455268e+04,1,...,0.000000,0.531334,0.477157,0.479354,0.484398,0.593111,0.517054,0.509776,0.462820,0
2,0,0,0,0,2,1,1.000000,8.386560e+04,8.386560e+04,1,...,0.000000,0.629841,0.450638,0.543384,0.296500,0.593111,0.517054,0.438859,0.444896,0
3,1,0,1,0,3,1,1.000000,7.421882e+06,7.421882e+06,0,...,530134.392857,0.629841,0.450638,0.445609,0.296500,0.266444,0.156954,0.632822,0.444896,1
4,1,0,1,0,4,1,1.000000,1.335719e+07,1.335719e+07,0,...,534287.700000,0.629841,0.450638,0.445609,0.296500,0.266444,0.156954,0.632822,0.444896,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16939,1,1,0,0,12799,1,1.131087,4.543578e+05,4.543578e+05,1,...,454357.800000,0.368421,0.512232,0.479354,0.681871,0.593111,0.517054,0.556701,0.462820,0
16940,1,0,0,0,12800,1,1.000000,2.133250e+07,2.133250e+07,0,...,533312.500000,0.347490,0.512232,0.543384,0.296500,0.266444,0.156954,0.509776,0.444896,0
16942,1,1,1,1,12801,1,1.131096,3.390065e+05,3.390065e+05,1,...,0.000000,0.805556,0.788732,0.479354,0.296500,0.593111,0.517054,0.438859,0.462820,0
16945,1,1,1,1,12802,1,1.000000,2.346797e+06,0.000000e+00,0,...,586699.220000,0.347490,0.450638,0.543384,0.296500,0.593111,0.517054,0.509776,0.444896,0


In [84]:
X = train_set.drop(columns=['Target', 'Opportunity_ID'])
y = train_set.Target

In [85]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

cat = ctb.CatBoostClassifier()

cat.fit(X_train, y_train)
log_loss(y_test, cat.predict_proba(X_test))

Learning rate set to 0.024864
0:	learn: 0.6779140	total: 17.5ms	remaining: 17.5s
1:	learn: 0.6640938	total: 29.2ms	remaining: 14.6s
2:	learn: 0.6501928	total: 41.3ms	remaining: 13.7s
3:	learn: 0.6387142	total: 47.5ms	remaining: 11.8s
4:	learn: 0.6274552	total: 54.5ms	remaining: 10.8s
5:	learn: 0.6165973	total: 60.8ms	remaining: 10.1s
6:	learn: 0.6068336	total: 66.9ms	remaining: 9.49s
7:	learn: 0.5966659	total: 73.1ms	remaining: 9.06s
8:	learn: 0.5882164	total: 79.2ms	remaining: 8.72s
9:	learn: 0.5791328	total: 85.3ms	remaining: 8.45s
10:	learn: 0.5696668	total: 91.5ms	remaining: 8.23s
11:	learn: 0.5619318	total: 97.7ms	remaining: 8.04s
12:	learn: 0.5544937	total: 104ms	remaining: 7.91s
13:	learn: 0.5465497	total: 110ms	remaining: 7.78s
14:	learn: 0.5391815	total: 117ms	remaining: 7.65s
15:	learn: 0.5320219	total: 123ms	remaining: 7.54s
16:	learn: 0.5259328	total: 129ms	remaining: 7.46s
17:	learn: 0.5200103	total: 135ms	remaining: 7.38s
18:	learn: 0.5142683	total: 141ms	remaining: 7.3s


0.30545073024299546

In [86]:
test['Target'] = (test['Sales_Contract_No'] != 'None').astype('int')
test_set = set_df(test)

In [87]:
test_pred = cat.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))
#test_set['Pred'] = pd.DataFrame(test_pred)[1].to_list()
#test_set.loc[:, ['Target', 'Pred']]

In [88]:
log_loss(test_set.Target, test_pred)

0.5013243782033243

In [89]:
f_imp = cat.feature_importances_
f = X_test.columns
fts = ['Opportunity_ID', 'Target']

for i in range (len(f_imp)):
    print("{: >10}\t\t{: >50}".format(f[i],f_imp[i]))
    if f_imp[i]>=1:
        fts.append(f[i])

Pricing, Delivery_Terms_Quote_Appr		                                0.1824342082635333
Pricing, Delivery_Terms_Approved		                                0.5930953118715225
Bureaucratic_Code_0_Approval		                                0.5564323105788068
Bureaucratic_Code_0_Approved		                               0.17322951472722847
Quote_Type		                               0.03483130482803988
Convertibility		                                3.9424416200604133
Total_Amount_Sum_USD		                                1.3075333168443248
Total_Taxable_Amount_USD		                                 2.534598689879157
Year_Creation		                                 1.662587234934151
Month_Creation		                                1.7495076628672201
Year_Delivery		                                1.9707279992807576
Month_Delivery		                                 1.546024321287317
Days_Passed		                                  4.87789584289319
Wait_Time_Days		                                  8.1794

In [90]:
train_filtered = train_set.loc[:, fts]

X = train_filtered.drop(columns=['Opportunity_ID', 'Target'])
y = train_filtered.Target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

params_CB = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200]
          #'ctr_border_count':[50,5,10,20,100,200]
            }

#con train

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

cat_tuning = ctb.CatBoostClassifier()

random_search_CB=RandomizedSearchCV(cat_tuning,param_distributions=params_CB,n_iter=50, scoring='neg_log_loss',n_jobs=-1,cv=5,verbose=3)

random_search_CB.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  7.0min finished


0:	learn: 0.6870057	total: 16.5ms	remaining: 16.5s
1:	learn: 0.6799319	total: 27.9ms	remaining: 13.9s
2:	learn: 0.6731662	total: 39ms	remaining: 13s
3:	learn: 0.6668308	total: 49.9ms	remaining: 12.4s
4:	learn: 0.6601291	total: 61.1ms	remaining: 12.2s
5:	learn: 0.6532941	total: 73.5ms	remaining: 12.2s
6:	learn: 0.6475135	total: 89ms	remaining: 12.6s
7:	learn: 0.6408097	total: 108ms	remaining: 13.4s
8:	learn: 0.6346654	total: 122ms	remaining: 13.4s
9:	learn: 0.6299016	total: 135ms	remaining: 13.4s
10:	learn: 0.6236555	total: 146ms	remaining: 13.1s
11:	learn: 0.6180641	total: 157ms	remaining: 13s
12:	learn: 0.6129204	total: 170ms	remaining: 12.9s
13:	learn: 0.6077906	total: 183ms	remaining: 12.9s
14:	learn: 0.6024860	total: 194ms	remaining: 12.8s
15:	learn: 0.5974153	total: 206ms	remaining: 12.7s
16:	learn: 0.5925109	total: 219ms	remaining: 12.7s
17:	learn: 0.5882975	total: 236ms	remaining: 12.9s
18:	learn: 0.5835494	total: 256ms	remaining: 13.2s
19:	learn: 0.5786894	total: 267ms	remainin

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostClassifier object at 0x1220cc8e0>,
                   n_iter=50, n_jobs=-1,
                   param_distributions={'border_count': [32, 5, 10, 20, 50, 100,
                                                         200],
                                        'depth': [3, 1, 2, 6, 4, 5, 7, 8, 9,
                                                  10],
                                        'iterations': [250, 100, 500, 1000],
                                        'l2_leaf_reg': [3, 1, 5, 10, 100],
                                        'learning_rate': [0.03, 0.001, 0.01,
                                                          0.1, 0.2, 0.3]},
                   scoring='neg_log_loss', verbose=3)

In [91]:
log_loss(y_test, random_search_CB.predict_proba(X_test))

0.31147681987905507

In [92]:
test_filtered = test_set.loc[:, fts]

In [93]:
test_pred_CB = random_search_CB.predict_proba(test_filtered.drop(columns=['Opportunity_ID', 'Target']))

In [71]:
#test_set['Pred_Tuned'] = pd.DataFrame(test_pred_CB)[1].to_list()
#test_set

In [94]:
log_loss(test_set.Target, test_pred_CB)

0.4798298944157255

In [95]:
#final = test_set.loc[:, ['Opportunity_ID', 'Pred']]
#final.columns = ['Opportunity_ID', 'Target']
#final.loc[:, ['Opportunity_ID', 'Target']].to_csv('sub_cat.csv', index=False)

In [96]:
params_CB_full = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200]
          #'ctr_border_count':[50,5,10,20,100,200]
            }

#con train

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

cat_tuning_full = ctb.CatBoostClassifier()

random_search_CB_full=RandomizedSearchCV(cat_tuning_full,param_distributions=params_CB_full,n_iter=100, scoring='neg_log_loss',n_jobs=-1,cv=5,verbose=3)

random_search_CB_full.fit(X,y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   55.6s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 17.8min finished


0:	learn: 0.6873265	total: 11.6ms	remaining: 5.77s
1:	learn: 0.6817625	total: 19.4ms	remaining: 4.84s
2:	learn: 0.6757430	total: 27.3ms	remaining: 4.52s
3:	learn: 0.6695860	total: 35.7ms	remaining: 4.42s
4:	learn: 0.6628372	total: 43.4ms	remaining: 4.29s
5:	learn: 0.6568504	total: 51.1ms	remaining: 4.21s
6:	learn: 0.6513611	total: 59.9ms	remaining: 4.22s
7:	learn: 0.6457376	total: 68.1ms	remaining: 4.19s
8:	learn: 0.6402094	total: 78.1ms	remaining: 4.26s
9:	learn: 0.6348616	total: 89.1ms	remaining: 4.37s
10:	learn: 0.6302337	total: 99.8ms	remaining: 4.43s
11:	learn: 0.6255365	total: 108ms	remaining: 4.41s
12:	learn: 0.6210436	total: 116ms	remaining: 4.36s
13:	learn: 0.6159009	total: 125ms	remaining: 4.33s
14:	learn: 0.6116521	total: 133ms	remaining: 4.29s
15:	learn: 0.6077782	total: 141ms	remaining: 4.26s
16:	learn: 0.6030908	total: 150ms	remaining: 4.25s
17:	learn: 0.5988821	total: 158ms	remaining: 4.22s
18:	learn: 0.5948966	total: 165ms	remaining: 4.19s
19:	learn: 0.5913383	total: 17

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostClassifier object at 0x121be7d60>,
                   n_iter=100, n_jobs=-1,
                   param_distributions={'border_count': [32, 5, 10, 20, 50, 100,
                                                         200],
                                        'depth': [3, 1, 2, 6, 4, 5, 7, 8, 9,
                                                  10],
                                        'iterations': [250, 100, 500, 1000],
                                        'l2_leaf_reg': [3, 1, 5, 10, 100],
                                        'learning_rate': [0.03, 0.001, 0.01,
                                                          0.1, 0.2, 0.3]},
                   scoring='neg_log_loss', verbose=3)

In [97]:
test_pred_CB_full = random_search_CB_full.predict_proba(test_filtered.drop(columns=['Opportunity_ID', 'Target']))
test_pred_CB_full

array([[0.08134506, 0.91865494],
       [0.36898257, 0.63101743],
       [0.5975419 , 0.4024581 ],
       ...,
       [0.83749319, 0.16250681],
       [0.38535611, 0.61464389],
       [0.93592345, 0.06407655]])

In [98]:
#test_set['Pred_Tuned_Full'] = pd.DataFrame(test_pred_CB_full)[1].to_list()
#test_set

In [100]:
log_loss(test_set.Target, test_pred_CB_full)

0.4585537384629772

In [362]:
subm = test_set.loc[:, ['Opportunity_ID', 'Pred_Tuned_Full']]
subm.columns = ['Opportunity_ID', 'Target']
subm.loc[:, ['Opportunity_ID', 'Target']].to_csv('sub_cat.csv', index=False)

In [102]:
random_search_CB_full.best_params_

{'learning_rate': 0.01,
 'l2_leaf_reg': 5,
 'iterations': 500,
 'depth': 7,
 'border_count': 200}

In [103]:
best = {'learning_rate': 0.01,
 'l2_leaf_reg': 5,
 'iterations': 500,
 'depth': 7,
 'border_count': 200}

In [104]:
random_search_CB_full.best_score_

-0.393195058285038

In [105]:
test_set['Prediction'] = pd.DataFrame(test_pred_CB_full)[1].to_list()

In [108]:
test_set.loc[:, ['Opportunity_ID', 'Target', 'Prediction']]

Unnamed: 0,Opportunity_ID,Target,Prediction
0,10689,1,0.918655
3,10690,1,0.631017
8,10691,1,0.402458
9,10692,1,0.475270
15,10693,1,0.933088
...,...,...,...
2545,12364,1,0.949904
2547,12365,1,0.182283
2548,12366,1,0.162507
2549,12367,0,0.614644


In [109]:
test_set['Prediction_Aux'] = 1 - test_set.Prediction

In [111]:
test_set.loc[:, ['Opportunity_ID', 'Target', 'Prediction', 'Prediction_Aux']]

Unnamed: 0,Opportunity_ID,Target,Prediction,Prediction_Aux
0,10689,1,0.918655,0.081345
3,10690,1,0.631017,0.368983
8,10691,1,0.402458,0.597542
9,10692,1,0.475270,0.524730
15,10693,1,0.933088,0.066912
...,...,...,...,...
2545,12364,1,0.949904,0.050096
2547,12365,1,0.182283,0.817717
2548,12366,1,0.162507,0.837493
2549,12367,0,0.614644,0.385356


In [113]:
log_loss(test_set.Target, test_set.Prediction_Aux)

1.9721995873391562

In [114]:
submission = test_set.loc[:, ['Opportunity_ID', 'Prediction']]
submission.columns = ['Opportunity_ID', 'Target']

In [115]:
submission.to_csv('sub_cat.csv', index=False)