In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

## Make the date column in the original feature list into an index

In [5]:
data = pd.read_csv('Train.csv', 
                   index_col='date')
data.head()

Unnamed: 0_level_0,acc,PID
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-12-13 11:38:33.226,256787571627,80G0
2019-11-12 11:46:23.755,256789598703,WLIW
2019-11-10 06:12:27.728,256753510223,U1DD
2019-12-20 13:34:17.453,256702652564,T0LZ
2019-11-15 09:16:06.591,256704005298,80G0


In [6]:
data.columns

Index(['acc', 'PID'], dtype='object')

### Columns are just acc for individual accounts, and PID for product categories

### Sample Data View

In [7]:
train = pd.read_csv('Train.csv', 
                   parse_dates = ['date'])
train.head()

Unnamed: 0,date,acc,PID
0,2019-12-13 11:38:33.226,256787571627,80G0
1,2019-11-12 11:46:23.755,256789598703,WLIW
2,2019-11-10 06:12:27.728,256753510223,U1DD
3,2019-12-20 13:34:17.453,256702652564,T0LZ
4,2019-11-15 09:16:06.591,256704005298,80G0


In [8]:
train.columns

Index(['date', 'acc', 'PID'], dtype='object')

In [9]:
data['PID'].unique().shape

(34,)

In [10]:
train['PID'].unique().shape

(34,)

### Splitting into training and validation data

In [11]:
# Train on purchases prior to 2020
local_train = train.loc[train['date'] < '2020-01-01']
# train = train.loc[train['date'] >= '2017-01-01']

# Test locally on just 2020
local_test = train.loc[train['date'] > '2020-01-01']
# local_test = local_test.loc[local_test['date'] >= '2018-09-01']

In [12]:
pids = train['PID'].unique()

### EXploratory Data Analysis steps

In [13]:
pids

array(['80G0', 'WLIW', 'U1DD', 'T0LZ', '6Q4Z', '5YR6', 'P7R7', 'Q9SJ',
       'DJGS', '3USE', 'T2MU', 'YEPH', '49DG', '0ZFO', '2F3O', 'CPEH',
       '93AI', '5OH5', 'AIT3', 'ZDTG', 'YDAG', '7IPS', 'G1PD', 'FXJ0',
       'HQ1F', 'PNYC', 'P4IQ', 'UE6G', '8DYO', 'AWYV', 'NFDD', 'B5DH',
       'XGSZ', 'WZF8'], dtype=object)

In [14]:
dts = pd.date_range('2019-11-01',
                    '2020-02-23',
                    freq="1d")

In [15]:
dts

DatetimeIndex(['2019-11-01', '2019-11-02', '2019-11-03', '2019-11-04',
               '2019-11-05', '2019-11-06', '2019-11-07', '2019-11-08',
               '2019-11-09', '2019-11-10',
               ...
               '2020-02-14', '2020-02-15', '2020-02-16', '2020-02-17',
               '2020-02-18', '2020-02-19', '2020-02-20', '2020-02-21',
               '2020-02-22', '2020-02-23'],
              dtype='datetime64[ns]', length=115, freq='D')

In [16]:
len(dts)

115

In [17]:
tr = pd.DataFrame({'datetime':dts})

In [18]:
tr

Unnamed: 0,datetime
0,2019-11-01
1,2019-11-02
2,2019-11-03
3,2019-11-04
4,2019-11-05
...,...
110,2020-02-19
111,2020-02-20
112,2020-02-21
113,2020-02-22


### Reshaping the data

Obtaining a table of all pids, dates and whether or not a product was purchased on a certain day

In [19]:
for pid in pids:
    tr[str(pid)] = 0
    purchases = train.loc[train['PID'] == pid]
    dts = purchases['date'].dt.round('D')
    dates = dts.astype(str).unique()
    tr.loc[tr['datetime'].isin(dates), pid] = 1

In [20]:
tr.head()

Unnamed: 0,datetime,80G0,WLIW,U1DD,T0LZ,6Q4Z,5YR6,P7R7,Q9SJ,DJGS,...,HQ1F,PNYC,P4IQ,UE6G,8DYO,AWYV,NFDD,B5DH,XGSZ,WZF8
0,2019-11-01,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2019-11-02,1,1,1,1,1,1,1,1,1,...,0,0,1,0,0,0,0,0,0,0
2,2019-11-03,1,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2019-11-04,1,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
4,2019-11-05,1,1,1,1,1,1,1,0,1,...,0,0,1,0,0,0,0,0,0,0


### Preprocessing on train dataframe.

`datetime` and `y` features are also created

In [21]:
train = pd.DataFrame({
    'date x PID':np.concatenate([[str(x) + " x " + str(c) 
                                             for c in pids] 
                                            for x in tr['datetime']]),
    'datetime':np.concatenate([[str(x) for c in pids] for x in tr['datetime']]),
    'PID':np.concatenate([[str(c) for c in pids] for x in tr['datetime']]),
    'y':tr[pids].values.flatten()
})
train.head()

Unnamed: 0,date x PID,datetime,PID,y
0,2019-11-01 00:00:00 x 80G0,2019-11-01 00:00:00,80G0,1
1,2019-11-01 00:00:00 x WLIW,2019-11-01 00:00:00,WLIW,1
2,2019-11-01 00:00:00 x U1DD,2019-11-01 00:00:00,U1DD,1
3,2019-11-01 00:00:00 x T0LZ,2019-11-01 00:00:00,T0LZ,1
4,2019-11-01 00:00:00 x 6Q4Z,2019-11-01 00:00:00,6Q4Z,1


### Preprocessing Testing and Training Data

In [23]:
dts = pd.date_range('2020-01-01','2020-02-23',
                    freq="1d")
tr = pd.DataFrame({'datetime':dts})

for pid in pids:
    tr[str(pid)] = 0
    purchases = local_test.loc[local_test['PID'] == pid]
    dts = purchases['date'].dt.round('D')
    dates = dts.astype(str).unique()
    tr.loc[tr['datetime'].isin(dates), pid] = 1
    
test = pd.DataFrame({
    'date x PID':np.concatenate([[str(x) + " x " + str(c) 
                                             for c in pids] 
                                            for x in tr['datetime']]),
    'datetime':np.concatenate([[str(x) for c in pids] for x in tr['datetime']]),
    'PID':np.concatenate([[str(c) for c in pids] for x in tr['datetime']]),
    'y':tr[pids].values.flatten()
})
local_test.head()

Unnamed: 0,date,acc,PID
61,2020-01-01 05:30:01.403,256701433663,WLIW
242,2020-01-01 17:52:17.837,256701433663,WLIW
301,2020-01-01 13:27:04.937,256793175446,80G0
327,2020-01-01 08:50:00.816,256704708797,WLIW
443,2020-01-01 20:14:51.184,256756918896,WLIW


In [25]:
test

Unnamed: 0,date x PID,datetime,PID,y
0,2020-01-01 00:00:00 x 80G0,2020-01-01 00:00:00,80G0,1
1,2020-01-01 00:00:00 x WLIW,2020-01-01 00:00:00,WLIW,1
2,2020-01-01 00:00:00 x U1DD,2020-01-01 00:00:00,U1DD,1
3,2020-01-01 00:00:00 x T0LZ,2020-01-01 00:00:00,T0LZ,1
4,2020-01-01 00:00:00 x 6Q4Z,2020-01-01 00:00:00,6Q4Z,1
...,...,...,...,...
1831,2020-02-23 00:00:00 x AWYV,2020-02-23 00:00:00,AWYV,0
1832,2020-02-23 00:00:00 x NFDD,2020-02-23 00:00:00,NFDD,0
1833,2020-02-23 00:00:00 x B5DH,2020-02-23 00:00:00,B5DH,0
1834,2020-02-23 00:00:00 x XGSZ,2020-02-23 00:00:00,XGSZ,0


In [26]:
train.head()

Unnamed: 0,date x PID,datetime,PID,y
0,2019-11-01 00:00:00 x 80G0,2019-11-01 00:00:00,80G0,1
1,2019-11-01 00:00:00 x WLIW,2019-11-01 00:00:00,WLIW,1
2,2019-11-01 00:00:00 x U1DD,2019-11-01 00:00:00,U1DD,1
3,2019-11-01 00:00:00 x T0LZ,2019-11-01 00:00:00,T0LZ,1
4,2019-11-01 00:00:00 x 6Q4Z,2019-11-01 00:00:00,6Q4Z,1


In [27]:
train

Unnamed: 0,date x PID,datetime,PID,y
0,2019-11-01 00:00:00 x 80G0,2019-11-01 00:00:00,80G0,1
1,2019-11-01 00:00:00 x WLIW,2019-11-01 00:00:00,WLIW,1
2,2019-11-01 00:00:00 x U1DD,2019-11-01 00:00:00,U1DD,1
3,2019-11-01 00:00:00 x T0LZ,2019-11-01 00:00:00,T0LZ,1
4,2019-11-01 00:00:00 x 6Q4Z,2019-11-01 00:00:00,6Q4Z,1
...,...,...,...,...
3905,2020-02-23 00:00:00 x AWYV,2020-02-23 00:00:00,AWYV,0
3906,2020-02-23 00:00:00 x NFDD,2020-02-23 00:00:00,NFDD,0
3907,2020-02-23 00:00:00 x B5DH,2020-02-23 00:00:00,B5DH,0
3908,2020-02-23 00:00:00 x XGSZ,2020-02-23 00:00:00,XGSZ,0


### Feature Engineering. Obtaining 'datetime' and 'day' features

In [28]:
train['datetime'] = pd.to_datetime(train['datetime'])

In [29]:
type(train['datetime'].dt)

pandas.core.indexes.accessors.DatetimeProperties

In [30]:
train['datetime'].dt.name

'datetime'

In [31]:
train['day'] = train['datetime'].dt.day_name()

In [32]:
train.head()

Unnamed: 0,date x PID,datetime,PID,y,day
0,2019-11-01 00:00:00 x 80G0,2019-11-01,80G0,1,Friday
1,2019-11-01 00:00:00 x WLIW,2019-11-01,WLIW,1,Friday
2,2019-11-01 00:00:00 x U1DD,2019-11-01,U1DD,1,Friday
3,2019-11-01 00:00:00 x T0LZ,2019-11-01,T0LZ,1,Friday
4,2019-11-01 00:00:00 x 6Q4Z,2019-11-01,6Q4Z,1,Friday


In [39]:
train

Unnamed: 0,date x PID,datetime,PID,y,day
0,2019-11-01 00:00:00 x 80G0,2019-11-01,80G0,1,Friday
1,2019-11-01 00:00:00 x WLIW,2019-11-01,WLIW,1,Friday
2,2019-11-01 00:00:00 x U1DD,2019-11-01,U1DD,1,Friday
3,2019-11-01 00:00:00 x T0LZ,2019-11-01,T0LZ,1,Friday
4,2019-11-01 00:00:00 x 6Q4Z,2019-11-01,6Q4Z,1,Friday
...,...,...,...,...,...
3905,2020-02-23 00:00:00 x AWYV,2020-02-23,AWYV,0,Sunday
3906,2020-02-23 00:00:00 x NFDD,2020-02-23,NFDD,0,Sunday
3907,2020-02-23 00:00:00 x B5DH,2020-02-23,B5DH,0,Sunday
3908,2020-02-23 00:00:00 x XGSZ,2020-02-23,XGSZ,0,Sunday


## Catboost model definition and Training

In [40]:
model = CatBoostClassifier(iterations=150, 
                           loss_function='Logloss', 
                           verbose=False) 

x_cols = ['day', 'PID']
cat_cols = ['day', 'PID']

model.fit(train[x_cols], train['y'], cat_features=cat_cols) # Takes about 

<catboost.core.CatBoostClassifier at 0x11c7f0610>

### Initial test with log-loss metric since it is a bit similar to F1

In [41]:
from sklearn.metrics import log_loss
log_loss(train['y'], model.predict_proba(train[x_cols])[:, 1])

0.29639413248255636

In [42]:
log_loss(train['y'], [0 for y in train['y']])

15.997371880098015

In [43]:
test['datetime'] = pd.to_datetime(test['datetime'])
test['day'] = test['datetime'].dt.day_name()

In [45]:
test

Unnamed: 0,date x PID,datetime,PID,y,day
0,2020-01-01 00:00:00 x 80G0,2020-01-01,80G0,1,Wednesday
1,2020-01-01 00:00:00 x WLIW,2020-01-01,WLIW,1,Wednesday
2,2020-01-01 00:00:00 x U1DD,2020-01-01,U1DD,1,Wednesday
3,2020-01-01 00:00:00 x T0LZ,2020-01-01,T0LZ,1,Wednesday
4,2020-01-01 00:00:00 x 6Q4Z,2020-01-01,6Q4Z,1,Wednesday
...,...,...,...,...,...
1831,2020-02-23 00:00:00 x AWYV,2020-02-23,AWYV,0,Sunday
1832,2020-02-23 00:00:00 x NFDD,2020-02-23,NFDD,0,Sunday
1833,2020-02-23 00:00:00 x B5DH,2020-02-23,B5DH,0,Sunday
1834,2020-02-23 00:00:00 x XGSZ,2020-02-23,XGSZ,0,Sunday


In [46]:
log_loss(test['y'], model.predict_proba(test[x_cols])[:, 1])

0.2904685125184966

## Testing with the F1 Score/Metric

In [47]:
from sklearn.metrics import f1_score
f1_score(test['y'], model.predict(test[x_cols]))

0.8727936701156421

In [48]:
test['pred'] = model.predict_proba(test[x_cols])[:,1]
test['gt005'] = (test['pred']>0.005).astype(int)
test.head()

Unnamed: 0,date x PID,datetime,PID,y,day,pred,gt005
0,2020-01-01 00:00:00 x 80G0,2020-01-01,80G0,1,Wednesday,0.989958,1
1,2020-01-01 00:00:00 x WLIW,2020-01-01,WLIW,1,Wednesday,0.989958,1
2,2020-01-01 00:00:00 x U1DD,2020-01-01,U1DD,1,Wednesday,0.989958,1
3,2020-01-01 00:00:00 x T0LZ,2020-01-01,T0LZ,1,Wednesday,0.989958,1
4,2020-01-01 00:00:00 x 6Q4Z,2020-01-01,6Q4Z,1,Wednesday,0.949518,1


In [49]:
f1_score(test['y'], test['gt005'])

0.6324022346368715

In [50]:
test['gt0005'] = (test['pred']>0.0005).astype(int)
f1_score(test['y'], test['gt0005'])

0.6324022346368715

In [51]:
test['gt05'] = (test['pred']>0.05).astype(int)
f1_score(test['y'], test['gt05'])

0.6833400564743849

In [52]:
dts = pd.date_range('2020-02-23 01:00:00',
                    '2020-02-29 23:00:00',
                    freq="1d")
tr = pd.DataFrame({'datetime':dts})

for pid in pids:
    tr[str(pid)] = 0
    
ss = pd.DataFrame({
    'Account X date X PID':np.concatenate([[str(x) + " x " + str(c)  
                                            for x in tr['datetime']for c in pids]]),
    'datetime':np.concatenate([[str(x) for x in tr['datetime']for c in pids]]),
    'PID':np.concatenate([[str(c) for x in tr['datetime']for c in pids]])
})
ss.head()

Unnamed: 0,Account X date X PID,datetime,PID
0,2020-02-23 01:00:00 x 80G0,2020-02-23 01:00:00,80G0
1,2020-02-23 01:00:00 x WLIW,2020-02-23 01:00:00,WLIW
2,2020-02-23 01:00:00 x U1DD,2020-02-23 01:00:00,U1DD
3,2020-02-23 01:00:00 x T0LZ,2020-02-23 01:00:00,T0LZ
4,2020-02-23 01:00:00 x 6Q4Z,2020-02-23 01:00:00,6Q4Z


In [53]:
ss.shape

(238, 3)

In [54]:
ss['datetime'] = pd.to_datetime(ss['datetime'])
ss['day'] = ss['datetime'].dt.day_name()

In [55]:
ss

Unnamed: 0,Account X date X PID,datetime,PID,day
0,2020-02-23 01:00:00 x 80G0,2020-02-23 01:00:00,80G0,Sunday
1,2020-02-23 01:00:00 x WLIW,2020-02-23 01:00:00,WLIW,Sunday
2,2020-02-23 01:00:00 x U1DD,2020-02-23 01:00:00,U1DD,Sunday
3,2020-02-23 01:00:00 x T0LZ,2020-02-23 01:00:00,T0LZ,Sunday
4,2020-02-23 01:00:00 x 6Q4Z,2020-02-23 01:00:00,6Q4Z,Sunday
...,...,...,...,...
233,2020-02-29 01:00:00 x AWYV,2020-02-29 01:00:00,AWYV,Saturday
234,2020-02-29 01:00:00 x NFDD,2020-02-29 01:00:00,NFDD,Saturday
235,2020-02-29 01:00:00 x B5DH,2020-02-29 01:00:00,B5DH,Saturday
236,2020-02-29 01:00:00 x XGSZ,2020-02-29 01:00:00,XGSZ,Saturday


In [56]:
ss['prediction'] = 0

In [57]:
ss

Unnamed: 0,Account X date X PID,datetime,PID,day,prediction
0,2020-02-23 01:00:00 x 80G0,2020-02-23 01:00:00,80G0,Sunday,0
1,2020-02-23 01:00:00 x WLIW,2020-02-23 01:00:00,WLIW,Sunday,0
2,2020-02-23 01:00:00 x U1DD,2020-02-23 01:00:00,U1DD,Sunday,0
3,2020-02-23 01:00:00 x T0LZ,2020-02-23 01:00:00,T0LZ,Sunday,0
4,2020-02-23 01:00:00 x 6Q4Z,2020-02-23 01:00:00,6Q4Z,Sunday,0
...,...,...,...,...,...
233,2020-02-29 01:00:00 x AWYV,2020-02-29 01:00:00,AWYV,Saturday,0
234,2020-02-29 01:00:00 x NFDD,2020-02-29 01:00:00,NFDD,Saturday,0
235,2020-02-29 01:00:00 x B5DH,2020-02-29 01:00:00,B5DH,Saturday,0
236,2020-02-29 01:00:00 x XGSZ,2020-02-29 01:00:00,XGSZ,Saturday,0


In [58]:
ss['prediction'] = model.predict_proba(ss[x_cols])[:, 1] 

In [59]:
ss

Unnamed: 0,Account X date X PID,datetime,PID,day,prediction
0,2020-02-23 01:00:00 x 80G0,2020-02-23 01:00:00,80G0,Sunday,0.987364
1,2020-02-23 01:00:00 x WLIW,2020-02-23 01:00:00,WLIW,Sunday,0.987364
2,2020-02-23 01:00:00 x U1DD,2020-02-23 01:00:00,U1DD,Sunday,0.987364
3,2020-02-23 01:00:00 x T0LZ,2020-02-23 01:00:00,T0LZ,Sunday,0.987364
4,2020-02-23 01:00:00 x 6Q4Z,2020-02-23 01:00:00,6Q4Z,Sunday,0.942591
...,...,...,...,...,...
233,2020-02-29 01:00:00 x AWYV,2020-02-29 01:00:00,AWYV,Saturday,0.040276
234,2020-02-29 01:00:00 x NFDD,2020-02-29 01:00:00,NFDD,Saturday,0.040276
235,2020-02-29 01:00:00 x B5DH,2020-02-29 01:00:00,B5DH,Saturday,0.154886
236,2020-02-29 01:00:00 x XGSZ,2020-02-29 01:00:00,XGSZ,Saturday,0.040276


In [60]:
ss['prediction'] = (ss['prediction']>0.05).astype(int)

In [61]:
ss

Unnamed: 0,Account X date X PID,datetime,PID,day,prediction
0,2020-02-23 01:00:00 x 80G0,2020-02-23 01:00:00,80G0,Sunday,1
1,2020-02-23 01:00:00 x WLIW,2020-02-23 01:00:00,WLIW,Sunday,1
2,2020-02-23 01:00:00 x U1DD,2020-02-23 01:00:00,U1DD,Sunday,1
3,2020-02-23 01:00:00 x T0LZ,2020-02-23 01:00:00,T0LZ,Sunday,1
4,2020-02-23 01:00:00 x 6Q4Z,2020-02-23 01:00:00,6Q4Z,Sunday,1
...,...,...,...,...,...
233,2020-02-29 01:00:00 x AWYV,2020-02-29 01:00:00,AWYV,Saturday,0
234,2020-02-29 01:00:00 x NFDD,2020-02-29 01:00:00,NFDD,Saturday,0
235,2020-02-29 01:00:00 x B5DH,2020-02-29 01:00:00,B5DH,Saturday,1
236,2020-02-29 01:00:00 x XGSZ,2020-02-29 01:00:00,XGSZ,Saturday,0


In [62]:
len(ss['prediction'].unique())

2

In [63]:
ss.shape

(238, 5)

In [64]:
ss[['Account X date X PID', 'prediction']].to_csv('test_submit.csv', index=False)

In [65]:
ss

Unnamed: 0,Account X date X PID,datetime,PID,day,prediction
0,2020-02-23 01:00:00 x 80G0,2020-02-23 01:00:00,80G0,Sunday,1
1,2020-02-23 01:00:00 x WLIW,2020-02-23 01:00:00,WLIW,Sunday,1
2,2020-02-23 01:00:00 x U1DD,2020-02-23 01:00:00,U1DD,Sunday,1
3,2020-02-23 01:00:00 x T0LZ,2020-02-23 01:00:00,T0LZ,Sunday,1
4,2020-02-23 01:00:00 x 6Q4Z,2020-02-23 01:00:00,6Q4Z,Sunday,1
...,...,...,...,...,...
233,2020-02-29 01:00:00 x AWYV,2020-02-29 01:00:00,AWYV,Saturday,0
234,2020-02-29 01:00:00 x NFDD,2020-02-29 01:00:00,NFDD,Saturday,0
235,2020-02-29 01:00:00 x B5DH,2020-02-29 01:00:00,B5DH,Saturday,1
236,2020-02-29 01:00:00 x XGSZ,2020-02-29 01:00:00,XGSZ,Saturday,0


## Reading in test data set to make predictions

In [66]:
sample = pd.read_csv('SampleSubmission.csv')

In [67]:
sample.head()

Unnamed: 0,Account X date X PID,Prediction
0,256786974320 X 2020-02-23 X 5YR6,0
1,256786974320 X 2020-02-23 X WLIW,0
2,256786974320 X 2020-02-23 X U1DD,0
3,256786974320 X 2020-02-23 X T0LZ,0
4,256786974320 X 2020-02-23 X 80G0,0


In [68]:
ss

Unnamed: 0,Account X date X PID,datetime,PID,day,prediction
0,2020-02-23 01:00:00 x 80G0,2020-02-23 01:00:00,80G0,Sunday,1
1,2020-02-23 01:00:00 x WLIW,2020-02-23 01:00:00,WLIW,Sunday,1
2,2020-02-23 01:00:00 x U1DD,2020-02-23 01:00:00,U1DD,Sunday,1
3,2020-02-23 01:00:00 x T0LZ,2020-02-23 01:00:00,T0LZ,Sunday,1
4,2020-02-23 01:00:00 x 6Q4Z,2020-02-23 01:00:00,6Q4Z,Sunday,1
...,...,...,...,...,...
233,2020-02-29 01:00:00 x AWYV,2020-02-29 01:00:00,AWYV,Saturday,0
234,2020-02-29 01:00:00 x NFDD,2020-02-29 01:00:00,NFDD,Saturday,0
235,2020-02-29 01:00:00 x B5DH,2020-02-29 01:00:00,B5DH,Saturday,1
236,2020-02-29 01:00:00 x XGSZ,2020-02-29 01:00:00,XGSZ,Saturday,0


In [111]:
ss['prediction'] = (ss['prediction']>0.05).astype(int)

In [69]:
sample['Prediction'] = (sample['Prediction']>0.00001).astype(int)

In [70]:
sample

Unnamed: 0,Account X date X PID,Prediction
0,256786974320 X 2020-02-23 X 5YR6,0
1,256786974320 X 2020-02-23 X WLIW,0
2,256786974320 X 2020-02-23 X U1DD,0
3,256786974320 X 2020-02-23 X T0LZ,0
4,256786974320 X 2020-02-23 X 80G0,0
...,...,...
111043,256704700785 X 2020-02-29 X P4IQ,0
111044,256704700785 X 2020-02-29 X PNYC,0
111045,256704700785 X 2020-02-29 X YDAG,0
111046,256704700785 X 2020-02-29 X ZDTG,0


In [72]:
sample['Prediction'].unique()[0]

0

In [73]:
sample

Unnamed: 0,Account X date X PID,Prediction
0,256786974320 X 2020-02-23 X 5YR6,0
1,256786974320 X 2020-02-23 X WLIW,0
2,256786974320 X 2020-02-23 X U1DD,0
3,256786974320 X 2020-02-23 X T0LZ,0
4,256786974320 X 2020-02-23 X 80G0,0
...,...,...
111043,256704700785 X 2020-02-29 X P4IQ,0
111044,256704700785 X 2020-02-29 X PNYC,0
111045,256704700785 X 2020-02-29 X YDAG,0
111046,256704700785 X 2020-02-29 X ZDTG,0


In [74]:
sa = sample['Account X date X PID'].str.split('X ', 2)[2][-1]
sa

'U1DD'

In [75]:
sample['PID'] = sample['Account X date X PID'].str.split('X ')[2][-1]
sample['datetime'] = sample['Account X date X PID'].str.split('X ')[2][-2]
sample['acc'] = sample['Account X date X PID'].str.split('X ')[2][-3]


In [76]:
sample.head()

Unnamed: 0,Account X date X PID,Prediction,PID,datetime,acc
0,256786974320 X 2020-02-23 X 5YR6,0,U1DD,2020-02-23,256786974320
1,256786974320 X 2020-02-23 X WLIW,0,U1DD,2020-02-23,256786974320
2,256786974320 X 2020-02-23 X U1DD,0,U1DD,2020-02-23,256786974320
3,256786974320 X 2020-02-23 X T0LZ,0,U1DD,2020-02-23,256786974320
4,256786974320 X 2020-02-23 X 80G0,0,U1DD,2020-02-23,256786974320


In [77]:
sample[['acc','datetime', 'PID']] = sample['Account X date X PID'].str.split(" X ",expand=True,)

In [78]:
sample

Unnamed: 0,Account X date X PID,Prediction,PID,datetime,acc
0,256786974320 X 2020-02-23 X 5YR6,0,5YR6,2020-02-23,256786974320
1,256786974320 X 2020-02-23 X WLIW,0,WLIW,2020-02-23,256786974320
2,256786974320 X 2020-02-23 X U1DD,0,U1DD,2020-02-23,256786974320
3,256786974320 X 2020-02-23 X T0LZ,0,T0LZ,2020-02-23,256786974320
4,256786974320 X 2020-02-23 X 80G0,0,80G0,2020-02-23,256786974320
...,...,...,...,...,...
111043,256704700785 X 2020-02-29 X P4IQ,0,P4IQ,2020-02-29,256704700785
111044,256704700785 X 2020-02-29 X PNYC,0,PNYC,2020-02-29,256704700785
111045,256704700785 X 2020-02-29 X YDAG,0,YDAG,2020-02-29,256704700785
111046,256704700785 X 2020-02-29 X ZDTG,0,ZDTG,2020-02-29,256704700785


In [79]:
sample['datetime'] = pd.to_datetime(sample['datetime'])

In [80]:
sample['day'] = sample['datetime'].dt.day_name()

In [81]:
sample

Unnamed: 0,Account X date X PID,Prediction,PID,datetime,acc,day
0,256786974320 X 2020-02-23 X 5YR6,0,5YR6,2020-02-23,256786974320,Sunday
1,256786974320 X 2020-02-23 X WLIW,0,WLIW,2020-02-23,256786974320,Sunday
2,256786974320 X 2020-02-23 X U1DD,0,U1DD,2020-02-23,256786974320,Sunday
3,256786974320 X 2020-02-23 X T0LZ,0,T0LZ,2020-02-23,256786974320,Sunday
4,256786974320 X 2020-02-23 X 80G0,0,80G0,2020-02-23,256786974320,Sunday
...,...,...,...,...,...,...
111043,256704700785 X 2020-02-29 X P4IQ,0,P4IQ,2020-02-29,256704700785,Saturday
111044,256704700785 X 2020-02-29 X PNYC,0,PNYC,2020-02-29,256704700785,Saturday
111045,256704700785 X 2020-02-29 X YDAG,0,YDAG,2020-02-29,256704700785,Saturday
111046,256704700785 X 2020-02-29 X ZDTG,0,ZDTG,2020-02-29,256704700785,Saturday


### Initial Predictions

In [82]:
ss['Prediction'] = model.predict_proba(ss[x_cols])[:, 1] 

In [84]:
sample['prediction'] = model.predict_proba(sample[x_cols])[:, 1] 

In [85]:
sample

Unnamed: 0,Account X date X PID,Prediction,PID,datetime,acc,day,prediction
0,256786974320 X 2020-02-23 X 5YR6,0,5YR6,2020-02-23,256786974320,Sunday,0.852201
1,256786974320 X 2020-02-23 X WLIW,0,WLIW,2020-02-23,256786974320,Sunday,0.987364
2,256786974320 X 2020-02-23 X U1DD,0,U1DD,2020-02-23,256786974320,Sunday,0.987364
3,256786974320 X 2020-02-23 X T0LZ,0,T0LZ,2020-02-23,256786974320,Sunday,0.987364
4,256786974320 X 2020-02-23 X 80G0,0,80G0,2020-02-23,256786974320,Sunday,0.987364
...,...,...,...,...,...,...,...
111043,256704700785 X 2020-02-29 X P4IQ,0,P4IQ,2020-02-29,256704700785,Saturday,0.271232
111044,256704700785 X 2020-02-29 X PNYC,0,PNYC,2020-02-29,256704700785,Saturday,0.077854
111045,256704700785 X 2020-02-29 X YDAG,0,YDAG,2020-02-29,256704700785,Saturday,0.192952
111046,256704700785 X 2020-02-29 X ZDTG,0,ZDTG,2020-02-29,256704700785,Saturday,0.077854


In [86]:
sample['prediction'] = model.predict_proba(sample[x_cols])[:, 1]

In [87]:
sample

Unnamed: 0,Account X date X PID,Prediction,PID,datetime,acc,day,prediction
0,256786974320 X 2020-02-23 X 5YR6,0,5YR6,2020-02-23,256786974320,Sunday,0.852201
1,256786974320 X 2020-02-23 X WLIW,0,WLIW,2020-02-23,256786974320,Sunday,0.987364
2,256786974320 X 2020-02-23 X U1DD,0,U1DD,2020-02-23,256786974320,Sunday,0.987364
3,256786974320 X 2020-02-23 X T0LZ,0,T0LZ,2020-02-23,256786974320,Sunday,0.987364
4,256786974320 X 2020-02-23 X 80G0,0,80G0,2020-02-23,256786974320,Sunday,0.987364
...,...,...,...,...,...,...,...
111043,256704700785 X 2020-02-29 X P4IQ,0,P4IQ,2020-02-29,256704700785,Saturday,0.271232
111044,256704700785 X 2020-02-29 X PNYC,0,PNYC,2020-02-29,256704700785,Saturday,0.077854
111045,256704700785 X 2020-02-29 X YDAG,0,YDAG,2020-02-29,256704700785,Saturday,0.192952
111046,256704700785 X 2020-02-29 X ZDTG,0,ZDTG,2020-02-29,256704700785,Saturday,0.077854


### Picking Model Thresholds to determine if customer would make purchase or not

In [90]:
sample['Prediction'] = (sample['prediction']>0.99).astype(int)

In [91]:
sample

Unnamed: 0,Account X date X PID,Prediction,PID,datetime,acc,day,prediction
0,256786974320 X 2020-02-23 X 5YR6,0,5YR6,2020-02-23,256786974320,Sunday,0.852201
1,256786974320 X 2020-02-23 X WLIW,0,WLIW,2020-02-23,256786974320,Sunday,0.987364
2,256786974320 X 2020-02-23 X U1DD,0,U1DD,2020-02-23,256786974320,Sunday,0.987364
3,256786974320 X 2020-02-23 X T0LZ,0,T0LZ,2020-02-23,256786974320,Sunday,0.987364
4,256786974320 X 2020-02-23 X 80G0,0,80G0,2020-02-23,256786974320,Sunday,0.987364
...,...,...,...,...,...,...,...
111043,256704700785 X 2020-02-29 X P4IQ,0,P4IQ,2020-02-29,256704700785,Saturday,0.271232
111044,256704700785 X 2020-02-29 X PNYC,0,PNYC,2020-02-29,256704700785,Saturday,0.077854
111045,256704700785 X 2020-02-29 X YDAG,0,YDAG,2020-02-29,256704700785,Saturday,0.192952
111046,256704700785 X 2020-02-29 X ZDTG,0,ZDTG,2020-02-29,256704700785,Saturday,0.077854


### Final Result file (predictions made)

In [94]:
final = sample[['Account X date X PID', 'Prediction']].to_csv('result.csv', index=False)