# A try out using Compact Prediction Tree

We approach this challenge as a sequence-prediction problem and try to solve it with the Compact Prediction Tree (CPT) algorithm.

We use the CPT library (https://github.com/analyticsvidhya/CPT) to run the algorithm on our data.

## 1. Data Preparation

In [57]:
import numpy as np
import pandas as pd
import time
from collections import Counter
import sqlite3
from tqdm import tqdm
from PredictionTree import *
from CPT import *

In [58]:
# Load the `triplets` and `targets` tables from the SQLite training database.
conn = sqlite3.connect('train_data.db')
try:
    triplets = pd.read_sql_query("SELECT * FROM triplets", conn)
    targets = pd.read_sql_query("SELECT * FROM targets", conn)
finally:
    # Close explicitly so the database handle is not left open for the rest
    # of the kernel session (also when one of the queries fails).
    conn.close()

In [59]:
# Preview the first row of the `triplets` table.
triplets[:1]

Unnamed: 0,id,product_1,product_2,product_3,count
0,1,9763,10007,11938,384


In [60]:
# Preview the first row of the `targets` table.
targets[:1]

Unnamed: 0,id,product,count,triplet_id
0,1,9664,54,1


In [61]:
# One-row preview of `triplets` via DataFrame.head.
triplets.head(1)

Unnamed: 0,id,product_1,product_2,product_3,count
0,1,9763,10007,11938,384


In [62]:
# One-row preview of `targets` via DataFrame.head.
targets.head(1)

Unnamed: 0,id,product,count,triplet_id
0,1,9664,54,1


In [63]:
# Keep the triplet key and its three product columns (this leaves out `count`).
triplet_columns = ['id', 'product_1', 'product_2', 'product_3']
newtriplets = triplets[triplet_columns]

In [64]:
# Re-key the targets by the triplet they refer to: discard the targets' own
# `id` column and expose `triplet_id` under the name `id` so that it can be
# used as the join key. `targets` itself is left unmodified.
newtarget = targets.drop(columns=['id']).rename(columns={"triplet_id": 'id'})
newtarget.shape

(100000, 3)

In [65]:
# Attach the three products of the matching triplet to every target row
# (left join on `id`), rename the target product to `next`, and keep only the
# four columns that form a sequence: product_1, product_2, product_3, next.
df = (
    newtarget
    .merge(newtriplets, how='left', on=['id'])
    .rename(columns={'product': 'next'})
    .loc[:, ['product_1', 'product_2', 'product_3', 'next']]
)
df.shape

(100000, 4)

In [66]:
# Preview the first merged sequence row.
df[:1]

Unnamed: 0,product_1,product_2,product_3,next
0,9763,10007,11938,9664


In [67]:
# Turn every row of `df` into a plain Python list
# [product_1, product_2, product_3, next].
# Converting the whole frame in one call replaces the original row-by-row
# DataFrame.iterrows() loop, which built a Series for each row only to unpack
# it into a list again.
dataset = df.to_numpy().tolist()
    

In [68]:
# Inspect the first sequence of `dataset`.
dataset[:1]

[['9763', '10007', '11938', '9664']]

In [69]:
# Build the dataset in the form required by the CPT model:
# one NumPy array per sequence.
dataset = list(map(np.array, dataset))

In [70]:
# Training split: every second record of `dataset`, starting at position 0
# (positions 0, 2, 4, ...).
trainset = dataset[::2]

In [71]:
# Inspect the first training sequence.
trainset[:1]

[array(['9763', '10007', '11938', '9664'], dtype='<U5')]

In [72]:
# Test split: the remaining records of `dataset`, i.e. every second record
# starting at position 1 (positions 1, 3, 5, ...).
testset = dataset[1::2]

In [73]:
# Inspect the first test sequence.
testset[:1]

[array(['9763', '10007', '11938', '10129'], dtype='<U5')]

In [74]:
# Sanity check: number of training sequences produced by the [::2] split.
len(trainset)

50000

In [75]:
# Sanity check: number of test sequences produced by the [1::2] split.
len(testset)

50000

In [76]:
# True labels: the `next` column laid out as one row per triplet with 10
# label columns.
# NOTE(review): this is only meaningful if `df` holds exactly 10 consecutive
# rows for every triplet, in the same order as `triplets` - confirm.
ytest = np.array(df['next']).reshape(len(triplets), 10)

In [77]:
# Make sure every training sequence is a NumPy array.
trainset = list(map(np.array, trainset))

In [78]:
# Keep only the first three items of each test sequence (dropping the fourth
# item, the `next` product) and store each prefix as its own NumPy array.
testset = [np.array(sequence[:3]) for sequence in testset]

## 2. Model building

In [79]:
# Create an instance of the CPT class using its default constructor arguments.
modelcpt = CPT() 

In [80]:
# Train the CPT model on the training sequences and report the elapsed time.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; unlike time.time() it is not affected by system clock
# adjustments made while the cell is running.
start_time = time.perf_counter()
modelcpt.train(trainset)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 1.382154941558838 seconds ---


In [81]:
# Predict for every test prefix; each prediction is a list of 10 products.
# NOTE(review): the positional arguments 2 and 10 are presumably the number of
# trailing items of each query used for matching and the number of products to
# return - confirm against the signature of CPT.predict.
pred = modelcpt.predict(trainset,testset,2,10)

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [04:11<00:00, 199.00it/s]


In [82]:
# Sanity check: one prediction list per test sequence.
len(pred)

50000

In [83]:
# After merging on the data frame "targets", each sequence is repeated 10 times
# (5 in trainset + 5 in testset), so keep every 5th prediction to end up with
# one prediction list per triplet.
Pred = pred[::5]

In [84]:
# Inspect the prediction list kept for the first triplet.
Pred[:1]

[['9763',
  '9664',
  '11988',
  '10129',
  '12236',
  '11936',
  '11987',
  '12225',
  '10209',
  '12224']]

In [85]:
# Sanity check: number of prediction lists kept after the [::5] slice.
len(Pred)

10000

In [86]:
# Sanity check: number of label rows; compare with len(Pred).
len(ytest)

10000

In [87]:
# Per-triplet score: the number of distinct products that occur both in the
# prediction list and in the true labels, divided by 10.
acc = [
    np.intersect1d(predicted, actual).size / 10
    for predicted, actual in zip(Pred, ytest)
]

In [88]:
# Average the per-triplet scores into a single accuracy value.
# np.mean divides by the number of scores that were actually computed and,
# unlike sum(acc) / len(triplets), does not raise a TypeError when this cell
# is executed again after `acc` has already been reduced to a scalar.
acc = float(np.mean(acc))

In [89]:
# Report the accuracy as a percentage with two decimal places.
acc_percent = acc * 100
print("Training accuracy: %.2f" % acc_percent + '%')

Training accuracy: 67.35%


## 3. Submission

In [90]:
# Load the triplets to predict for. The CSV is read without a header row, so
# pandas labels the columns with integers starting at 0.
validset = pd.read_csv('test_set_triplets.csv',header =  None)

In [91]:
# Preview the first row with the default integer column labels.
validset.head(1)

Unnamed: 0,0,1,2,3,4
0,9,10007,11936,11938,289


In [92]:
# Give the validation frame the same column names as `triplets`
# (this requires both frames to have the same number of columns).
validset.columns =  triplets.columns

In [93]:
# Preview the first row again, now with the named columns.
validset.head(1)

Unnamed: 0,id,product_1,product_2,product_3,count
0,9,10007,11936,11938,289


In [94]:
# Select the three product columns that make up each input sequence.
product_columns = ['product_1', 'product_2', 'product_3']
validset1 = validset[product_columns]

In [95]:
# Convert every validation row into a list of strings, one string per product.
val_input = [
    [str(product) for product in row]
    for _, row in validset1.iterrows()
]

In [96]:
# Inspect the first two validation sequences.
val_input[:2]

[['10007', '11936', '11938'], ['10007', '11938', '11988']]

In [97]:
# Sanity check: number of validation sequences.
len(val_input)

1000

In [98]:
# Store every validation sequence as a NumPy array, like the training sequences.
val_input = list(map(np.array, val_input))

In [99]:
# Inspect the first validation sequence after the conversion to arrays.
val_input[:1]

[array(['10007', '11936', '11938'], dtype='<U5')]

In [100]:
# Sanity check: the conversion kept the number of validation sequences.
len(val_input)

1000

In [101]:
# Sanity check: size of the training set passed to predict() below.
len(trainset)

50000

In [102]:
# Predict 10 products for every validation sequence, using the same arguments
# as for the test-set prediction.
# NOTE(review): the positional arguments 2 and 10 are presumably the number of
# trailing items of each query used for matching and the number of products to
# return - confirm against the signature of CPT.predict.
submit = modelcpt.predict(trainset,val_input,2,10)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 189.48it/s]


In [103]:
# Sanity check: one prediction list per validation sequence.
len(submit)

1000

In [104]:
# Inspect the prediction list for the first validation sequence.
submit[:1]

[['11988',
  '11987',
  '9664',
  '10129',
  '10007',
  '11983',
  '12223',
  '11973',
  '13504',
  '11982']]

In [105]:
# Turn the list of prediction lists into a DataFrame: one row per validation
# sequence, one column per predicted product. Note that `submit` changes type
# here, from a list to a DataFrame.
submit = pd.DataFrame(submit)

In [106]:
# Prepend the triplet ids (the first column of `validset`) as column 'id'.
# DataFrame.insert raises ValueError when the column already exists, so the
# call is guarded to keep this cell safe to run more than once.
if 'id' not in submit.columns:
    submit.insert(0, 'id', validset.iloc[:, 0])

In [107]:
# Preview the first submission row: the id followed by the predicted products.
submit.head(1)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9
0,9,11988,11987,9664,10129,10007,11983,12223,11973,13504,11982


In [108]:
# Write the submission file without a header row and without the index column,
# so every line consists of the id followed by the predicted products.
# `index` is a boolean option; pass False explicitly instead of relying on
# None being falsy.
submit.to_csv('submission_CPT.csv', header=False, index=False)