# A try out using Association Rules

In order to solve the problem, we propose a method approaching association rules mining.

This method is based on identifying item sequences that occurs together and the similarity of these sequences as well. 

## 1. Data Preparation

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import time
import sqlite3
from Levenshtein import distance as levenshtein_distance
import editdistance
from scipy.spatial.distance import pdist, squareform
import multiprocessing
from joblib import Parallel, delayed

In [2]:
conn = sqlite3.connect('train_data.db')
triplets = pd.read_sql_query("SELECT * FROM triplets", conn)
targets = pd.read_sql_query("SELECT * FROM targets", conn)

In [3]:
triplets.head(1)

Unnamed: 0,id,product_1,product_2,product_3,count
0,1,9763,10007,11938,384


In [4]:
triplets.shape

(10000, 5)

In [42]:
triplets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
id           10000 non-null int64
product_1    10000 non-null object
product_2    10000 non-null object
product_3    10000 non-null object
count        10000 non-null int64
dtypes: int64(2), object(3)
memory usage: 390.8+ KB


In [5]:
targets.head(1)

Unnamed: 0,id,product,count,triplet_id
0,1,9664,54,1


In [6]:
targets.shape

(100000, 4)

Check out products and targets

In [7]:
p1,p2,p3 = np.array(triplets['product_1']), np.array(triplets['product_2']), np.array(triplets['product_3'])
p = np.unique(np.concatenate((p1,p2,p3), axis = 0))
print("Number of the viewed products together: %d" % len(p))

Number of the viewed products together: 698


In [8]:
t = np.array(targets["product"])
target = np.unique(t)
print("Number of the viewed products after a triplet: %d" % len(target))

Number of the viewed products after a triplet: 1476


In [9]:
intersection = np.intersect1d(p, target)
print("Number of the products viewed both in triplets and afterward: %d" % len(intersection))

Number of the products viewed both in triplets and afterward: 675


In [10]:
prod = np.concatenate((target,p), axis=0)
prod = np.unique(prod)
print("Number of all products: %d" % len(prod))

Number of all products: 1499


In [11]:
tr = triplets.loc[:, ['id','count']]

In [12]:
tr.rename(columns={'id': 'triplet_id', 'count': 'count_y'}, inplace=True)

In [13]:
tr.head(1)

Unnamed: 0,triplet_id,count_y
0,1,384


In [14]:
 newtargets = targets

In [15]:
newtargets.head(1)

Unnamed: 0,id,product,count,triplet_id
0,1,9664,54,1


In [16]:
newtargets = pd.merge(newtargets, tr, how = 'left', on = ['triplet_id'])

In [17]:
# add a column to store confidence compute
newtargets['conf'] = newtargets['count']/newtargets['count_y']

In [18]:
newtargets.head(2)

Unnamed: 0,id,product,count,triplet_id,count_y,conf
0,1,9664,54,1,384,0.140625
1,2,10129,47,1,384,0.122396


In [19]:
len(newtargets)

100000

In [20]:
# subset the columns containing 3 products from the data frame 'triplets' and convert to array type
p_array = np.array(triplets.iloc[:,1:-1])   

In [21]:
# use the above result to compute a similarity matrix for every pair of sequences
# use Levenshtein's method to compute the similarity between sequences
distance_matrix = pdist(p_array.tolist(),lambda x,y: editdistance.eval(x,y))
matrix = squareform(distance_matrix)
print(matrix.shape)

(10000, 10000)


In [22]:
matrix2df = pd.DataFrame(matrix, columns = triplets['id'], index = triplets['id'])
matrix2df.head(1)

id,1,2,3,4,6,8,10,11,12,14,...,16871,16872,16873,16874,16875,16876,16878,16884,16885,16886
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,2.0,1.0,3.0,3.0,2.0,2.0,3.0,3.0,2.0,...,3.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,3.0


In [23]:
matrix2df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 16886
Columns: 10000 entries, 1 to 16886
dtypes: float64(10000)
memory usage: 763.0 MB


## 2. Model building

Create a function to fit and test models. The algorithm perfoms as following : for a given sequence, ranking the sequences bu ascending order based on the similarity between sthe sequences;  looking for the N first sequences closest to it based on the above similarity matrix and a cutoff; computing the confidence of the target sequence  by the one of all N neighnors sequences with a weight following the similarity degre of each learning sequence.

Hyperparameters: number of neighbours, distance mesure betweenn sequences, number of top products being suggested

In [24]:
def Model(k_closest = None, dist = None, top =  None, mode = None):
    '''
    k_closest = number of neighbours
    dist = distance mesure betweenn sequences
    top = number of top products being suggested
    '''
    if mode == 'prediction':
        mat = new_matrix
    else:
        mat = matrix2df        
        
    predictions = []
    for i in range(len(mat)):
        n_first = pd.Series.nsmallest(mat.iloc[i,:], n = k_closest , keep = 'first')
        nsmall_first = n_first.loc[ n_first < dist]
        t = pd.DataFrame(np.zeros((0,2)), columns = ['product','conf'])
        for idx in nsmall_first.index:
            t1 = newtargets.loc[newtargets['triplet_id'] == idx, ['product','conf']]
            if nsmall_first[idx]  ==  0:
                t1['conf'] = t1['conf']*1
                t = pd.concat([t,t1], axis = 0, ignore_index = False)
            elif nsmall_first[idx]  ==  1:
                t1['conf'] = t1['conf']*float(0.75)
                t = pd.concat([t,t1], axis = 0, ignore_index = False)
            elif nsmall_first[idx]  ==  2:
                t1['conf'] = t1['conf']*float(0.5)
                t = pd.concat([t,t1], axis = 0, ignore_index = False)
            else:
                t1 = newtargets.loc[newtargets['triplet_id'] == idx, ['product','conf']]
                t1['conf'] = t1['conf']*float(0.25)
                t = pd.concat([t,t1], axis = 0, ignore_index = False)

        pred = t.groupby(['product']).sum().reset_index()
        pred = pred.sort_values(by = 'conf', ascending = False)
        prediction = {}
        for i in range(len(pred)):
            k =  pred['product'][i]
            v = pred['conf'][i]
            prediction[k] = v    
        prediction = Counter(prediction).most_common(top)
        predictions.append(prediction)
        
    return predictions

In [25]:
# define the set of hyperparameters of the model
k_closest, dist, top =  10,2,10

In [27]:
# # fit a model using joblib to speed up the training time
start_time = time.time()
process = Parallel(n_jobs=-1)(delayed(Model)(arg1, arg2, arg3) for arg1, arg2, arg3 in [(k_closest, dist, top)])
print("--- %s seconds ---" % (time.time() - start_time))

--- 300.07552433013916 seconds ---


In [28]:
out_train = []
for i in range(len(triplets)):
    #out_train1.append([i[0] for i in process1[i]])
    out_train.append([i[0] for i in process[0][i]])
    

In [29]:
target_train = []
for i in np.arange(0,100000,10):
    target_train.append(targets['product'][i:i+10].tolist())

In [30]:
#result1 = [len(np.intersect1d(i,j))/top for i,j in zip(out_train1,target_train)]
result = [len(np.intersect1d(i,j))/top for i,j in zip(out_train,target_train)]

In [31]:
#print(np.sum(result1))
acc = np.sum(result)/len(triplets)

In [32]:
print("Training accuracy: %.2f" %(acc*100) + '%')

Training accuracy: 71.29%


## 3. Submission

In [33]:
validset = pd.read_csv('test_set_triplets.csv',header =  None)

In [34]:
validset.columns =  triplets.columns

In [35]:
validset1 = validset[['product_1','product_2','product_3']]

In [36]:
val_input = []
for i,row in validset1.iterrows():
    r = [str(i) for i in row]
    val_input.append(r)  

In [37]:
((x,y) for x in val_input for y in p_array.tolist())
new_dist=[editdistance.eval(x,y) for x in val_input for y in p_array.tolist()]
new_dist=np.array(new_dist).reshape(len(val_input),len(p_array.tolist()))
new_matrix=pd.DataFrame(new_dist)
new_matrix.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,2,1,2,3,3,3,3,2,3,1,...,3,3,3,3,3,3,3,3,3,3
1,2,2,2,3,3,3,3,2,3,2,...,3,3,3,3,3,3,3,3,3,3


In [38]:
outputs = Model(k_closest, dist, top, mode = 'prediction')

In [39]:
submit = []
for i in range(len(validset)):
    submit.append([i[0] for i in outputs[i]])

In [40]:
submit = pd.DataFrame(submit)
submit.insert(0,'id',validset.iloc[:,0])
submit.head(1)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9
0,9,13829,13830,13828,13831,13826,13832,13827,13833,13834,13835


In [41]:
submit.to_csv('submission_AR.csv', header=False, index= None)

In [44]:
print(editdistance.eval([9763,10007,11938],[9763,10007,11938]))
print(editdistance.eval([9763,11938,10007],[9763,10007,11938]))

0
2
