In [1]:
import requests
import os
import sqlite3

In [2]:
#download binary file
def download_file_binary(url, file_name, dir_name=None):
    """Download ``url`` and save the response body to a local file.

    Args:
        url: Address passed to ``requests.get``.
        file_name: Name of the file to write.
        dir_name: Optional directory; when given, the file is written to
            ``os.path.join(dir_name, file_name)``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    req = requests.get(url)
    #fail loudly instead of silently saving an error page as the dataset
    req.raise_for_status()
    if dir_name is not None:
        file_name = os.path.join(dir_name, file_name)
    #the context manager closes the file even if the write fails
    with open(file_name, "wb") as binary_file:
        binary_file.write(req.content)

In [3]:
#get a table name from a txt file name
def get_table_name(file_name):
    """Derive a table name from a file name by dropping its extension.

    Args:
        file_name: File name such as ``"market_basket_training.txt"``.

    Returns:
        For names ending in ``.txt``, everything before the first ``.txt``.
        For any other name, the name without its last extension (the name
        itself when it has no extension).
    """
    if file_name.endswith('.txt'):
        return file_name.split('.txt')[0]
    #this path used to raise UnboundLocalError because table_name was never assigned
    return os.path.splitext(file_name)[0]

In [4]:
#get a list of fact data from a txt file
def read_data_from_txt_file(file_name):
    """Read a UTF-8 text file and return its lines as a list.

    Leading and trailing whitespace (including the newline) is stripped
    from every line; blank lines are kept as empty strings.
    """
    with open(file_name, 'rt', encoding='utf-8') as handle:
        return [line.strip() for line in handle]

In [5]:
#transform data read from txt file to two columns used in sqlite later
def transform_datalist(datalist):
    """Split each comma-separated line into ``[id, product_list]``.

    The text before the first comma becomes the id; everything after it
    (commas included) becomes the product list. A line without a comma
    yields an empty product list.
    """
    return [
        [record_id, products]
        for record_id, _, products in (line.partition(",") for line in datalist)
    ]

In [6]:
#create a table and load data into it using the info read from a csv file or a worksheet
def load_file_to_database(cursor, table_name, columns=None, data=None):
    """Recreate ``table_name`` and insert ``data`` into it.

    Args:
        cursor: sqlite3 cursor used to run the statements.
        table_name: Name of the table to drop (if present) and create.
            It is interpolated into the SQL text, so it must come from a
            trusted source.
        columns: List of column definitions, joined with ", " inside the
            CREATE TABLE statement (also interpolated, also trusted).
        data: Iterable of rows; each row is a sequence with one value per
            column. A row equal to ``[' ']`` is skipped.
    """
    #None defaults avoid sharing one mutable list between calls
    columns = [] if columns is None else columns
    data = [] if data is None else data
    #if the table exists, drop the table
    cursor.execute('DROP TABLE IF EXISTS {tn}'.format(tn=table_name))
    #create table
    cursor.execute('CREATE TABLE {tn} ({cn})'.format(tn=table_name, cn=', '.join(columns)))
    #load data into table
    for row in data:
        if row != [' ']:
            #bind the values through placeholders: the tuple repr used before broke on
            #single-column rows ("VALUES('x',)") and on values containing quotes
            placeholders = ', '.join(['?'] * len(row))
            cursor.execute(
                'INSERT INTO {tn} VALUES ({ph})'.format(tn=table_name, ph=placeholders),
                tuple(row),
            )

define variables for sqlite

In [7]:
#open the SQLite database file market_basket_analytics.db (relative path, so it is
#resolved against the current working directory)
conn = sqlite3.connect('market_basket_analytics.db')
#cursor used by the cells below to execute SQL statements on this connection
cursor = conn.cursor()

download training dataset

In [8]:
#source URL of the training dataset and the local file name it is saved under
training_dataset_url = "http://kevincrook.com/utd/market_basket_training.txt"
training_dataset_file_name = "market_basket_training.txt"

In [9]:
#download the training dataset; no dir_name is passed, so the file is written
#under training_dataset_file_name relative to the current working directory
download_file_binary(training_dataset_url, training_dataset_file_name)

load training dataset into database

In [10]:
#prepare the training dataset for loading into the database
#the table is named after the file: "market_basket_training.txt" -> "market_basket_training"
table_name = get_table_name(training_dataset_file_name)
#column definitions for the table: the record id and its comma-separated products
columns = ["id","product_list"]
#read the stripped lines of the txt file
content = read_data_from_txt_file(training_dataset_file_name)
#split every line into [id, product_list], matching the two columns above
data = transform_datalist(content)

In [11]:
#drop and recreate the training table, then insert the transformed rows
load_file_to_database(cursor, table_name, columns, data)

download test dataset

In [12]:
#source URL of the test dataset and the local file name it is saved under
test_dataset_url = "http://kevincrook.com/utd/market_basket_test.txt"
test_dataset_file_name = "market_basket_test.txt"

In [13]:
#download the test dataset; no dir_name is passed, so the file is written
#under test_dataset_file_name relative to the current working directory
download_file_binary(test_dataset_url, test_dataset_file_name)

load test dataset into database

In [14]:
#prepare the test dataset for loading into the database
#NOTE: this rebinds table_name, columns, content and data from the training cells
table_name = get_table_name(test_dataset_file_name)
#column definitions for the table: the record id and its comma-separated products
columns = ["id","product_list"]
#read the stripped lines of the txt file
content = read_data_from_txt_file(test_dataset_file_name)
#split every line into [id, product_list], matching the two columns above
data = transform_datalist(content)

In [15]:
#drop and recreate the test table, then insert the transformed rows
load_file_to_database(cursor, table_name, columns, data)

calculate frequency

In [16]:
#rebuild the frequency table from the training data: one row per product_list,
#with freq = number of training rows that have exactly that product_list
cursor.execute("DROP TABLE IF EXISTS frequency")
#the GROUP BY already yields one row per product_list, so DISTINCT does not change the result
cursor.execute('CREATE TABLE frequency AS SELECT DISTINCT product_list, count(*) as freq \
                        FROM market_basket_training group by product_list order by product_list')

<sqlite3.Cursor at 0x1ffb3df36c0>

generate all product set and new product set

In [17]:
#the complete product catalogue
all_product_set = {"P01", "P02", "P03", "P04", "P05", "P06", "P07", "P08", "P09", "P10"}
#fetch every product combination stored in the frequency table
cursor.execute('SELECT DISTINCT product_list FROM frequency')
freq_list = cursor.fetchall()
#flatten the comma-separated combinations into one list of sold products
sold_product_list = [
    product
    for (combination,) in freq_list
    for product in combination.split(",")
]
#products that appear in at least one training combination
sold_product_set = set(sold_product_list)
#products from the catalogue that were never sold
new_product_set = all_product_set.difference(sold_product_set)

get test data from sqlite

In [18]:
#read every row of the market_basket_test table; each row is a tuple whose
#first two fields are used below as the test number and the cart's product_list
cursor.execute("SELECT * FROM market_basket_test")
test_list = cursor.fetchall()

run algorithm to generate recommendation

In [19]:
#initiate recommendation list
recommendation_list = []

#load the frequency table once so every candidate cart is a dictionary lookup
#instead of a string-formatted SQL query inside the inner loop
cursor.execute("SELECT product_list, freq FROM frequency")
basket_freq = dict(cursor.fetchall())

#iterate candidates in sorted order: iterating the set directly breaks frequency
#ties by hash order, which changes between kernel restarts; sorted order together
#with the strict ">" below always keeps the lowest product id among tied candidates
candidate_products = sorted(sold_product_set)

#process shopping carts one by one
for test in test_list:

    #get test number
    test_number = test[0]

    #get the list of products in this cart (split also handles a single-product cart)
    test_product_list = test[1].split(",")

    #remove new products: they never occur in the training data
    product_set = {product for product in test_product_list if product not in new_product_set}

    #initial variables
    max_freq = 0
    add_product = None

    #try to add each previously sold product
    for old_product in candidate_products:

        #if the old_product is in the cart already, move on
        if old_product in product_set:
            continue

        #build the cart plus the candidate in the format used by the frequency table
        product_list_freq = ",".join(sorted(product_set | {old_product}))

        #how often this exact combination appeared in the training dataset (0 if never)
        freq = basket_freq.get(product_list_freq, 0)

        #keep the candidate only if it strictly beats the best frequency so far
        if freq > max_freq:
            max_freq = freq
            add_product = old_product

    #append the recommendation for this cart; add_product stays None when no
    #candidate combination was found in the training data
    recommendation_list.append((test_number, add_product))

write the recommendation list into a txt file

In [20]:
#write one "<test number>,<product>" line per recommendation
with open('market_basket_recommendations.txt', 'wt', encoding='utf-8') as out_file:
    for test_number, product in recommendation_list:
        #build the whole line, including the end of line character, then write it
        out_file.write(test_number + ',' + product + '\n')

In [21]:
#print the full list of (test number, recommended product) tuples
print(recommendation_list)

[('001', 'P09'), ('002', 'P09'), ('003', 'P06'), ('004', 'P09'), ('005', 'P09'), ('006', 'P09'), ('007', 'P09'), ('008', 'P09'), ('009', 'P09'), ('010', 'P09'), ('011', 'P06'), ('012', 'P09'), ('013', 'P09'), ('014', 'P09'), ('015', 'P09'), ('016', 'P09'), ('017', 'P09'), ('018', 'P09'), ('019', 'P06'), ('020', 'P09'), ('021', 'P09'), ('022', 'P06'), ('023', 'P06'), ('024', 'P03'), ('025', 'P09'), ('026', 'P09'), ('027', 'P01'), ('028', 'P01'), ('029', 'P01'), ('030', 'P01'), ('031', 'P01'), ('032', 'P09'), ('033', 'P09'), ('034', 'P01'), ('035', 'P09'), ('036', 'P09'), ('037', 'P01'), ('038', 'P01'), ('039', 'P01'), ('040', 'P09'), ('041', 'P09'), ('042', 'P01'), ('043', 'P01'), ('044', 'P01'), ('045', 'P09'), ('046', 'P09'), ('047', 'P01'), ('048', 'P01'), ('049', 'P01'), ('050', 'P09'), ('051', 'P09'), ('052', 'P09'), ('053', 'P01'), ('054', 'P01'), ('055', 'P01'), ('056', 'P01'), ('057', 'P09'), ('058', 'P09'), ('059', 'P01'), ('060', 'P01'), ('061', 'P01'), ('062', 'P01'), ('063',