In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [12]:
# Load data between 2026 and 2018

df = pd.read_csv('../data/Sports_and_Outdoors_5_2016_2018.csv')
df = df.dropna()
df = df[['reviewerID', 'asin']]
df = df.drop_duplicates()
df = df.groupby('reviewerID')['asin'].apply(list).reset_index(name='items')
df['items'] = df['items'].apply(lambda x: list(set(x)))
df = df.head(100000)

In [None]:
# Add a columns for the len of items list
df['len'] = df['items'].apply(lambda x: len(x))
df.head()

In [20]:
# Add weight to rows according to the number of combinations in the items list (2** len(items))
df['weight'] = df['len'].apply(lambda x: 2**x)
df.head()

Decimal('5.192296858534827628530496329E+33')

100000

In [15]:
# Add a column cumweight to calculate the cumulative weight
df['cumweight'] = df['weight'].cumsum()
sum_weight = df['weight'].sum()


Decimal('5.192301818663394343626532155E+33')

In [16]:
print(np.max(df['len']))
print(np.sort(df['len'])[-10:])

In [18]:
import random as rd

def binary_search(list, element):
    first = 0
    last = len(list) - 1
    while first <= last:
        mid = (first + last) // 2
        if list[mid] == element:
            return mid  # Element found, return its index
        elif element < list[mid]:
            last = mid - 1
        else:
            first = mid + 1
    return first  # Element not found, return the index where it should be inserted


def sample(transactions):
    """
    Sample a pattern from the transactional database according to its frequency
    :param transactions: transactional database
    :return: a pattern
    """
    # Generate a random number between 0 and the sum of the weights
    r = np.random.randint(0, sum_weight)
    # Find the index of the element in the cumulative weight list using a binary search
    index = binary_search(df['cumweight'].tolist(), r)
    # Sample item in the transaction to create a pattern
    pattern = []
    for item in df['items'][index]:
        if rd.Random().random() <= 0.5:
            pattern.append(item)
    return pattern

ValueError: high is out of bounds for int64

In [13]:
# Generate 1000 patterns
patterns = []
for i in range(1000):
    patterns.append(sample(df))

ValueError: high is out of bounds for int64

In [None]:
# Plot the pattern length distribution
pattern_len = [len(pattern) for pattern in patterns]
plt.xlabel('Pattern length')
plt.ylabel('Frequency')
# Add the mean len of transaction to the plot
plt.axvline(np.mean(pattern_len), color='red', linestyle='dashed', linewidth=1)
plt.hist(pattern_len, bins=20)

In [None]:
# Compute and plot support of 1000 patterns
supports = []
for pattern in patterns:
    support = df['items'].apply(lambda x: set(pattern).issubset(set(x))).sum()
    supports.append(support)
plt.xlabel('Support')
plt.ylabel('Frequency')
# Add the mean support of transaction to the plot
plt.axvline(np.mean(supports), color='red', linestyle='dashed', linewidth=1)
plt.hist(supports, bins=20)
# Add the transactions len distribution to the plot in red with an alpha of 0.5
plt.hist(df['len'], bins=20, color='red', alpha=0.5)

In [1]:
# Show the first 10 patterns
patterns[:10]


NameError: name 'patterns' is not defined

In [None]:
# Sample one pattern and compute its support
pattern = sample(df)

In [None]:
# Compute the support of the pattern
support = df['items'].apply(lambda x: set(pattern).issubset(set(x))).sum()

In [None]:
# Reset the dataframe by removing the weight and cumweight columns
df = df.drop(['weight', 'cumweight'], axis=1)

## Frequent pattern sampling with length constraint

In [7]:
def factorial(n):
    if n < 0:
        raise ValueError('n must be positive')
    if n == 0:
        return 1
    else:
        return n * factorial(n - 1)

def combination(n, k):
    return factorial(n) / (factorial(k) * factorial(n - k))

In [None]:
# Add weight to df
def add_weight(df, min_len, max_len):
    df['weight'] = df['len'].apply(lambda x: np.array([combination(x,k) for k in range(min_len, max_len)]).sum())
    df['cumweight'] = df['weight'].cumsum()
    return df

In [None]:
df = add_weight(df, MIN_LEN, MAX_LEN)

# High utility pattern sampling with QPlus


In [1]:
# Load metadata
df_meta = pd.read_csv('drive/MyDrive/meta_Sports_and_Outdoors.csv')
df_meta = df_meta[['asin', 'price', 'brand']]
df_meta = df_meta.dropna()
df_meta = df_meta.drop_duplicates()
# Remove $ sign
def is_price(x):
    try:
        float(x)
        return True
    except:
        return False
def remove_dollar(x):
    """
    Parse price column to float
    :param x: 
    :return: 
    """
    x = x.replace('$', '')
    # Remove space 
    x = x.replace(' ', '')
    # If x contains a range, take the mean
    try:
        if '-' in x:
            x = x.split('-')

            return (float(x[0]) + float(x[1])) / 2
        return float(x)
    except:
        print(x)
        return 0
df_meta['price'] = df_meta['price'].apply(remove_dollar)
# Count number of 0
print('Number of 0 in price column: {}'.format(df_meta['price'].apply(lambda x: x == 0).sum()))

# For all items in the same brand, replace 0 by the mean of the dataframe prices
mean_price = df_meta[df_meta['price'] != 0]['price'].mean()
df_meta['price'] = df_meta['price'].apply(lambda x: mean_price if x == 0 else x)

print('Number of 0 in price column: {}'.format(df_meta['price'].apply(lambda x: x == 0).sum()))

NameError: name 'pd' is not defined

In [None]:
# Load data
df = pd.read_csv('../data/Sports_and_Outdoors_5_2016_2018.csv')
df = df.dropna()
df = df[['reviewerID', 'asin', 'overall']]
df = df.drop_duplicates()
# Group by reviewerID and build a list of (asin, overall) tuples
df = df.groupby('reviewerID')[['asin', 'overall']].apply(lambda x: list(zip(x['asin'], x['overall']))).reset_index(name='items')
df['items'] = df['items'].apply(lambda x: list(set(x)))
df = df.head(100000)


In [None]:
def get_profit(item, profit_db):
    """
    Get the profit of an item
    :param item: 
    :param profit_db: dictionary of profit
    :return: 
    """
    if item in profit_db:
        return profit_db[item]
    else:
        print('Item {} not found'.format(item))
        return 0
    return profit_db[item] if item in profit_db else 0

In [None]:
def u(pattern, transaction, profit_db):
    """
    Compute the utility of a pattern in a transaction
    :param pattern: 
    :param transaction: 
    :return: 
    """
    utility = 0
    for item in pattern:
        for i in range(len(transaction)):
            if item == transaction[i][0]:
                profit = profit_db[profit_db['asin'] == item]['price'].values[0]
                quantity = transaction[i][1]
                utility += quantity * profit
    return utility
                

In [None]:
def U(pattern, transactions, profit_db):
    """
    Compute the utility of a pattern in a transaction
    :param pattern: 
    :param transaction: 
    :return: 
    """
    utility = 0
    for transaction in transactions:
        utility += u(pattern, transaction, profit_db)
    return utility

In [None]:
def v(l, i, transaction, profit_db):
    if l == 1:
        v = 0
        for j in range(i):
            w = transaction[j][1] * profit_db[transaction[j][0]]
            v+= w
    else:
        return combination(i-1, l-1) * v(1, i)
    
def f(l, nu, M):
    return 1 / (M - nu + 1)

In [None]:
# Build a profit dictionary
profit_db = {}
for id, row in df_meta.iterrows():
    profit_db[row['asin']] = row['price']

In [None]:
# Preprocessing of QPlus, assigning weight to transaction, once it's done we can perform as sampling as we need
import time
def preprocessing(transaction, nu,M, profit_db):
    w = 0
        # if l is inf
    if M == np.inf:
        return 2**(len(transaction)-1) * sum([item[1]* profit_db[item[0]] for item in transaction])
    else:
        v_1_i = v(1, len(transaction), transaction, profit_db)
        for l in range(nu, M):
            v_l_i = combination(len(transaction)-1, l-1) * v_1_i
            w += v_l_i * f(l , nu, M)
        return w

start = time.time()
weights = []
for transaction in tqdm(df['items']):
    weights.append(preprocessing(transaction, 1, 3, df_meta))
df['weight'] = weights
end = time.time()
print('Time elapsed: {}'.format(end - start))
df['cumweight'] = df['weight'].cumsum()

In [None]:
def qplus(df, nu, M, profit_db):
    """
    Sample a pattern from the transactional database according to its utility
    :param df:
    :param nu:
    :param M:
    :return:
    """

    # Sample pattern
    # Draw a transaction t from df proportionally to its weight
    random = Decimal(np.random.random()) * df['weight'].sum()
    index = binary_search(df['cumweight'].tolist(), random)
    transaction = df['items'][index]
    # Draw integer l from nu to M<with a probaility equals to
    length_weight = [v(l, len(transaction), transaction, profit_db) * f(l , nu, M) / df['weight'][index] for l in range(nu, M)]
    print(length_weight)
    # Normalize length_weight
    length_weight = [w / sum(length_weight) for w in length_weight]
    l = np.random.choice(range(nu, M), p=length_weight)
    pattern = []
    j = len(transaction)
    while l > 0:
        pattern_utility = U(pattern, transaction, profit_db)
        alpha = np.random.random() * v(l,j, transaction, profit_db) +combination(j-1, l-1) * pattern_utility
        i = 1
        binf =v(l, i-1, transaction, profit_db) + combination(i-1, l) * pattern_utility
        bsup = v(l, i, transaction, profit_db) + combination(i, l) * pattern_utility
        i = jump_dichotomic_search(alpha, l, pattern_utility, transaction, profit_db, binf, bsup)
        pattern.append(transaction[i])
        l -= 1
        j = i
    return pattern
pattern = qplus(df, 3, 5, df_meta)

In [None]:
# Print pattern utility
U(pattern, df['items'], profit_db)