In [None]:
import pandas as pd
import random
from ortools.algorithms.python import knapsack_solver
import math

In [None]:
# Read the interim CSV; later cells use its 'weight_kg' and 'price' columns.
interim_csv_path = "../data/interim/marketing_sample_for_amazon_com-ecommerce__20200101_20200131__weight_price.csv"
df = pd.read_csv(interim_csv_path)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# One seed drives both the stdlib RNG and the pandas shuffle so reruns match.
random_seed = 42
random.seed(a=random_seed)

# Sampling the full fraction of rows returns every row in a shuffled order.
shuffled_df = df.sample(frac=1.0, random_state=random_seed)

In [None]:
# Keep the first `num_samples` rows of the shuffled frame as the working sample.
num_samples = 100
random_subset = shuffled_df.iloc[:num_samples]

In [None]:
# Summary statistics of the sampled subset, as a sanity check before solving.
random_subset.describe()

In [None]:
# Build the OR-Tools knapsack solver using its branch-and-bound variant.
solver_type = knapsack_solver.SolverType.KNAPSACK_MULTIDIMENSION_BRANCH_AND_BOUND_SOLVER
solver = knapsack_solver.KnapsackSolver(solver_type, "KnapsackExample")

# Capacity of the knapsack (maximum weight allowed). Later cells pass weights
# to the solver in grams, which makes this 25 kg.
capacity = 25_000


In [None]:
# Pull the raw weight (kg) and price columns for the sampled items.
weights = random_subset['weight_kg'].tolist()
values = random_subset['price'].tolist()

# Convert weights and values to integers (required by OR-Tools).
# Weights: kg -> whole grams. int() truncates, so fractional grams are dropped
# and the solver can slightly under-count the real weight; a later cell redoes
# this conversion with math.ceil.
weights = [int(w * 1000) for w in weights]
# Values: currency units -> integer hundredths. round() is used instead of
# int() because binary float error (e.g. 4.35 * 100 == 434.99999999999994)
# would otherwise shave a unit off the price.
values = [round(v * 100) for v in values]

# Number of items
num_items = len(random_subset)

In [None]:
# Load the problem into the solver: item values, a single weight dimension,
# and the matching single capacity; then run the search.
solver.init(values, [weights], [capacity])
solver.solve()

# Per-item membership flags for the best solution found, in item order.
selected_items = list(map(solver.best_solution_contains, range(num_items)))


In [None]:
# Keep only the rows the solver picked. NOTE(review): this relies on
# best_solution_contains returning a bool per item so that .iloc treats the
# list as a boolean mask; a list of 0/1 ints would be read as row positions
# instead — confirm against the OR-Tools version in use.
selected_item_df = random_subset.iloc[selected_items]
selected_item_df

In [None]:
# Total weight (kg) and total price of the chosen items, to compare with capacity.
selected_item_df.loc[:, ['weight_kg', 'price']].sum()

In [None]:
# The first run came out slightly over the desired weight (likely because int() truncated the fractional grams), so convert the kg column to whole grams, rounding up, and try again.
def convert_grams(weight_kg):
    """Convert a weight in kilograms to whole grams, rounding up.

    Rounding up instead of truncating keeps the integer weight handed to the
    knapsack solver from dropping fractional grams.

    Args:
        weight_kg: Weight in kilograms (a real number).

    Returns:
        The weight in grams as an ``int``, or ``None`` when the input cannot
        be converted (NaN, infinity, ``None`` or another non-numeric value).
    """
    try:
        # Strip binary floating-point noise before taking the ceiling: a
        # product that lands a hair above a whole gram would otherwise be
        # bumped up by a full gram.
        weight_grams = round(weight_kg * 1000, 6)
        # math.ceil raises ValueError for NaN and OverflowError for infinity;
        # non-numeric input raises TypeError. The guard has to cover this call:
        # previously only int() was guarded, which cannot fail on an int, so a
        # missing weight raised instead of yielding None.
        return math.ceil(weight_grams)
    except (TypeError, ValueError, OverflowError):
        return None

# Add an integer-gram weight column by converting each kg value element-wise.
df['weight_grams'] = df['weight_kg'].map(convert_grams)

In [None]:
# Re-draw the sample with the same seed now that df carries 'weight_grams'.
shuffled_df = df.sample(frac=1, random_state=random_seed)
num_samples = 100
random_subset = shuffled_df.head(num_samples)

# Rebuild every solver input from this subset so the lists are guaranteed to
# line up row-for-row, rather than reusing values/num_items left over from an
# earlier cell.
weights = random_subset['weight_grams'].tolist()
# Price -> integer hundredths; round() avoids losing a unit to float error.
values = [round(v * 100) for v in random_subset['price'].tolist()]
num_items = len(random_subset)

In [None]:
# Second run: same solver, same capacity, weights now rounded up to whole grams.
solver.init(values, [weights], [capacity])
solver.solve()

# Per-item membership flags for the best solution found, in item order.
selected_items = list(map(solver.best_solution_contains, range(num_items)))

In [None]:
# Rows picked in the second run. NOTE(review): as before, this relies on
# selected_items holding booleans so that .iloc applies it as a mask — confirm.
selected_item_df = random_subset.iloc[selected_items]
selected_item_df

In [None]:
# Totals for the second selection: weight in kg and in grams, plus price.
selected_item_df.loc[:, ['weight_kg', 'weight_grams', 'price']].sum()

In [None]:
# The discrepancy still seems to exist here. 
# This is likely due to floating-point arithmetic. I could use the decimal module here, but the results are close enough for this project.