<a href="https://colab.research.google.com/github/Eezzeldin/candy/blob/main/candy_ip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load the datasets
final_df = pd.read_csv('/content/final_df.csv')
control_variables_ate_df = pd.read_csv('/content/control_variables_ate_df.csv')

# Display the first few rows of each dataframe
final_df.head(), control_variables_ate_df.head()


(              Feature Pair  Phi Coefficients  Supports  Confidences     Lifts  \
 0  ('chocolate', 'fruity')          0.718155  0.253068     0.011365  0.035294   
 1     ('chocolate', 'bar')          0.569653  0.269339     0.235400  0.176471   
 2      ('bar', 'pluribus')          0.564159  0.247066     0.000000  0.000000   
 3        ('fruity', 'bar')          0.486452  0.243994     0.000000  0.000000   
 4        ('nougat', 'bar')          0.463281  0.408796     0.082306  0.117647   
 
    CI Width Phi Coefficients  CI Width Supports  CI Width Confidences  \
 0                   0.026206           0.083333              0.057861   
 1                   0.541570           0.312895              2.221498   
 2                   0.000000           0.000000              0.000000   
 3                   0.000000           0.000000              0.000000   
 4                   1.000000           0.000000              4.204203   
 
    CI Width Lifts  Clusters         0         1  pred  
 0 

In [2]:
# Replacing 'rare' with 'peanutyalmondy' in the 'Unnamed: 0' column
control_variables_ate_df['Unnamed: 0'] = control_variables_ate_df['Unnamed: 0'].replace('rare', 'peanutyalmondy')

# Display the updated DataFrame to confirm the change
control_variables_ate_df.head()


Unnamed: 0.1,Unnamed: 0,percentwin_ate,perccentprice_ate,percentsugar_ate
0,chocolate,18.712478,0.281077,0.054015
1,fruity,-2.956356,-0.165029,-0.065401
2,caramel,3.205517,0.169726,0.145644
3,nougat,11.828607,-0.031817,0.212515
4,crispedricewafer,15.294365,0.338327,-0.02129


In [3]:
from scipy.optimize import linprog

# Extracting the variables from the DataFrame
variables = control_variables_ate_df['Unnamed: 0']
percentwin = control_variables_ate_df['percentwin_ate'].to_numpy()
percentprice = control_variables_ate_df['perccentprice_ate'].to_numpy()
percentsugar = control_variables_ate_df['percentsugar_ate'].to_numpy()

# Objective function: We want to maximize percentwin, but linprog does minimization,
# so we use the negative of percentwin to convert it to a maximization problem.
c = -percentwin

# Constraints: Adding the constraints for percentprice and percentsugar
A = [percentprice, percentsugar]
b = [0.5, 0.5]

# Bounds for each variable, indicating all are binary (either 0 or 1)
x_bounds = [(0, 1) for _ in variables]

# Solve the linear programming problem
res = linprog(c, A_ub=A, b_ub=b, bounds=x_bounds, method='highs', options={'disp': False})

# Parse and display the results
selected_variables = [variables[i] for i in range(len(variables)) if res.x[i] > 0.5]
selected_variables, res.fun


(['chocolate',
  'fruity',
  'nougat',
  'crispedricewafer',
  'pluribus',
  'peanutyalmondy'],
 -61.00157248924873)

In [7]:
# Extracting the names of the selected candies and their proportions
selected_candies     = control_variables_ate_df['Unnamed: 0'][res.x > 0].tolist()
selected_proportions = res.x[res.x > 0].tolist()

selected_candies, selected_proportions


(['chocolate',
  'fruity',
  'nougat',
  'crispedricewafer',
  'pluribus',
  'peanutyalmondy'],
 [1.0, 1.0, 1.0, 0.6435112696386143, 1.0, 1.0])

In [8]:
#!pip install pulp

Collecting pulp
  Downloading PuLP-2.8.0-py3-none-any.whl (17.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.7/17.7 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pulp
Successfully installed pulp-2.8.0


In [11]:
# Extract associated feature pairs from Cluster 1
associated_pairs = final_df[final_df['Clusters'] == 1]['Feature Pair'].apply(eval).tolist()

# Create a mapping from feature to associated features based on the associated pairs
feature_associations = {}
for pair in associated_pairs:
    for feature in pair:
        if feature not in feature_associations:
            feature_associations[feature] = set()
        feature_associations[feature].add(pair[0])
        feature_associations[feature].add(pair[1])
associated_pairs

[('chocolate', 'bar'),
 ('nougat', 'bar'),
 ('crispedricewafer', 'bar'),
 ('fruity', 'hard'),
 ('chocolate', 'peanutyalmondy'),
 ('chocolate', 'pluribus'),
 ('caramel', 'bar'),
 ('fruity', 'pluribus'),
 ('caramel', 'nougat'),
 ('caramel', 'pluribus'),
 ('peanutyalmondy', 'bar'),
 ('chocolate', 'caramel'),
 ('crispedricewafer', 'pluribus'),
 ('peanutyalmondy', 'pluribus'),
 ('caramel', 'crispedricewafer'),
 ('peanutyalmondy', 'nougat'),
 ('caramel', 'peanutyalmondy'),
 ('hard', 'pluribus')]

In [15]:
import pulp

# Assuming control_variables_ate_df and feature_associations are defined as before

# Create the LP problem
prob = pulp.LpProblem("MaximizePercentWinWithAssociations", pulp.LpMaximize)

# Define a very large number M
M = 1000

# Decision variables for features
feature_vars = pulp.LpVariable.dicts("Feature", control_variables_ate_df['Unnamed: 0'], cat='Binary')

# Objective: Maximize percentwin_ate
prob += pulp.lpSum([percentwin[i] * feature_vars[feature] for i, feature in enumerate(control_variables_ate_df['Unnamed: 0'])])

# Constraints: Keep perccentprice_ate and percentsugar_ate under 0.5
prob += pulp.lpSum([percentprice[i] * feature_vars[feature] for i, feature in enumerate(control_variables_ate_df['Unnamed: 0'])]) <= 1
#prob += pulp.lpSum([percentsugar[i] * feature_vars[feature] for i, feature in enumerate(control_variables_ate_df['Unnamed: 0'])]) <= 1

# Association constraints using the Large M method
for feature, associates in feature_associations.items():
    for associate in associates:
        if feature != associate:  # Ensure we don't create a constraint for a feature with itself
            # If feature is selected (1), then associate must also be selected (1),
            # else if feature is not selected (0), no restriction on associate (handled by the large M).
            prob += feature_vars[associate] >= feature_vars[feature] - M * (1 - feature_vars[feature]), f"Assoc_{feature}_{associate}"

# Solve the problem
prob.solve()

# Output the selected features
selected_features = [feature for feature in control_variables_ate_df['Unnamed: 0'] if feature_vars[feature].value() == 1]
print("Selected Features:", selected_features)


Selected Features: []


In [17]:
unassociated_pairs = final_df[final_df['Clusters'] == 0]['Feature Pair'].apply(eval).tolist()


In [18]:
import pulp

# Assuming control_variables_ate_df, percentwin, percentprice, and percentsugar are defined as before
# Assuming unassociated_pairs is a list of tuples representing unassociated feature pairs

# Create the LP problem
prob = pulp.LpProblem("MaximizePercentWinWithExclusions", pulp.LpMaximize)

# Define a very large number M
M = 1000

# Decision variables for features
feature_vars = pulp.LpVariable.dicts("Feature", control_variables_ate_df['Unnamed: 0'], cat='Binary')

# Objective: Maximize percentwin_ate
prob += pulp.lpSum([percentwin[i] * feature_vars[feature] for i, feature in enumerate(control_variables_ate_df['Unnamed: 0'])])

# Constraints: Keep perccentprice_ate and percentsugar_ate under 0.5
prob += pulp.lpSum([percentprice[i] * feature_vars[feature] for i, feature in enumerate(control_variables_ate_df['Unnamed: 0'])]) <= 0.5
prob += pulp.lpSum([percentsugar[i] * feature_vars[feature] for i, feature in enumerate(control_variables_ate_df['Unnamed: 0'])]) <= 0.5

# Unassociation constraints using the Large M method
for (feature_a, feature_b) in unassociated_pairs:
    if feature_a in feature_vars and feature_b in feature_vars:
        # If feature_a is selected, feature_b cannot be selected, and vice versa
        prob += feature_vars[feature_a] + feature_vars[feature_b] <= 1, f"Exclusion_{feature_a}_{feature_b}"

# Solve the problem
prob.solve()

# Output the selected features
selected_features = [feature for feature in control_variables_ate_df['Unnamed: 0'] if feature_vars[feature].value() == 1]
print("Selected Features:", selected_features)


Selected Features: ['chocolate', 'pluribus', 'peanutyalmondy']


In [19]:
# Assuming `selected_features` contains the names of the features that were selected by the optimization
# And assuming `control_variables_ate_df` contains the 'Unnamed: 0' column with feature names and 'percentwin_ate' with their corresponding values

# Calculate the maximum percentwin_ate for the selected features
max_percentwin_ate = sum(control_variables_ate_df[control_variables_ate_df['Unnamed: 0'].isin(selected_features)]['percentwin_ate'])

print("Maximum percentwin_ate:", max_percentwin_ate)


Maximum percentwin_ate: 42.28722527703786


In [20]:
# Calculate the total perccentprice_ate for the selected features
total_perccentprice_ate = sum(control_variables_ate_df[control_variables_ate_df['Unnamed: 0'].isin(selected_features)]['perccentprice_ate'])

# Calculate the total percentsugar_ate for the selected features
total_percentsugar_ate = sum(control_variables_ate_df[control_variables_ate_df['Unnamed: 0'].isin(selected_features)]['percentsugar_ate'])

print("Total perccentprice_ate:", total_perccentprice_ate)
print("Total percentsugar_ate:", total_percentsugar_ate)


Total perccentprice_ate: 0.4791284430716149
Total percentsugar_ate: 0.3265442567210482
