In [539]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import r2_score
import numpy as np

In [540]:
# Step 1: Data Preprocessing
# Load data
boxes = pd.read_csv("boxes.csv")
purchase = pd.read_csv("purchase.csv")
problem = pd.read_csv("problem 3.csv")

In [541]:
boxes.isna().sum()

BOX_ID             0
QUALITY            0
DELIVERY_OPTION    0
MILK               0
MEAT               0
UNIT_PRICE         0
dtype: int64

In [542]:
purchase.isna().sum()

PURCHASE_DATE     0
MAGIC_KEY         0
BOX_ID           47
BOX_COUNT        47
dtype: int64

In [543]:
purchase = purchase.dropna()

In [544]:
purchase.isna().sum()

PURCHASE_DATE    0
MAGIC_KEY        0
BOX_ID           0
BOX_COUNT        0
dtype: int64

In [545]:
# Convert categorical variables to numerical representations
boxes = pd.get_dummies(boxes, columns=['QUALITY', 'DELIVERY_OPTION'])

# Extract features from the date column
purchase['PURCHASE_DATE'] = pd.to_datetime(purchase['PURCHASE_DATE'],format='%d/%m/%Y')
purchase['DAY'] = purchase['PURCHASE_DATE'].dt.day
purchase['MONTH'] = purchase['PURCHASE_DATE'].dt.month
purchase['YEAR'] = purchase['PURCHASE_DATE'].dt.year

# Filter purchases made in the first 15 days of March 2019
march_purchases = purchase[purchase['BOX_COUNT']>0]

In [546]:
march_purchases

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT,DAY,MONTH,YEAR
0,2019-02-01,2CED678A247,12.0,1.0,1,2,2019
1,2019-02-01,2BF58D91BA1,12.0,1.0,1,2,2019
2,2019-02-01,2C15B86534E,99.0,1.0,1,2,2019
3,2019-02-01,2C32D9A859A,6.0,1.0,1,2,2019
4,2019-02-01,2C7A55404D1,4.0,1.0,1,2,2019
...,...,...,...,...,...,...,...
2455859,2018-10-28,2BD992B5538,12.0,1.0,28,10,2018
2455860,2018-10-28,2C97CD72233,17.0,1.0,28,10,2018
2455861,2018-10-28,2C91C61D372,40.0,1.0,28,10,2018
2455862,2018-10-28,2CD70CFC4E3,51.0,1.0,28,10,2018


In [547]:
merged_df = pd.merge(march_purchases, boxes, on='BOX_ID')

In [548]:
merged_df

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT,DAY,MONTH,YEAR,MILK,MEAT,UNIT_PRICE,QUALITY_Premium,QUALITY_Standard,DELIVERY_OPTION_Delivery from Collection Point,DELIVERY_OPTION_Home Delivery - CoD,DELIVERY_OPTION_Home Delivery - Digital Payment
0,2019-02-01,2CED678A247,12.0,1.0,1,2,2019,8.0,1.5,12.98,True,False,False,True,False
1,2019-02-01,2BF58D91BA1,12.0,1.0,1,2,2019,8.0,1.5,12.98,True,False,False,True,False
2,2019-02-01,2C15B86534E,99.0,1.0,1,2,2019,0.0,3.3,13.96,True,False,True,False,False
3,2019-02-01,2C32D9A859A,6.0,1.0,1,2,2019,0.0,2.7,11.96,True,False,False,True,False
4,2019-02-01,2C7A55404D1,4.0,1.0,1,2,2019,0.0,2.5,11.96,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2455795,2018-10-28,2BD992B5538,12.0,1.0,28,10,2018,8.0,1.5,12.98,True,False,False,True,False
2455796,2018-10-28,2C97CD72233,17.0,1.0,28,10,2018,10.0,1.8,12.98,True,False,False,True,False
2455797,2018-10-28,2C91C61D372,40.0,1.0,28,10,2018,12.0,1.8,19.98,True,False,False,True,False
2455798,2018-10-28,2CD70CFC4E3,51.0,1.0,28,10,2018,18.0,2.9,23.98,True,False,False,True,False


In [549]:
# Group by MAGIC_KEY and check if they purchased milk or meat
purchase_summary = merged_df.groupby(['MAGIC_KEY','BOX_COUNT']).agg({'MEAT': 'sum'}).reset_index()

In [550]:
purchase_summary['PURCHASE'] = purchase_summary.apply(lambda row: 1.0 if row['MEAT'] > 0 else 0.0, axis=1)

In [551]:
purchase_summary.drop_duplicates(subset=['MAGIC_KEY'])

Unnamed: 0,MAGIC_KEY,BOX_COUNT,MEAT,PURCHASE
0,249670911D8,1.0,2.4,1.0
1,249751FC4DD,1.0,1.8,1.0
2,24978027606,1.0,2.9,1.0
3,24979164422,1.0,2.5,1.0
4,2497B8B4FDA,1.0,5.4,1.0
...,...,...,...,...
1275720,2E6F72C6F1C,1.0,4.8,1.0
1275721,2E6F8194908,1.0,2.4,1.0
1275722,2E6F9C7B9B4,1.0,2.2,1.0
1275723,2E6FB0EBB32,1.0,12.8,1.0


In [552]:
purchase_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275725 entries, 0 to 1275724
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   MAGIC_KEY  1275725 non-null  object 
 1   BOX_COUNT  1275725 non-null  float64
 2   MEAT       1275725 non-null  float64
 3   PURCHASE   1275725 non-null  float64
dtypes: float64(3), object(1)
memory usage: 38.9+ MB


In [553]:
problem_data = pd.merge(problem, purchase_summary, on='MAGIC_KEY', how='left')

In [554]:
problem_data.nunique()

MAGIC_KEY    5379
BOX_COUNT      11
MEAT          261
PURCHASE        2
dtype: int64

In [555]:
problem_data.drop_duplicates(subset=['MAGIC_KEY'])

Unnamed: 0,MAGIC_KEY,BOX_COUNT,MEAT,PURCHASE
0,2BCFE9C06A7,1.0,3.6,1.0
5,2C2A872B5A2,1.0,4.0,1.0
6,2C6A897671B,1.0,6.6,1.0
7,2C6F1287F53,1.0,13.8,1.0
12,2C658198CC9,1.0,6.6,1.0
...,...,...,...,...
5576,2C0804EFE49,1.0,2.5,1.0
5577,2C080B48630,1.0,0.0,0.0
5578,2C08243C58E,1.0,2.4,1.0
5579,2C082C78575,1.0,1.8,1.0


In [556]:
problem_data['MEAT'] = problem_data['MEAT'].fillna(problem_data['MEAT'].mean())
problem_data['PURCHASE'] = problem_data['PURCHASE'].fillna(1)

In [557]:
problem_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5581 entries, 0 to 5580
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MAGIC_KEY  5581 non-null   object 
 1   BOX_COUNT  5580 non-null   float64
 2   MEAT       5581 non-null   float64
 3   PURCHASE   5581 non-null   float64
dtypes: float64(3), object(1)
memory usage: 174.5+ KB


In [558]:
# Split data into features and target
X = problem_data.drop(['MAGIC_KEY', 'MEAT'], axis=1)
y = problem_data['MEAT']

In [559]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest classifier
rf_classifier = RandomForestRegressor()
rf_classifier.fit(X_train, y_train)


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
# rf_grid = {"n_estimators": np.arange(10, 1000, 50),
#            "max_depth": [None, 3, 5, 10],
#            "min_samples_split": np.arange(2, 20, 2),
#            "min_samples_leaf": np.arange(1, 20, 2)}

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# 
# np.random.seed(42)
# 
# rs_model = RandomizedSearchCV(RandomForestRegressor(),
#                               param_distributions=rf_grid,
#                               cv=5,
#                               n_iter=20,
#                               verbose=True)
# 
# rs_model.fit(X_train, y_train)

In [None]:
# rs_model.best_params_

In [None]:
# Step 4: Model Evaluation
# Predict on test set
y_pred = rf_classifier.predict(X_test)

# Evaluate model
accuracy = r2_score(y_test, y_pred)
print("Accuracy:", accuracy*100)


In [None]:
predictions = rf_classifier.predict(X)

In [None]:
len(predictions)

In [None]:
predictions = np.round(predictions,1)

In [None]:
submission = pd.DataFrame({
    'MAGIC_KEY': problem['MAGIC_KEY'],
    'MEAT': [pred for pred in predictions ]
})

In [None]:
submission.to_csv('submission.csv', index=False)