In [33]:
import math

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [34]:
# Step 1: Data Preprocessing
# Read the three input tables from CSV files (paths are relative to the
# notebook's working directory).
boxes, purchase, problem = (
    pd.read_csv(csv_name)
    for csv_name in ("boxes.csv", "purchase.csv", "problem 3.csv")
)

In [35]:
# Count missing values per column of the boxes table.
boxes.isna().sum()

BOX_ID             0
QUALITY            0
DELIVERY_OPTION    0
MILK               0
MEAT               0
UNIT_PRICE         0
dtype: int64

In [36]:
# Count missing values per column of the purchase table.
purchase.isna().sum()

PURCHASE_DATE     0
MAGIC_KEY         0
BOX_ID           47
BOX_COUNT        47
dtype: int64

In [37]:
# Fill each missing BOX_ID with the last non-missing BOX_ID above it.
# NOTE(review): this attributes a missing purchase to whatever box the
# preceding row had - confirm that is the intended imputation for an identifier.
purchase['BOX_ID'] = purchase['BOX_ID'].ffill()

In [38]:
# Rows with no BOX_COUNT are filled with a count of 1.
purchase['BOX_COUNT'] = purchase['BOX_COUNT'].fillna(value=1)

In [39]:
# purchase = purchase.dropna()

In [40]:
# Re-check missing values per column after the fills above.
purchase.isna().sum()

PURCHASE_DATE    0
MAGIC_KEY        0
BOX_ID           0
BOX_COUNT        0
dtype: int64

In [41]:
# Convert categorical variables to numerical representations
# boxes = pd.get_dummies(boxes, columns=['QUALITY', 'DELIVERY_OPTION'])

# Parse PURCHASE_DATE (day/month/year text) once, then derive the calendar
# parts from the parsed values.
purchase_dates = pd.to_datetime(purchase['PURCHASE_DATE'], format='%d/%m/%Y')
purchase['PURCHASE_DATE'] = purchase_dates
purchase['DAY'] = purchase_dates.dt.day
purchase['MONTH'] = purchase_dates.dt.month
purchase['YEAR'] = purchase_dates.dt.year

# Filter purchases made in the first 15 days of March 2019
# march_purchases = purchase[purchase['BOX_COUNT']>0]

In [42]:
# march_purchases

In [43]:
# Attach the box attributes to every purchase row; the inner join keeps only
# purchases whose BOX_ID exists in the boxes table.
merged_df = purchase.merge(boxes, on='BOX_ID', how='inner')

In [44]:
# Display the merged purchase/box frame.
merged_df

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT,DAY,MONTH,YEAR,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,2019-02-01,2CED678A247,12.0,1.0,1,2,2019,Premium,Home Delivery - CoD,8.0,1.5,12.98
1,2019-02-01,2BF58D91BA1,12.0,1.0,1,2,2019,Premium,Home Delivery - CoD,8.0,1.5,12.98
2,2019-02-01,2C15B86534E,99.0,1.0,1,2,2019,Premium,Delivery from Collection Point,0.0,3.3,13.96
3,2019-02-01,2C32D9A859A,6.0,1.0,1,2,2019,Premium,Home Delivery - CoD,0.0,2.7,11.96
4,2019-02-01,2C7A55404D1,4.0,1.0,1,2,2019,Premium,Home Delivery - CoD,0.0,2.5,11.96
...,...,...,...,...,...,...,...,...,...,...,...,...
2455842,2018-10-28,2BD992B5538,12.0,1.0,28,10,2018,Premium,Home Delivery - CoD,8.0,1.5,12.98
2455843,2018-10-28,2C97CD72233,17.0,1.0,28,10,2018,Premium,Home Delivery - CoD,10.0,1.8,12.98
2455844,2018-10-28,2C91C61D372,40.0,1.0,28,10,2018,Premium,Home Delivery - CoD,12.0,1.8,19.98
2455845,2018-10-28,2CD70CFC4E3,51.0,1.0,28,10,2018,Premium,Home Delivery - CoD,18.0,2.9,23.98


In [45]:
# Integer-encode the two categorical box attributes. Each column gets its own
# fitted encoder so the code -> label mapping stays available through
# encoders[column].classes_ / inverse_transform; re-fitting one shared encoder
# would overwrite the QUALITY mapping as soon as DELIVERY_OPTION is fitted.
encoders = {}
for column in ('QUALITY', 'DELIVERY_OPTION'):
    encoders[column] = LabelEncoder()
    merged_df[column] = encoders[column].fit_transform(merged_df[column])

# Kept for backward compatibility: `le` ends up fitted on DELIVERY_OPTION,
# the last column it was fitted on in the single-encoder version.
le = encoders['DELIVERY_OPTION']

In [46]:
# Display the frame again, now with QUALITY and DELIVERY_OPTION label-encoded.
merged_df

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT,DAY,MONTH,YEAR,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,2019-02-01,2CED678A247,12.0,1.0,1,2,2019,0,1,8.0,1.5,12.98
1,2019-02-01,2BF58D91BA1,12.0,1.0,1,2,2019,0,1,8.0,1.5,12.98
2,2019-02-01,2C15B86534E,99.0,1.0,1,2,2019,0,0,0.0,3.3,13.96
3,2019-02-01,2C32D9A859A,6.0,1.0,1,2,2019,0,1,0.0,2.7,11.96
4,2019-02-01,2C7A55404D1,4.0,1.0,1,2,2019,0,1,0.0,2.5,11.96
...,...,...,...,...,...,...,...,...,...,...,...,...
2455842,2018-10-28,2BD992B5538,12.0,1.0,28,10,2018,0,1,8.0,1.5,12.98
2455843,2018-10-28,2C97CD72233,17.0,1.0,28,10,2018,0,1,10.0,1.8,12.98
2455844,2018-10-28,2C91C61D372,40.0,1.0,28,10,2018,0,1,12.0,1.8,19.98
2455845,2018-10-28,2CD70CFC4E3,51.0,1.0,28,10,2018,0,1,18.0,2.9,23.98


In [47]:
# Total MEAT per distinct combination of customer key, box count, box and box
# attributes. Because the grouping uses all six columns, one MAGIC_KEY can
# appear in several rows of the result.
SUMMARY_KEYS = ['MAGIC_KEY', 'BOX_COUNT', 'BOX_ID', 'UNIT_PRICE', 'QUALITY', 'DELIVERY_OPTION']
purchase_summary = merged_df.groupby(SUMMARY_KEYS)['MEAT'].sum().reset_index()

In [48]:
# Target column: the summed MEAT where it is strictly positive, otherwise 0.0
# (missing values also become 0.0). Computed column-wise; a row-wise
# apply(axis=1) would call a Python function once for each of the ~1.85M rows.
purchase_summary['MEAT_BUY'] = purchase_summary['MEAT'].where(purchase_summary['MEAT'] > 0, 0.0)

In [49]:
# Preview one row per MAGIC_KEY. The result is only displayed, not assigned,
# so purchase_summary itself still holds every row.
purchase_summary.drop_duplicates(subset=['MAGIC_KEY'])

Unnamed: 0,MAGIC_KEY,BOX_COUNT,BOX_ID,UNIT_PRICE,QUALITY,DELIVERY_OPTION,MEAT,MEAT_BUY
0,249670911D8,1.0,231.0,10.14,1,0,0.0,0.0
2,249751FC4DD,1.0,260.0,17.98,1,0,1.8,1.8
3,24978027606,1.0,27.0,15.96,0,1,2.9,2.9
4,24979164422,1.0,246.0,13.96,1,0,2.5,2.5
5,2497B8B4FDA,1.0,255.0,15.96,1,0,3.6,3.6
...,...,...,...,...,...,...,...,...
1852651,2E6F72C6F1C,1.0,126.0,15.98,0,0,3.0,3.0
1852653,2E6F8194908,1.0,76.0,11.96,0,0,2.4,2.4
1852654,2E6F9C7B9B4,1.0,144.0,19.98,0,0,2.2,2.2
1852655,2E6FB0EBB32,1.0,258.0,17.98,1,0,11.0,11.0


In [50]:
# MEAT_BUY was derived from MEAT, so the raw column is no longer needed.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising KeyError once the column is already gone.
purchase_summary = purchase_summary.drop(columns=['MEAT'], errors='ignore')

In [51]:
# Show column dtypes and memory usage of the summary frame.
purchase_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852658 entries, 0 to 1852657
Data columns (total 7 columns):
 #   Column           Dtype  
---  ------           -----  
 0   MAGIC_KEY        object 
 1   BOX_COUNT        float64
 2   BOX_ID           float64
 3   UNIT_PRICE       float64
 4   QUALITY          int32  
 5   DELIVERY_OPTION  int32  
 6   MEAT_BUY         float64
dtypes: float64(4), int32(2), object(1)
memory usage: 84.8+ MB


In [52]:
# Left-join the summary rows onto the problem keys: every row of `problem` is
# kept, and a key with several summary rows yields several output rows.
problem_data = problem.merge(purchase_summary, how='left', on='MAGIC_KEY')

In [53]:
# Show row count, non-null counts and dtypes after the left join.
problem_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9072 entries, 0 to 9071
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MAGIC_KEY        9072 non-null   object 
 1   BOX_COUNT        9072 non-null   float64
 2   BOX_ID           9072 non-null   float64
 3   UNIT_PRICE       9072 non-null   float64
 4   QUALITY          9072 non-null   int32  
 5   DELIVERY_OPTION  9072 non-null   int32  
 6   MEAT_BUY         9072 non-null   float64
dtypes: float64(4), int32(2), object(1)
memory usage: 425.4+ KB


In [54]:
# Count distinct values per column (MAGIC_KEY shows how many unique customers there are).
problem_data.nunique()

MAGIC_KEY          5379
BOX_COUNT            11
BOX_ID              167
UNIT_PRICE           15
QUALITY               2
DELIVERY_OPTION       3
MEAT_BUY             92
dtype: int64

In [55]:
# Reduce to one row per MAGIC_KEY, keeping the first row encountered for each key.
# NOTE(review): the other summary rows for that key are discarded - confirm
# that the first row is the one that should represent the customer.
problem_data = problem_data.drop_duplicates(subset='MAGIC_KEY', keep='first')

In [56]:
# Confirm the frame now has one row per MAGIC_KEY.
problem_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5379 entries, 0 to 9070
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MAGIC_KEY        5379 non-null   object 
 1   BOX_COUNT        5379 non-null   float64
 2   BOX_ID           5379 non-null   float64
 3   UNIT_PRICE       5379 non-null   float64
 4   QUALITY          5379 non-null   int32  
 5   DELIVERY_OPTION  5379 non-null   int32  
 6   MEAT_BUY         5379 non-null   float64
dtypes: float64(4), int32(2), object(1)
memory usage: 294.2+ KB


In [103]:
# Separate the target (MEAT_BUY) from the predictors. The identifier, the
# target itself, BOX_ID and QUALITY are excluded, which leaves BOX_COUNT,
# UNIT_PRICE and DELIVERY_OPTION as features.
y = problem_data['MEAT_BUY']
X = problem_data.drop(columns=['MAGIC_KEY', 'MEAT_BUY', 'BOX_ID', 'QUALITY'])

In [104]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest *regressor* (MEAT_BUY is continuous). The variable
# keeps the name `rf_classifier` because later cells refer to it.
rf_classifier = RandomForestRegressor(
    n_estimators=60,
    min_samples_split=14,
    min_samples_leaf=3,
    max_samples=3000,   # rows drawn (with bootstrap) to fit each tree
    max_features=0.5,   # fraction of the features considered at each split
    max_depth=None,     # no explicit depth limit
    random_state=42,    # seed the forest so the reported RMSE is repeatable across runs
)
rf_classifier.fit(X_train, y_train)


In [105]:
# from sklearn.model_selection import RandomizedSearchCV
# 
# rf_grid = {"n_estimators": np.arange(10, 100, 10),
#            "max_depth": [None, 3, 5, 10], 
#            "min_samples_split": np.arange(2, 20, 2),
#            "min_samples_leaf": np.arange(1, 20, 2),
#            "max_features": [0.5, 1, "sqrt", "auto"],
#            "max_samples": [3000]}
# 
# rs_model = RandomizedSearchCV()
# 
# rs_model.fit(X_train, y_train)

In [106]:
# rs_model.best_params_

In [107]:
# Step 4: Model Evaluation
# Predict on the held-out test rows.
y_pred = rf_classifier.predict(X_test)

# Root-mean-squared error between the held-out targets and the predictions.
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)



RMSE: 2.009849129929414


In [62]:
# Predict for every row of X, which includes the rows the model was fitted on
# (X_train was split out of X).
# NOTE(review): this cell's execution count (62) is lower than that of the
# training cells (103-107), so any stored result may predate the current
# model - re-run the notebook top to bottom before using the submission.
predictions = rf_classifier.predict(X)

In [63]:
# Number of predictions produced (one per row of X).
len(predictions)

5379

In [64]:
# Round the predictions to one decimal place.
predictions = np.round(predictions, decimals=1)

In [65]:
# Build the submission with one row per row of `problem`.
# `predictions` is ordered like problem_data (one row per MAGIC_KEY), not
# necessarily like `problem`, so each prediction is looked up by key instead
# of being paired by position. pd.Series raises if the number of predictions
# does not match the number of keys in problem_data.
meat_by_key = pd.Series(predictions, index=problem_data['MAGIC_KEY'].to_numpy())
submission = pd.DataFrame({
    'MAGIC_KEY': problem['MAGIC_KEY'],
    'MEAT': problem['MAGIC_KEY'].map(meat_by_key),
})

In [66]:
# Write the submission to submission.csv without the DataFrame index column.
submission.to_csv('submission.csv', index=False)