In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [42]:
# Step 1: Data Preprocessing
# Read the three input tables from CSV files in the working directory:
# box catalogue, purchase history, and the customers to score.
boxes, purchase, problem = (
    pd.read_csv(csv_name)
    for csv_name in ("boxes.csv", "purchase.csv", "problem 1.csv")
)

In [43]:
# Count missing values in each column of the box catalogue.
boxes.isna().sum()

BOX_ID             0
QUALITY            0
DELIVERY_OPTION    0
MILK               0
MEAT               0
UNIT_PRICE         0
dtype: int64

In [44]:
# Count missing values in each column of the purchase history.
purchase.isna().sum()

PURCHASE_DATE     0
MAGIC_KEY         0
BOX_ID           47
BOX_COUNT        47
dtype: int64

In [45]:
# Handle missing values.
# The only gaps in `purchase` are in BOX_ID / BOX_COUNT. Those rows are dropped
# instead of being filled with 0, because a fill would invent a BOX_ID of 0
# (a box that was never recorded) and a BOX_COUNT of 0.
# The date parts are added in the same chain so the cell does not mutate
# `purchase` in place and can be re-run safely.
purchase = (
    purchase
    .dropna(subset=['BOX_ID', 'BOX_COUNT'])
    .assign(
        # Dates are stored as day/month/year strings.
        PURCHASE_DATE=lambda df: pd.to_datetime(df['PURCHASE_DATE'], format='%d/%m/%Y'),
        DAY=lambda df: df['PURCHASE_DATE'].dt.day,
        MONTH=lambda df: df['PURCHASE_DATE'].dt.month,
        YEAR=lambda df: df['PURCHASE_DATE'].dt.year,
    )
)

# Convert categorical variables to numerical representations (one-hot columns).
# NOTE: `boxes` is overwritten, so re-running this cell without reloading the
# CSV fails because QUALITY / DELIVERY_OPTION no longer exist.
boxes = pd.get_dummies(boxes, columns=['QUALITY', 'DELIVERY_OPTION'])

# Filter purchases made in the first 15 days of February 2019 (MONTH == 2).
# NOTE(review): the variable name (and the earlier comment) say "March", but
# the filter selects February - confirm which month is intended. The name is
# kept because later cells reference it.
in_window = (
    (purchase['MONTH'] == 2)
    & (purchase['YEAR'] == 2019)
    & (purchase['DAY'] <= 15)
)
# .copy() makes the result an independent frame rather than a slice of `purchase`.
march_purchases = purchase[in_window].copy()


In [46]:
# Display the filtered purchases (pandas truncates the view to the first and last rows).
march_purchases

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT,DAY,MONTH,YEAR
0,2019-02-01,2CED678A247,12.0,1.0,1,2,2019
1,2019-02-01,2BF58D91BA1,12.0,1.0,1,2,2019
2,2019-02-01,2C15B86534E,99.0,1.0,1,2,2019
3,2019-02-01,2C32D9A859A,6.0,1.0,1,2,2019
4,2019-02-01,2C7A55404D1,4.0,1.0,1,2,2019
...,...,...,...,...,...,...,...
297060,2019-02-15,2C998B122B1,23.0,1.0,15,2,2019
297061,2019-02-15,2C1C22E43D5,80.0,1.0,15,2,2019
297062,2019-02-15,28FE30D9AF3,246.0,1.0,15,2,2019
297063,2019-02-15,28D27B1698C,213.0,1.0,15,2,2019


In [47]:
# Confirm that the filtered purchases contain no missing values.
march_purchases.isna().sum()

PURCHASE_DATE    0
MAGIC_KEY        0
BOX_ID           0
BOX_COUNT        0
DAY              0
MONTH            0
YEAR             0
dtype: int64

In [48]:
# Attach the box attributes to every purchase row. This is an inner join on
# BOX_ID, so purchases whose BOX_ID is not in `boxes` are dropped.
merged_df = march_purchases.merge(boxes, on='BOX_ID', how='inner')

In [49]:
# Display the purchases joined with their box attributes (truncated view).
merged_df

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT,DAY,MONTH,YEAR,MILK,MEAT,UNIT_PRICE,QUALITY_Premium,QUALITY_Standard,DELIVERY_OPTION_Delivery from Collection Point,DELIVERY_OPTION_Home Delivery - CoD,DELIVERY_OPTION_Home Delivery - Digital Payment
0,2019-02-01,2CED678A247,12.0,1.0,1,2,2019,8.0,1.5,12.98,True,False,False,True,False
1,2019-02-01,2BF58D91BA1,12.0,1.0,1,2,2019,8.0,1.5,12.98,True,False,False,True,False
2,2019-02-01,2C15B86534E,99.0,1.0,1,2,2019,0.0,3.3,13.96,True,False,True,False,False
3,2019-02-01,2C32D9A859A,6.0,1.0,1,2,2019,0.0,2.7,11.96,True,False,False,True,False
4,2019-02-01,2C7A55404D1,4.0,1.0,1,2,2019,0.0,2.5,11.96,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297038,2019-02-15,2C998B122B1,23.0,1.0,15,2,2019,0.0,2.4,13.96,True,False,False,True,False
297039,2019-02-15,2C1C22E43D5,80.0,1.0,15,2,2019,0.0,2.9,11.96,True,False,True,False,False
297040,2019-02-15,28FE30D9AF3,246.0,1.0,15,2,2019,0.0,2.5,13.96,False,True,True,False,False
297041,2019-02-15,28D27B1698C,213.0,1.0,15,2,2019,0.0,3.6,15.96,False,True,False,True,False


In [50]:
# Collapse the joined purchase rows to one row per customer (MAGIC_KEY),
# summing the milk, meat, box-count and unit-price columns.
summed_columns = ['MILK', 'MEAT', 'BOX_COUNT', 'UNIT_PRICE']
purchase_summary = (
    merged_df
    .groupby('MAGIC_KEY')[summed_columns]
    .sum()
    .reset_index()
)

In [51]:
# Label each customer: 1.0 when their summed rows contain any milk or meat
# AND at least one box in total, otherwise 0.0.
# Built from vectorized column comparisons instead of a row-wise apply:
# same labels, much faster on large frames, and well-defined on an empty frame.
bought_milk_or_meat = (purchase_summary['MILK'] > 0) | (purchase_summary['MEAT'] > 0)
bought_any_box = purchase_summary['BOX_COUNT'] >= 1
purchase_summary['PURCHASE'] = (bought_milk_or_meat & bought_any_box).astype(float)

In [52]:
# Confirm that the per-customer summary contains no missing values.
purchase_summary.isna().sum()

MAGIC_KEY     0
MILK          0
MEAT          0
BOX_COUNT     0
UNIT_PRICE    0
PURCHASE      0
dtype: int64

In [53]:
# Left-join the per-customer summary onto the customers to score, keeping
# every row of `problem`. Customers with no summary row get 0 in every
# summary column, including the PURCHASE label.
problem_data = (
    problem
    .merge(purchase_summary, on='MAGIC_KEY', how='left')
    .fillna(0)
)

In [54]:
# Print the column dtypes and non-null counts of the modelling table.
problem_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58689 entries, 0 to 58688
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MAGIC_KEY   58689 non-null  object 
 1   MILK        58689 non-null  float64
 2   MEAT        58689 non-null  float64
 3   BOX_COUNT   58689 non-null  float64
 4   UNIT_PRICE  58689 non-null  float64
 5   PURCHASE    58689 non-null  float64
dtypes: float64(5), object(1)
memory usage: 2.7+ MB


In [55]:
# Separate the target column from the feature columns.
# NOTE(review): PURCHASE is computed directly from MILK, MEAT and BOX_COUNT,
# and those same columns stay in X. The target is therefore a deterministic
# function of the features (target leakage), so any score measured on this
# data says nothing about predicting future purchases.
y = problem_data['PURCHASE']
X = problem_data.drop(columns=['MAGIC_KEY', 'PURCHASE'])

In [56]:
# One seed for both the split and the forest so the whole cell is repeatable.
RANDOM_STATE = 42

# Hold out 40% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)

# Train Random Forest classifier.
# random_state is set because the forest's bootstrap sampling and feature
# selection are random; without it every run fits a different model.
rf_classifier = RandomForestClassifier(
    n_estimators=610,
    min_samples_split=18,
    min_samples_leaf=1,
    max_depth=5,
    random_state=RANDOM_STATE,
)
rf_classifier.fit(X_train, y_train)


In [57]:
# Step 4: Model Evaluation
# Score the classifier on the held-out rows and report accuracy as a percentage.
# NOTE(review): the label is derived from the same columns that are used as
# features, so a perfect score here is expected and is not evidence of
# predictive power.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy*100)


Accuracy: 100.0


In [58]:
# Predict for every row of X, i.e. all customers in `problem_data`
# (this includes the rows the model was trained on).
predictions = rf_classifier.predict(X)


In [59]:
# Sanity check: number of predictions produced (one per row of X).
len(predictions)

58689

In [60]:
# Turn the numeric predictions into 'Y' / 'N' flags (positive -> 'Y') and
# write them next to each customer's MAGIC_KEY, without the index column.
purchase_flags = ['Y' if predicted_label > 0 else 'N' for predicted_label in predictions]
submission = problem[['MAGIC_KEY']].assign(purchase=purchase_flags)
submission.to_csv('submission6.csv', index=False)