# PROBLEM 1
We need to predict which of the Magic Keys given in “problem 1.csv” will buy milk and/or meat in the first 15 days of March-2019. We have to put Y in the purchase column if the Magic Keys will purchase and N if the Magic Keys will not make a purchase. 

In [1]:
import pandas as pd
import numpy as np

## Data Preprocessing

In [2]:
purchase_df = pd.read_csv("purchase.csv")
boxes_df = pd.read_csv("boxes.csv")
problem_df = pd.read_csv("problem 1.csv")

In [3]:
# Utility Function

def analyze(df):
    print(f"Number of rows: {len(df)}")    
    print(f"Number of columns: {len(df.columns)}")
    
    info_df = pd.DataFrame({
        "NUNIQUE": df.nunique(),
        "DTYPE": df.dtypes
    })
    
    print()
    print(info_df)

### `purchase_df`

In [4]:
purchase_df.head()

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT
0,1/2/2019,2CED678A247,12.0,1.0
1,1/2/2019,2BF58D91BA1,12.0,1.0
2,1/2/2019,2C15B86534E,99.0,1.0
3,1/2/2019,2C32D9A859A,6.0,1.0
4,1/2/2019,2C7A55404D1,4.0,1.0


In [5]:
analyze(purchase_df)

Number of rows: 2455864
Number of columns: 4

               NUNIQUE    DTYPE
PURCHASE_DATE      151   object
MAGIC_KEY      1274108   object
BOX_ID             291  float64
BOX_COUNT           14  float64


In [6]:
purchase_df["MAGIC_KEY"] = purchase_df["MAGIC_KEY"].astype("category")
purchase_df["PURCHASE_DATE"] = pd.to_datetime(purchase_df["PURCHASE_DATE"],
                                              format="%d/%m/%Y")

In [7]:
purchase_df.head()

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT
0,2019-02-01,2CED678A247,12.0,1.0
1,2019-02-01,2BF58D91BA1,12.0,1.0
2,2019-02-01,2C15B86534E,99.0,1.0
3,2019-02-01,2C32D9A859A,6.0,1.0
4,2019-02-01,2C7A55404D1,4.0,1.0


### `boxes_df`

In [8]:
boxes_df.head()

Unnamed: 0,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,1,Premium,Home Delivery - CoD,0.0,2.7,9.96
1,2,Premium,Home Delivery - CoD,0.0,2.3,11.96
2,3,Premium,Home Delivery - CoD,0.0,2.4,11.96
3,4,Premium,Home Delivery - CoD,0.0,2.5,11.96
4,5,Premium,Home Delivery - CoD,0.0,2.6,11.96


In [9]:
analyze(boxes_df)

Number of rows: 290
Number of columns: 6

                 NUNIQUE    DTYPE
BOX_ID               290    int64
QUALITY                2   object
DELIVERY_OPTION        3   object
MILK                  28  float64
MEAT                  39  float64
UNIT_PRICE            18  float64


In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

boxes_df["QUALITY"] = le.fit_transform(boxes_df["QUALITY"])
boxes_df["DELIVERY_OPTION"] = le.fit_transform(boxes_df["DELIVERY_OPTION"])

In [11]:
analyze(boxes_df)

Number of rows: 290
Number of columns: 6

                 NUNIQUE    DTYPE
BOX_ID               290    int64
QUALITY                2    int32
DELIVERY_OPTION        3    int32
MILK                  28  float64
MEAT                  39  float64
UNIT_PRICE            18  float64


In [12]:
purchase_df["EN_MAGIC_KEY"] = le.fit_transform(purchase_df["MAGIC_KEY"])

In [13]:
purchase_df["YEAR"] = purchase_df["PURCHASE_DATE"].dt.year
purchase_df["MONTH"] = purchase_df["PURCHASE_DATE"].dt.month
purchase_df["DAY"] = purchase_df["PURCHASE_DATE"].dt.day

purchase_df = purchase_df.drop(columns=["PURCHASE_DATE"])

In [14]:
purchase_df.head()

Unnamed: 0,MAGIC_KEY,BOX_ID,BOX_COUNT,EN_MAGIC_KEY,YEAR,MONTH,DAY
0,2CED678A247,12.0,1.0,1222234,2019,2,1
1,2BF58D91BA1,12.0,1.0,526391,2019,2,1
2,2C15B86534E,99.0,1.0,624102,2019,2,1
3,2C32D9A859A,6.0,1.0,708928,2019,2,1
4,2C7A55404D1,4.0,1.0,870797,2019,2,1


### `merged_df`

In [15]:
merged_df = pd.merge(purchase_df, boxes_df, on="BOX_ID", how="left")

In [16]:
merged_df

Unnamed: 0,MAGIC_KEY,BOX_ID,BOX_COUNT,EN_MAGIC_KEY,YEAR,MONTH,DAY,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,2CED678A247,12.0,1.0,1222234,2019,2,1,0.0,1.0,8.0,1.5,12.98
1,2BF58D91BA1,12.0,1.0,526391,2019,2,1,0.0,1.0,8.0,1.5,12.98
2,2C15B86534E,99.0,1.0,624102,2019,2,1,0.0,0.0,0.0,3.3,13.96
3,2C32D9A859A,6.0,1.0,708928,2019,2,1,0.0,1.0,0.0,2.7,11.96
4,2C7A55404D1,4.0,1.0,870797,2019,2,1,0.0,1.0,0.0,2.5,11.96
...,...,...,...,...,...,...,...,...,...,...,...,...
2455859,2BD992B5538,12.0,1.0,425064,2018,10,28,0.0,1.0,8.0,1.5,12.98
2455860,2C97CD72233,17.0,1.0,950422,2018,10,28,0.0,1.0,10.0,1.8,12.98
2455861,2C91C61D372,40.0,1.0,933236,2018,10,28,0.0,1.0,12.0,1.8,19.98
2455862,2CD70CFC4E3,51.0,1.0,1145776,2018,10,28,0.0,1.0,18.0,2.9,23.98


In [17]:
# No. of rows per column having NaN value

merged_df.isna().sum()

MAGIC_KEY           0
BOX_ID             47
BOX_COUNT          47
EN_MAGIC_KEY        0
YEAR                0
MONTH               0
DAY                 0
QUALITY            64
DELIVERY_OPTION    64
MILK               64
MEAT               64
UNIT_PRICE         64
dtype: int64

In [18]:
merged_df[merged_df.isna().any(axis=1)]

Unnamed: 0,MAGIC_KEY,BOX_ID,BOX_COUNT,EN_MAGIC_KEY,YEAR,MONTH,DAY,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
177,28FF265F082,,,146179,2019,2,1,,,,,
178,2CB168CDFA3,,,1029966,2019,2,1,,,,,
179,2C2C8844F09,,,689457,2019,2,1,,,,,
180,2BDCCEF05A4,,,437086,2019,2,1,,,,,
181,2BF1D98D0B2,,,514526,2019,2,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2188029,2BE34886311,,,464138,2018,10,9,,,,,
2228380,2BE266F8C55,,,460626,2018,10,18,,,,,
2240877,2BDC3619EAF,,,434755,2018,10,18,,,,,
2243419,2BF1D98D0B2,,,514526,2018,10,17,,,,,


In [19]:
merged_df = merged_df.dropna(subset=["BOX_ID"])

In [20]:
merged_df["BOX_COUNT"].value_counts()

BOX_COUNT
 1.0     2453363
 2.0        2283
 3.0          85
 4.0          30
-1.0          17
 6.0          17
 5.0          10
 7.0           3
 9.0           2
 19.0          2
 11.0          2
 8.0           1
 10.0          1
 13.0          1
Name: count, dtype: int64

In [21]:
# As there are 17 rows of BOX_COUNT that are -1.0, we turn them into 0.0 along with MILK and MEAT
# These are the customers that didn't buy anything

# merged_df = merged_df.loc[merged_df["BOX_COUNT"] != -1.0]
merged_df.loc[merged_df["BOX_COUNT"] == -1.0, ["BOX_COUNT", "MILK", "MEAT"]] = 0.0

In [22]:
merged_df

Unnamed: 0,MAGIC_KEY,BOX_ID,BOX_COUNT,EN_MAGIC_KEY,YEAR,MONTH,DAY,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,2CED678A247,12.0,1.0,1222234,2019,2,1,0.0,1.0,8.0,1.5,12.98
1,2BF58D91BA1,12.0,1.0,526391,2019,2,1,0.0,1.0,8.0,1.5,12.98
2,2C15B86534E,99.0,1.0,624102,2019,2,1,0.0,0.0,0.0,3.3,13.96
3,2C32D9A859A,6.0,1.0,708928,2019,2,1,0.0,1.0,0.0,2.7,11.96
4,2C7A55404D1,4.0,1.0,870797,2019,2,1,0.0,1.0,0.0,2.5,11.96
...,...,...,...,...,...,...,...,...,...,...,...,...
2455859,2BD992B5538,12.0,1.0,425064,2018,10,28,0.0,1.0,8.0,1.5,12.98
2455860,2C97CD72233,17.0,1.0,950422,2018,10,28,0.0,1.0,10.0,1.8,12.98
2455861,2C91C61D372,40.0,1.0,933236,2018,10,28,0.0,1.0,12.0,1.8,19.98
2455862,2CD70CFC4E3,51.0,1.0,1145776,2018,10,28,0.0,1.0,18.0,2.9,23.98


In [23]:
# Fill NaN values 

merged_df.loc[merged_df["QUALITY"].isna(), "QUALITY"] = merged_df["QUALITY"].median()
merged_df.loc[merged_df["DELIVERY_OPTION"].isna(), "DELIVERY_OPTION"] = merged_df["DELIVERY_OPTION"].max()
merged_df.loc[merged_df["MILK"].isna(), "MILK"] = merged_df["MILK"].mean()
merged_df.loc[merged_df["MEAT"].isna(), "MEAT"] = merged_df["MEAT"].mean()
merged_df.loc[merged_df["UNIT_PRICE"].isna(), "UNIT_PRICE"] = merged_df["UNIT_PRICE"].mean()

In [24]:
merged_df.isna().sum()

MAGIC_KEY          0
BOX_ID             0
BOX_COUNT          0
EN_MAGIC_KEY       0
YEAR               0
MONTH              0
DAY                0
QUALITY            0
DELIVERY_OPTION    0
MILK               0
MEAT               0
UNIT_PRICE         0
dtype: int64

In [25]:
# Are all MAGIC_KEY of problem_df present in merged_df?

problem_df["MAGIC_KEY"].isin(merged_df["MAGIC_KEY"]).all()

True

In [26]:
merged_df.head()

Unnamed: 0,MAGIC_KEY,BOX_ID,BOX_COUNT,EN_MAGIC_KEY,YEAR,MONTH,DAY,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,2CED678A247,12.0,1.0,1222234,2019,2,1,0.0,1.0,8.0,1.5,12.98
1,2BF58D91BA1,12.0,1.0,526391,2019,2,1,0.0,1.0,8.0,1.5,12.98
2,2C15B86534E,99.0,1.0,624102,2019,2,1,0.0,0.0,0.0,3.3,13.96
3,2C32D9A859A,6.0,1.0,708928,2019,2,1,0.0,1.0,0.0,2.7,11.96
4,2C7A55404D1,4.0,1.0,870797,2019,2,1,0.0,1.0,0.0,2.5,11.96


In [27]:
merged_df["PURCHASE"] = merged_df["MILK"] + merged_df["MEAT"]
merged_df.loc[merged_df["PURCHASE"] > 0, "PURCHASE"] = 1.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df["PURCHASE"] = merged_df["MILK"] + merged_df["MEAT"]


In [28]:
merged_df["BOX_PRICE"] = merged_df["BOX_COUNT"] * merged_df["UNIT_PRICE"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df["BOX_PRICE"] = merged_df["BOX_COUNT"] * merged_df["UNIT_PRICE"]


In [29]:
analyze(merged_df)

Number of rows: 2455817
Number of columns: 14

                 NUNIQUE     DTYPE
MAGIC_KEY        1274094  category
BOX_ID               291   float64
BOX_COUNT             14   float64
EN_MAGIC_KEY     1274094     int32
YEAR                   2     int32
MONTH                  5     int32
DAY                   31     int32
QUALITY                2   float64
DELIVERY_OPTION        3   float64
MILK                  28   float64
MEAT                  39   float64
UNIT_PRICE            19   float64
PURCHASE               2   float64
BOX_PRICE             60   float64


In [30]:
merged_df = merged_df.drop(columns=["BOX_ID",
                                    "BOX_COUNT",
                                    "MILK", "MEAT",
                                    "UNIT_PRICE"])

merged_df = merged_df.reindex(columns=["MAGIC_KEY",
                                       "EN_MAGIC_KEY",
                                       "YEAR", "MONTH", "DAY",
                                       "QUALITY", "DELIVERY_OPTION",
                                       "BOX_PRICE", "PURCHASE"])

In [31]:
merged_df

Unnamed: 0,MAGIC_KEY,EN_MAGIC_KEY,YEAR,MONTH,DAY,QUALITY,DELIVERY_OPTION,BOX_PRICE,PURCHASE
0,2CED678A247,1222234,2019,2,1,0.0,1.0,12.98,1.0
1,2BF58D91BA1,526391,2019,2,1,0.0,1.0,12.98,1.0
2,2C15B86534E,624102,2019,2,1,0.0,0.0,13.96,1.0
3,2C32D9A859A,708928,2019,2,1,0.0,1.0,11.96,1.0
4,2C7A55404D1,870797,2019,2,1,0.0,1.0,11.96,1.0
...,...,...,...,...,...,...,...,...,...
2455859,2BD992B5538,425064,2018,10,28,0.0,1.0,12.98,1.0
2455860,2C97CD72233,950422,2018,10,28,0.0,1.0,12.98,1.0
2455861,2C91C61D372,933236,2018,10,28,0.0,1.0,19.98,1.0
2455862,2CD70CFC4E3,1145776,2018,10,28,0.0,1.0,23.98,1.0


In [32]:
# We filter out the data for only first 15 days of each month

filtered_df = merged_df[merged_df["DAY"] <= 15]

In [33]:
# Since we filtered out the data of first 15 days of each month, we no longer require DAY column

filtered_df = filtered_df.drop(columns=["DAY"])

In [34]:
filtered_df

Unnamed: 0,MAGIC_KEY,EN_MAGIC_KEY,YEAR,MONTH,QUALITY,DELIVERY_OPTION,BOX_PRICE,PURCHASE
0,2CED678A247,1222234,2019,2,0.0,1.0,12.98,1.0
1,2BF58D91BA1,526391,2019,2,0.0,1.0,12.98,1.0
2,2C15B86534E,624102,2019,2,0.0,0.0,13.96,1.0
3,2C32D9A859A,708928,2019,2,0.0,1.0,11.96,1.0
4,2C7A55404D1,870797,2019,2,0.0,1.0,11.96,1.0
...,...,...,...,...,...,...,...,...
2299516,2C60F8DFD12,808999,2018,10,0.0,0.0,15.96,1.0
2299517,2C2951E4931,680325,2018,10,0.0,0.0,19.98,1.0
2299518,2A0970C954B,330529,2018,10,1.0,0.0,10.14,1.0
2299519,2CDBF128CAC,1163441,2018,10,0.0,0.0,12.18,1.0


In [35]:
analyze(filtered_df)

Number of rows: 1240412
Number of columns: 8

                 NUNIQUE     DTYPE
MAGIC_KEY         792062  category
EN_MAGIC_KEY      792062     int32
YEAR                   2     int32
MONTH                  5     int32
QUALITY                2   float64
DELIVERY_OPTION        3   float64
BOX_PRICE             45   float64
PURCHASE               2   float64


In [36]:
# Are all MAGIC_KEY of problem_df present in filtered_df?

problem_df["MAGIC_KEY"].isin(filtered_df["MAGIC_KEY"]).all()

False

It appears that not all `MAGIC_KEY` of `problem_df` are present in `filtered_df`. It can only mean that those absent `MAGIC_KEY` never purchased anything in the first 15 days of previous 5 months.

## Model Training

In [37]:
from sklearn.model_selection import train_test_split

np.random.seed(28)

X = filtered_df.drop(columns=["MAGIC_KEY", "PURCHASE"], axis=1)
y = filtered_df["PURCHASE"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model = RandomForestClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         7
         1.0       1.00      1.00      1.00    372117

    accuracy                           1.00    372124
   macro avg       1.00      1.00      1.00    372124
weighted avg       1.00      1.00      1.00    372124



In [39]:
rf_grid = {"n_estimators": np.arange(10, 1000, 50),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}

In [40]:
from sklearn.model_selection import RandomizedSearchCV

np.random.seed(42)

rs_model = RandomizedSearchCV(RandomForestClassifier(),
                              param_distributions=rf_grid,
                              cv=5,
                              n_iter=20,
                              verbose=True)

rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [41]:
rs_model.best_params_

{'n_estimators': 610,
 'min_samples_split': 18,
 'min_samples_leaf': 1,
 'max_depth': 5}

In [42]:
y_pred = rs_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         7
         1.0       1.00      1.00      1.00    372117

    accuracy                           1.00    372124
   macro avg       1.00      1.00      1.00    372124
weighted avg       1.00      1.00      1.00    372124

