# PROBLEM 1
We need to predict which of the Magic Keys given in “problem 1.csv” will buy milk and/or meat in the first 15 days of March-2019. We have to put Y in the purchase column if the Magic Keys will purchase and N if the Magic Keys will not make a purchase. 

## Data Preprocessing

In [26]:
import pandas as pd

purchase_df = pd.read_csv("purchase.csv")
boxes_df = pd.read_csv("boxes.csv")

In [27]:
def analyze(df):
    print(f"Number of rows: {len(df)}")    
    print(f"Number of columns: {len(df.columns)}")
    
    info_df = pd.DataFrame({
        "NUNIQUE": df.nunique(),
        "DTYPE": df.dtypes
    })
    
    print()
    print(info_df)

### Preprocessing `purchase_df`  

In [28]:
purchase_df.head()

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT
0,1/2/2019,2CED678A247,12.0,1.0
1,1/2/2019,2BF58D91BA1,12.0,1.0
2,1/2/2019,2C15B86534E,99.0,1.0
3,1/2/2019,2C32D9A859A,6.0,1.0
4,1/2/2019,2C7A55404D1,4.0,1.0


In [29]:
analyze(purchase_df)

Number of rows: 2455864
Number of columns: 4

               NUNIQUE    DTYPE
PURCHASE_DATE      151   object
MAGIC_KEY      1274108   object
BOX_ID             291  float64
BOX_COUNT           14  float64


In [30]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

purchase_df["MAGIC_KEY"] = purchase_df["MAGIC_KEY"].astype("category")
purchase_df["EN_MAGIC_KEY"] = le.fit_transform(purchase_df["MAGIC_KEY"])

In [31]:
purchase_df["PURCHASE_DATE"] = pd.to_datetime(purchase_df["PURCHASE_DATE"],
                                              format="%d/%m/%Y")

purchase_df["YEAR"] = purchase_df["PURCHASE_DATE"].dt.year
purchase_df["MONTH"] = purchase_df["PURCHASE_DATE"].dt.month
purchase_df["DAY"] = purchase_df["PURCHASE_DATE"].dt.day

purchase_df = purchase_df.drop(columns=["PURCHASE_DATE"])

In [32]:
purchase_df = purchase_df.reindex(columns=["MAGIC_KEY",
                                           "EN_MAGIC_KEY",
                                           "YEAR", "MONTH", "DAY",
                                           "BOX_COUNT", "BOX_ID"])

In [33]:
purchase_df.head()

Unnamed: 0,MAGIC_KEY,EN_MAGIC_KEY,YEAR,MONTH,DAY,BOX_COUNT,BOX_ID
0,2CED678A247,1222234,2019,2,1,1.0,12.0
1,2BF58D91BA1,526391,2019,2,1,1.0,12.0
2,2C15B86534E,624102,2019,2,1,1.0,99.0
3,2C32D9A859A,708928,2019,2,1,1.0,6.0
4,2C7A55404D1,870797,2019,2,1,1.0,4.0


In [34]:
analyze(purchase_df)

Number of rows: 2455864
Number of columns: 7

              NUNIQUE     DTYPE
MAGIC_KEY     1274108  category
EN_MAGIC_KEY  1274108     int32
YEAR                2     int32
MONTH               5     int32
DAY                31     int32
BOX_COUNT          14   float64
BOX_ID            291   float64


### Preprocessing `boxes_df`

In [35]:
boxes_df.head()

Unnamed: 0,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,1,Premium,Home Delivery - CoD,0.0,2.7,9.96
1,2,Premium,Home Delivery - CoD,0.0,2.3,11.96
2,3,Premium,Home Delivery - CoD,0.0,2.4,11.96
3,4,Premium,Home Delivery - CoD,0.0,2.5,11.96
4,5,Premium,Home Delivery - CoD,0.0,2.6,11.96


In [36]:
analyze(boxes_df)

Number of rows: 290
Number of columns: 6

                 NUNIQUE    DTYPE
BOX_ID               290    int64
QUALITY                2   object
DELIVERY_OPTION        3   object
MILK                  28  float64
MEAT                  39  float64
UNIT_PRICE            18  float64


In [37]:
boxes_df["QUALITY"] = le.fit_transform(boxes_df["QUALITY"])
boxes_df["DELIVERY_OPTION"] = le.fit_transform(boxes_df["DELIVERY_OPTION"])

In [38]:
boxes_df.head()

Unnamed: 0,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,1,0,1,0.0,2.7,9.96
1,2,0,1,0.0,2.3,11.96
2,3,0,1,0.0,2.4,11.96
3,4,0,1,0.0,2.5,11.96
4,5,0,1,0.0,2.6,11.96


In [39]:
analyze(boxes_df)

Number of rows: 290
Number of columns: 6

                 NUNIQUE    DTYPE
BOX_ID               290    int64
QUALITY                2    int32
DELIVERY_OPTION        3    int32
MILK                  28  float64
MEAT                  39  float64
UNIT_PRICE            18  float64


## Feature Engineering

In [40]:
merged_df = pd.merge(purchase_df, boxes_df, on="BOX_ID", how="left")

In [41]:
merged_df.head()

Unnamed: 0,MAGIC_KEY,EN_MAGIC_KEY,YEAR,MONTH,DAY,BOX_COUNT,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,2CED678A247,1222234,2019,2,1,1.0,12.0,0.0,1.0,8.0,1.5,12.98
1,2BF58D91BA1,526391,2019,2,1,1.0,12.0,0.0,1.0,8.0,1.5,12.98
2,2C15B86534E,624102,2019,2,1,1.0,99.0,0.0,0.0,0.0,3.3,13.96
3,2C32D9A859A,708928,2019,2,1,1.0,6.0,0.0,1.0,0.0,2.7,11.96
4,2C7A55404D1,870797,2019,2,1,1.0,4.0,0.0,1.0,0.0,2.5,11.96


In [42]:
analyze(merged_df)

Number of rows: 2455864
Number of columns: 12

                 NUNIQUE     DTYPE
MAGIC_KEY        1274108  category
EN_MAGIC_KEY     1274108     int32
YEAR                   2     int32
MONTH                  5     int32
DAY                   31     int32
BOX_COUNT             14   float64
BOX_ID               291   float64
QUALITY                2   float64
DELIVERY_OPTION        3   float64
MILK                  28   float64
MEAT                  39   float64
UNIT_PRICE            18   float64


In [43]:
filtered_df = merged_df[merged_df["DAY"] <= 15]

In [44]:
filtered_df.head()

Unnamed: 0,MAGIC_KEY,EN_MAGIC_KEY,YEAR,MONTH,DAY,BOX_COUNT,BOX_ID,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,2CED678A247,1222234,2019,2,1,1.0,12.0,0.0,1.0,8.0,1.5,12.98
1,2BF58D91BA1,526391,2019,2,1,1.0,12.0,0.0,1.0,8.0,1.5,12.98
2,2C15B86534E,624102,2019,2,1,1.0,99.0,0.0,0.0,0.0,3.3,13.96
3,2C32D9A859A,708928,2019,2,1,1.0,6.0,0.0,1.0,0.0,2.7,11.96
4,2C7A55404D1,870797,2019,2,1,1.0,4.0,0.0,1.0,0.0,2.5,11.96


In [45]:
analyze(filtered_df)

Number of rows: 1240444
Number of columns: 12

                 NUNIQUE     DTYPE
MAGIC_KEY         792076  category
EN_MAGIC_KEY      792076     int32
YEAR                   2     int32
MONTH                  5     int32
DAY                   15     int32
BOX_COUNT              5   float64
BOX_ID               257   float64
QUALITY                2   float64
DELIVERY_OPTION        3   float64
MILK                  26   float64
MEAT                  36   float64
UNIT_PRICE            18   float64


In [46]:
filtered_df["PURCHASE"] = filtered_df.apply(lambda row: True if row["MILK"] >= 0 or row["MEAT"] >= 0 else False, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["PURCHASE"] = filtered_df.apply(lambda row: True if row["MILK"] >= 0 or row["MEAT"] >= 0 else False, axis=1)


In [47]:
# filtered_df = filtered_df.drop(columns=["MILK", "MEAT", "BOX_ID"])

filtered_df = filtered_df.reindex(columns=["MAGIC_KEY", "EN_MAGIC_KEY",
                                       "YEAR", "MONTH", "DAY",
                                       "BOX_COUNT", "DELIVERY_OPTION",
                                       "QUALITY", "UNIT_PRICE",
                                       "MILK", "MEAT", "PURCHASE"])

In [49]:
filtered_df

Unnamed: 0,MAGIC_KEY,EN_MAGIC_KEY,YEAR,MONTH,DAY,BOX_COUNT,DELIVERY_OPTION,QUALITY,UNIT_PRICE,MILK,MEAT,PURCHASE
0,2CED678A247,1222234,2019,2,1,1.0,1.0,0.0,12.98,8.0,1.5,True
1,2BF58D91BA1,526391,2019,2,1,1.0,1.0,0.0,12.98,8.0,1.5,True
2,2C15B86534E,624102,2019,2,1,1.0,0.0,0.0,13.96,0.0,3.3,True
3,2C32D9A859A,708928,2019,2,1,1.0,1.0,0.0,11.96,0.0,2.7,True
4,2C7A55404D1,870797,2019,2,1,1.0,1.0,0.0,11.96,0.0,2.5,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2299516,2C60F8DFD12,808999,2018,10,14,1.0,0.0,0.0,15.96,0.0,3.3,True
2299517,2C2951E4931,680325,2018,10,14,1.0,0.0,0.0,19.98,10.0,1.8,True
2299518,2A0970C954B,330529,2018,10,14,1.0,0.0,1.0,10.14,8.5,0.0,True
2299519,2CDBF128CAC,1163441,2018,10,14,1.0,0.0,0.0,12.18,10.0,0.0,True


In [50]:
filtered_df[filtered_df["PURCHASE"] == False]

Unnamed: 0,MAGIC_KEY,EN_MAGIC_KEY,YEAR,MONTH,DAY,BOX_COUNT,DELIVERY_OPTION,QUALITY,UNIT_PRICE,MILK,MEAT,PURCHASE
177,28FF265F082,146179,2019,2,1,,,,,,,False
178,2CB168CDFA3,1029966,2019,2,1,,,,,,,False
179,2C2C8844F09,689457,2019,2,1,,,,,,,False
180,2BDCCEF05A4,437086,2019,2,1,,,,,,,False
181,2BF1D98D0B2,514526,2019,2,1,,,,,,,False
182,2BDC3619EAF,434755,2019,2,1,,,,,,,False
183,290D33249B7,183987,2019,2,1,,,,,,,False
184,2BE266F8C55,460626,2019,2,1,,,,,,,False
185,2903192D056,156323,2019,2,1,,,,,,,False
186,2BE34886311,464138,2019,2,1,,,,,,,False


In [76]:
analyze(filtered_df)

Number of rows: 1240444
Number of columns: 10

                 NUNIQUE     DTYPE
MAGIC_KEY         792076  category
EN_MAGIC_KEY      792076     int32
YEAR                   2     int32
MONTH                  5     int32
DAY                   15     int32
BOX_COUNT              5   float64
DELIVERY_OPTION        3   float64
QUALITY                2   float64
UNIT_PRICE            18   float64
PURCHASE               2      bool


## Model Training

In [77]:
from sklearn.model_selection import train_test_split

X = filtered_df.drop(columns=["MAGIC_KEY", "PURCHASE"])
y = filtered_df["PURCHASE"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

In [78]:
analyze(X)

Number of rows: 1240444
Number of columns: 8

                 NUNIQUE    DTYPE
EN_MAGIC_KEY      792076    int32
YEAR                   2    int32
MONTH                  5    int32
DAY                   15    int32
BOX_COUNT              5  float64
DELIVERY_OPTION        3  float64
QUALITY                2  float64
UNIT_PRICE            18  float64


In [79]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

model.fit(X_train, y_train)

### Evaluation

In [80]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

In [81]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00         3
        True       1.00      1.00      1.00    248086

    accuracy                           1.00    248089
   macro avg       1.00      1.00      1.00    248089
weighted avg       1.00      1.00      1.00    248089



## Prediction

In [82]:
problem_df = pd.read_csv("problem 1.csv")
analyze(problem_df)

Number of rows: 58689
Number of columns: 1

           NUNIQUE   DTYPE
MAGIC_KEY    58689  object


In [83]:
problem_df["MAGIC_KEY"] = problem_df["MAGIC_KEY"].astype("category")
analyze(problem_df)

Number of rows: 58689
Number of columns: 1

           NUNIQUE     DTYPE
MAGIC_KEY    58689  category


In [84]:
analyze(filtered_df)

Number of rows: 1240444
Number of columns: 10

                 NUNIQUE     DTYPE
MAGIC_KEY         792076  category
EN_MAGIC_KEY      792076     int32
YEAR                   2     int32
MONTH                  5     int32
DAY                   15     int32
BOX_COUNT              5   float64
DELIVERY_OPTION        3   float64
QUALITY                2   float64
UNIT_PRICE            18   float64
PURCHASE               2      bool


In [85]:
deep_filtered_df = filtered_df.drop_duplicates(subset='MAGIC_KEY')
prediction_df = problem_df.merge(deep_filtered_df, on='MAGIC_KEY', how='left')
prediction_df = prediction_df.drop(columns=['PURCHASE'])

In [86]:
prediction_df["YEAR"] = 2019
prediction_df["MONTH"] = 3
prediction_df["DAY"] = 1

In [87]:
analyze(prediction_df)

Number of rows: 58689
Number of columns: 9

                 NUNIQUE    DTYPE
MAGIC_KEY          58689   object
EN_MAGIC_KEY       47899  float64
YEAR                   1    int64
MONTH                  1    int64
DAY                    1    int64
BOX_COUNT              3  float64
DELIVERY_OPTION        3  float64
QUALITY                2  float64
UNIT_PRICE            14  float64


In [88]:
predicted_purchases = model.predict(prediction_df.drop(columns=["MAGIC_KEY"]))

submission_df = pd.DataFrame({
    'MAGIC_KEY': prediction_df["MAGIC_KEY"],
    'PURCHASE': ['Y' if p else 'N' for p in predicted_purchases]
})

In [89]:
submission_df

Unnamed: 0,MAGIC_KEY,PURCHASE
0,28D5BB06356,N
1,293BEAB4E98,Y
2,2962EE8065C,N
3,2957BE29EA9,Y
4,28E351A0745,N
...,...,...
58684,28FB7C09776,Y
58685,28E0E3B69BF,Y
58686,28D343103A7,Y
58687,290B1D6D5CB,Y
