# PROBLEM 3
“problem 3.csv” contains the Magic Keys of customers who purchased at least one box of milk and/or meat in the first 15 days of March-2019. You need to predict what quantity of meat was purchased by them in this period.

In [40]:
import pandas as pd
import numpy as np

## Preprocessing

In [41]:
# Read the three input CSV files; each name is bound to its own table,
# in the same order as the file names below.
purchase_df, boxes_df, problem3_df = (
    pd.read_csv(csv_name)
    for csv_name in ("purchase.csv", "boxes.csv", "problem 3.csv")
)

In [42]:
# Utility Function

def analyze(df):
    """Print a quick structural summary of a DataFrame.

    Prints the number of rows, the number of columns, a blank line, and
    then a table with one row per column of ``df`` showing the value of
    ``df.nunique()`` (NUNIQUE) and ``df.dtypes`` (DTYPE) for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    row_count, column_count = df.shape
    print(f"Number of rows: {row_count}")
    print(f"Number of columns: {column_count}")

    unique_counts = df.nunique()
    column_dtypes = df.dtypes
    summary = pd.DataFrame({"NUNIQUE": unique_counts, "DTYPE": column_dtypes})

    print()
    print(summary)

In [43]:
# Join boxes_df onto purchase_df by BOX_ID. A left join keeps every row of
# purchase_df, even when its BOX_ID has no match in boxes_df.
merged_df = pd.merge(purchase_df, boxes_df, how="left", on="BOX_ID")
merged_df.head(10)

Unnamed: 0,PURCHASE_DATE,MAGIC_KEY,BOX_ID,BOX_COUNT,QUALITY,DELIVERY_OPTION,MILK,MEAT,UNIT_PRICE
0,1/2/2019,2CED678A247,12.0,1.0,Premium,Home Delivery - CoD,8.0,1.5,12.98
1,1/2/2019,2BF58D91BA1,12.0,1.0,Premium,Home Delivery - CoD,8.0,1.5,12.98
2,1/2/2019,2C15B86534E,99.0,1.0,Premium,Delivery from Collection Point,0.0,3.3,13.96
3,1/2/2019,2C32D9A859A,6.0,1.0,Premium,Home Delivery - CoD,0.0,2.7,11.96
4,1/2/2019,2C7A55404D1,4.0,1.0,Premium,Home Delivery - CoD,0.0,2.5,11.96
5,1/2/2019,29D969045C2,238.0,1.0,Standard,Delivery from Collection Point,10.7,0.0,12.78
6,1/2/2019,28E5EA49074,227.0,1.0,Standard,Delivery from Collection Point,8.1,0.0,9.96
7,1/2/2019,2CEFA3A8659,6.0,1.0,Premium,Home Delivery - CoD,0.0,2.7,11.96
8,1/2/2019,2A00DE30F46,204.0,1.0,Standard,Home Delivery - CoD,10.7,0.0,12.78
9,1/2/2019,291C04B5CBF,231.0,1.0,Standard,Delivery from Collection Point,8.5,0.0,10.14


In [44]:
# Parse PURCHASE_DATE as day/month/year, then expose the calendar parts as
# separate DAY / MONTH / YEAR columns (appended in that order).
merged_df = merged_df.assign(
    PURCHASE_DATE=lambda frame: pd.to_datetime(frame["PURCHASE_DATE"], format="%d/%m/%Y"),
    DAY=lambda frame: frame["PURCHASE_DATE"].dt.day,
    MONTH=lambda frame: frame["PURCHASE_DATE"].dt.month,
    YEAR=lambda frame: frame["PURCHASE_DATE"].dt.year,
)

In [45]:
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. A single shared encoder is re-fitted on every
# fit_transform call, so only the last column's classes_ would survive and
# the QUALITY / DELIVERY_OPTION codes could not be mapped back to their
# original labels with inverse_transform.
quality_encoder = LabelEncoder()
delivery_option_encoder = LabelEncoder()
magic_key_encoder = LabelEncoder()

# NOTE(review): missing QUALITY / DELIVERY_OPTION values appear to be encoded
# as their own class rather than staying NaN — confirm this is intended.
merged_df["QUALITY"] = quality_encoder.fit_transform(merged_df["QUALITY"])
merged_df["DELIVERY_OPTION"] = delivery_option_encoder.fit_transform(merged_df["DELIVERY_OPTION"])
merged_df["ALT_MAGIC_KEY"] = magic_key_encoder.fit_transform(merged_df["MAGIC_KEY"])

# Backward-compatible alias: `le` used to end this cell fitted on MAGIC_KEY.
le = magic_key_encoder

In [46]:
# Keep only the listed columns, in this order. reindex(columns=...) already
# leaves out PURCHASE_DATE, so no separate in-place drop is needed; that drop
# raised KeyError whenever this cell was run a second time.
merged_df = merged_df.reindex(columns=["MAGIC_KEY", "ALT_MAGIC_KEY",
                                       "YEAR", "MONTH", "DAY",
                                       "MILK", "MEAT",
                                       "BOX_COUNT", "UNIT_PRICE",
                                       "QUALITY", "DELIVERY_OPTION",
                                       "BOX_ID"])

In [47]:
# Print row/column counts plus per-column unique counts and dtypes.
analyze(merged_df)

Number of rows: 2455864
Number of columns: 12

                 NUNIQUE    DTYPE
MAGIC_KEY        1274108   object
ALT_MAGIC_KEY    1274108    int32
YEAR                   2    int32
MONTH                  5    int32
DAY                   31    int32
MILK                  28  float64
MEAT                  39  float64
BOX_COUNT             14  float64
UNIT_PRICE            18  float64
QUALITY                3    int32
DELIVERY_OPTION        4    int32
BOX_ID               291  float64


In [48]:
# Display merged_df for a visual check of its columns and values.
merged_df

Unnamed: 0,MAGIC_KEY,ALT_MAGIC_KEY,YEAR,MONTH,DAY,MILK,MEAT,BOX_COUNT,UNIT_PRICE,QUALITY,DELIVERY_OPTION,BOX_ID
0,2CED678A247,1222234,2019,2,1,8.0,1.5,1.0,12.98,0,1,12.0
1,2BF58D91BA1,526391,2019,2,1,8.0,1.5,1.0,12.98,0,1,12.0
2,2C15B86534E,624102,2019,2,1,0.0,3.3,1.0,13.96,0,0,99.0
3,2C32D9A859A,708928,2019,2,1,0.0,2.7,1.0,11.96,0,1,6.0
4,2C7A55404D1,870797,2019,2,1,0.0,2.5,1.0,11.96,0,1,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2455859,2BD992B5538,425064,2018,10,28,8.0,1.5,1.0,12.98,0,1,12.0
2455860,2C97CD72233,950422,2018,10,28,10.0,1.8,1.0,12.98,0,1,17.0
2455861,2C91C61D372,933236,2018,10,28,12.0,1.8,1.0,19.98,0,1,40.0
2455862,2CD70CFC4E3,1145776,2018,10,28,18.0,2.9,1.0,23.98,0,1,51.0


In [49]:
# Count missing values per column.
merged_df.isna().sum()

MAGIC_KEY           0
ALT_MAGIC_KEY       0
YEAR                0
MONTH               0
DAY                 0
MILK               64
MEAT               64
BOX_COUNT          47
UNIT_PRICE         64
QUALITY             0
DELIVERY_OPTION     0
BOX_ID             47
dtype: int64

In [50]:
# Replace missing values with 0, in place, in these five columns only;
# every other column is left untouched.
merged_df.fillna(
    dict.fromkeys(["MILK", "MEAT", "BOX_COUNT", "UNIT_PRICE", "BOX_ID"], 0),
    inplace=True,
)

In [51]:
# Re-count missing values per column to verify none remain.
merged_df.isna().sum()

MAGIC_KEY          0
ALT_MAGIC_KEY      0
YEAR               0
MONTH              0
DAY                0
MILK               0
MEAT               0
BOX_COUNT          0
UNIT_PRICE         0
QUALITY            0
DELIVERY_OPTION    0
BOX_ID             0
dtype: int64

In [52]:
# Print the structural summary again to see how the unique counts changed.
analyze(merged_df)

Number of rows: 2455864
Number of columns: 12

                 NUNIQUE    DTYPE
MAGIC_KEY        1274108   object
ALT_MAGIC_KEY    1274108    int32
YEAR                   2    int32
MONTH                  5    int32
DAY                   31    int32
MILK                  28  float64
MEAT                  39  float64
BOX_COUNT             15  float64
UNIT_PRICE            19  float64
QUALITY                3    int32
DELIVERY_OPTION        4    int32
BOX_ID               292  float64


In [53]:
# Frequency of each BOX_COUNT value, most common first.
merged_df["BOX_COUNT"].value_counts()

BOX_COUNT
 1.0     2453363
 2.0        2283
 3.0          85
 0.0          47
 4.0          30
-1.0          17
 6.0          17
 5.0          10
 7.0           3
 9.0           2
 19.0          2
 11.0          2
 8.0           1
 10.0          1
 13.0          1
Name: count, dtype: int64

In [55]:
# Restrict the rows to customers listed in problem3_df (inner join on
# MAGIC_KEY), then show the rows among them whose BOX_COUNT is 0.
merged_problem3 = pd.merge(merged_df, problem3_df, how="inner", on="MAGIC_KEY")
merged_problem3[merged_problem3["BOX_COUNT"].eq(0)]

Unnamed: 0,MAGIC_KEY,ALT_MAGIC_KEY,YEAR,MONTH,DAY,MILK,MEAT,BOX_COUNT,UNIT_PRICE,QUALITY,DELIVERY_OPTION,BOX_ID
2,290D33249B7,183987,2019,2,1,0.0,0.0,0.0,0.0,2,3,0.0
3306,290D33249B7,183987,2019,2,20,0.0,0.0,0.0,0.0,2,3,0.0
4429,290D33249B7,183987,2019,2,25,0.0,0.0,0.0,0.0,2,3,0.0
8690,290D33249B7,183987,2018,12,3,0.0,0.0,0.0,0.0,2,3,0.0


In [37]:
# Keep only rows with a positive box count. The previous `< 0.0` comparison
# was inverted: it kept just the negative-count rows and discarded every
# actual purchase. Rows with BOX_COUNT == 0 or below are dropped here
# (negative counts are presumably returns/corrections — TODO confirm).
merged_df = merged_df[merged_df["BOX_COUNT"] > 0.0]

In [38]:
# True only if every MAGIC_KEY in problem3_df also appears in merged_df.
problem3_df["MAGIC_KEY"].isin(merged_df["MAGIC_KEY"]).all()

False

In [47]:
# Count missing values per column.
merged_df.isna().sum()

MAGIC_KEY          0
ALT_MAGIC_KEY      0
YEAR               0
MONTH              0
DAY                0
MILK               0
MEAT               0
BOX_COUNT          0
UNIT_PRICE         0
QUALITY            0
DELIVERY_OPTION    0
BOX_ID             0
dtype: int64

In [48]:
# Keep rows with at least one box, then keep the first row per MAGIC_KEY.
# The de-duplication must start from the filtered frame; starting from
# merged_df again (as before) silently discarded the BOX_COUNT filter.
filtered_df = merged_df[merged_df["BOX_COUNT"] >= 1]
filtered_df = filtered_df.drop_duplicates(subset=["MAGIC_KEY"])

In [33]:
# True only if every MAGIC_KEY in problem3_df also appears in filtered_df.
problem3_df["MAGIC_KEY"].isin(filtered_df["MAGIC_KEY"]).all()

False