Importing

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics, model_selection

In [2]:
# Colab-only: mount Google Drive at /content/drive so the CSV paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


We will predict which of the products a customer purchased in previous orders will be in their next order.

In [3]:
# Load only the three order columns that the rest of the notebook uses.
orders_df = pd.read_csv(
    "/content/drive/My Drive/Grocery/orders.csv",
    usecols=["order_id", "user_id", "order_number"],
)
orders_df.head()

Unnamed: 0,order_id,user_id,order_number
0,2539329,1,1
1,2398795,1,2
2,473747,1,3
3,2254736,1,4
4,431534,1,5


We will get the list of all products purchased by the customer

In [4]:
# Line items of the prior orders (order_id, product_id and reordered are used below).
order_products__prior_df = pd.read_csv(
    "/content/drive/My Drive/Grocery/order_products__prior.csv"
)

Let's merge prior with orders and get the user_id

In [5]:
# Attach user_id and order_number to every prior line item through its order_id.
order_products__prior_df = order_products__prior_df.merge(
    orders_df, how="inner", on="order_id"
)

In [6]:
# Highest order_number per user among the prior orders; used below to pick
# each user's latest prior order.
prior_groupd_df = (
    order_products__prior_df
    .groupby("user_id")["order_number"]
    .max()
    .reset_index()
)

In [7]:
# Keep only the line items that belong to each user's highest-numbered prior order.
prior_df_new = order_products__prior_df.merge(
    prior_groupd_df, how="inner", on=["user_id", "order_number"]
)

In [8]:
# Only the user, the product and its reorder flag are needed from the latest order.
prior_df_new = prior_df_new.loc[:, ["user_id", "product_id", "reordered"]]

In [9]:
# Rename the flag so it stays distinguishable from the aggregated reorder
# statistics it is merged with later.
prior_df_new = prior_df_new.rename(columns={"reordered": "reordered_latest"})
prior_df_new.head()

Unnamed: 0,user_id,product_id,reordered_latest
0,59897,9755,1
1,59897,31487,0
2,59897,37510,1
3,59897,14576,1
4,59897,22105,0


Let's get, for each customer and product, the total purchase count and the number of re-orders.

In [10]:
# Per (user, product) pair: "count" is the number of prior line items and
# "sum" is how many of them carry reordered == 1.
order_products__prior_df = (
    order_products__prior_df
    .groupby(["user_id", "product_id"])["reordered"]
    .agg(["count", "sum"])
    .reset_index()
)

In [11]:
# Give the two aggregate columns feature-style names; the key columns keep theirs.
order_products__prior_df = order_products__prior_df.rename(
    columns={"count": "reordered_cnt", "sum": "reordered_sum"}
)

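Merge the per-customer product aggregates with the reorder flag from each customer's latest prior order (left join, so products absent from the latest order get NaN).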

In [12]:
# Left join keeps every (user, product) pair; pairs that are not part of the
# user's latest prior order end up with NaN in reordered_latest.
order_products__prior_df = order_products__prior_df.merge(
    prior_df_new, how="left", on=["user_id", "product_id"]
)
order_products__prior_df.head()

Unnamed: 0,user_id,product_id,reordered_cnt,reordered_sum,reordered_latest
0,1,196,10,9,1.0
1,1,10258,9,8,1.0
2,1,10326,1,0,
3,1,12427,10,9,1.0
4,1,13032,3,2,1.0


Let's now read the train and sample submission data and merge them with the order data to get the user_id for each order_id.

In [13]:
# order_number is no longer needed; from here on orders_df is only joined on order_id.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError for the missing column.
orders_df = orders_df.drop(columns=["order_number"], errors="ignore")

In [14]:
# Only the order ids of the train orders are needed at this stage.
order_products__train_df = pd.read_csv(
    "/content/drive/My Drive/Grocery/order_products__train.csv",
    usecols=["order_id"],
)

In [15]:
# The frame holds only order_id here, so grouping on it leaves no value columns to
# count; after reset_index() what remains is a single order_id column with one row
# per distinct train order.
order_products__train_df=order_products__train_df.groupby('order_id').agg("count").reset_index()

In [16]:
# The sample submission supplies the order ids that need a prediction.
sample_submission_df = pd.read_csv(
    "/content/drive/My Drive/Grocery/sample_submission.csv",
    usecols=["order_id"],
)

In [17]:
# Look up the user behind every train order.
order_products__train_df = order_products__train_df.merge(
    orders_df, how="inner", on="order_id"
)

In [18]:
# Look up the user behind every order listed in the sample submission.
sample_submission_df = sample_submission_df.merge(
    orders_df, how="inner", on="order_id"
)

In [19]:
# Alias only (no copy is made): test_df and sample_submission_df are the same DataFrame.
test_df=sample_submission_df

In [20]:
# Sanity check: (rows, columns) of the train and test order tables.
print(order_products__train_df.shape, test_df.shape)

(131209, 2) (75000, 2)


Now we will merge the train and test (sample submission) data with the prior data to get each customer's previous product purchase history.

In [21]:
# Expand every train/test order into one row per product its user bought in prior orders.
order_products__train_df = order_products__train_df.merge(
    order_products__prior_df, how="inner", on="user_id"
)
test_df = test_df.merge(order_products__prior_df, how="inner", on="user_id")
# These intermediate frames are not referenced again below; drop the names so
# their memory can be reclaimed.
del order_products__prior_df, prior_groupd_df, prior_df_new
print(order_products__train_df.shape, test_df.shape)

(8474661, 6) (4833292, 6)


We will now merge train and test with the products data to add each product's aisle and department ids.

In [22]:
# Add the aisle and department id of every product as extra features.
products_df = pd.read_csv(
    "/content/drive/My Drive/Grocery/products.csv",
    usecols=["product_id", "aisle_id", "department_id"],
)
order_products__train_df = order_products__train_df.merge(
    products_df, how="inner", on="product_id"
)
test_df = test_df.merge(products_df, how="inner", on="product_id")
del products_df
print(order_products__train_df.shape, test_df.shape)

(8474661, 8) (4833292, 8)


So, we have all products purchased by each customer with the necessary features. Now we will use the train data to find whether each product has been re-ordered in the next order.

In [23]:
# Read the train line items again, this time with the product and its reorder flag.
order_products__train_y_df = pd.read_csv(
    "/content/drive/My Drive/Grocery/order_products__train.csv",
    usecols=["order_id", "product_id", "reordered"],
)

In [24]:
# Attach the user_id of each train order so labels can be joined on (user, product).
order_products__train_y_df = order_products__train_y_df.merge(
    orders_df, how="inner", on="order_id"
)

In [25]:
# Keep just the join keys and the label column.
order_products__train_y_df = order_products__train_y_df.loc[
    :, ["user_id", "product_id", "reordered"]
]

In [26]:
# Left join the next-order labels onto every candidate (user, product) row.
order_products__train_df = pd.merge(
    order_products__train_df,
    order_products__train_y_df,
    how="left",
    on=["user_id", "product_id"],
)
# Candidates that do not appear in the train order have no label after the left
# join; treat them as "not reordered".
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call acts on a temporary object under
# pandas Copy-on-Write and would leave the NaN labels in the frame.
order_products__train_df["reordered"] = order_products__train_df["reordered"].fillna(0)

In [27]:
# Sanity check: the left join must not change the row count, only add the label column.
print(order_products__train_df.shape)

(8474661, 9)


In [28]:
# The label frame has been merged into order_products__train_df; drop the name.
# NOTE: the same name is reused in the next code cell for the label array.
del order_products__train_y_df

Target variable for train set

In [29]:
# Pull the label column out as a plain array before it is dropped from the features.
order_products__train_y_df = order_products__train_df["reordered"].values

In [30]:
# Shorter alias for the label array passed to the model.
train_y=order_products__train_y_df

df for test-set prediction

In [31]:
# Keep the (order_id, product_id) key of every test row, in the same row order as
# the feature matrix built from test_df, so predictions can be attached by position.
# The explicit copy makes this frame independent of test_df; assigning a new column
# into a bare column subset would otherwise raise pandas' SettingWithCopyWarning.
test_set_df = test_df[["order_id", "product_id"]].copy()

Now we will drop the unnecessary columns.

In [32]:
# Remove the identifier columns (and, for train, the label) and turn both
# frames into plain NumPy feature matrices.
order_products__train_df = np.array(
    order_products__train_df.drop(columns=["order_id", "user_id", "reordered"])
)
test_df = np.array(test_df.drop(columns=["order_id", "user_id"]))
print(order_products__train_df.shape, test_df.shape)

(8474661, 6) (4833292, 6)


In [33]:
# Alias only: train_df and order_products__train_df name the same NumPy array.
train_df=order_products__train_df

Define function to run XGBoost Model

In [34]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0):
    """Train a binary-logistic XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X : array-like
        Training feature matrix.
    train_y : array-like
        Training labels.
    test_X : array-like
        Feature matrix to predict on.
    test_y : array-like, optional
        Labels for ``test_X``. When given, train/test log-loss is printed every
        10 rounds and early stopping with a patience of 50 rounds is enabled.
    feature_names : list of str, optional
        Column names attached to the DMatrix objects. ``None`` keeps XGBoost's
        default naming.
    seed_val : int
        Random seed passed to XGBoost.

    Returns
    -------
    numpy.ndarray
        Output of ``model.predict`` for ``test_X``.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": 0.05,
        "subsample": 0.7,
        "min_child_weight": 10,
        "colsample_bytree": 0.7,
        "max_depth": 8,
        # `silent` is deprecated and not recognised by current XGBoost releases;
        # `verbosity=0` is the documented way to silence the library.
        "verbosity": 0,
        "seed": seed_val,
    }
    num_rounds = 100

    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        # The last watchlist entry ("test") is the one monitored for early stopping.
        watchlist = [(xgtrain, "train"), (xgtest, "test")]
        model = xgb.train(
            params,
            xgtrain,
            num_rounds,
            watchlist,
            early_stopping_rounds=50,
            verbose_eval=10,
        )
    else:
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(params, xgtrain, num_rounds)

    return model.predict(xgtest)

Let us run the XGB model. We apply a cut-off value to the predicted probabilities to get the final predictions.


In [35]:
pred = runXGB(train_df, train_y, test_df)
del train_df, test_df
# Probability at or above which a product is predicted to be in the next order.
cutoff = 0.2
# Binarise in one step (1 where pred >= cutoff, else 0), keeping the array dtype.
pred = (pred >= cutoff).astype(pred.dtype)
# assign() returns a new frame, so no column is written into a subset of another
# DataFrame (which is what triggers pandas' SettingWithCopyWarning).
test_set_df = test_set_df.assign(Pred=pred)
# Keep only the (order, product) rows predicted to be reordered.
test_set_df = test_set_df.loc[test_set_df["Pred"].astype("int") == 1]

Now we will merge the predicted products of each order into a single space-separated string.

In [36]:
def merge_products(x):
    """Return the values of ``x`` joined into one space-separated string."""
    as_text = x.astype('str')
    return " ".join(as_text.tolist())
# One row per order_id, with its predicted product ids joined into a single string.
test_set_df = (
    test_set_df
    .groupby("order_id")["product_id"]
    .agg(merge_products)
    .reset_index()
    .rename(columns={"product_id": "products"})
)

In [37]:
# Start from the full list of order ids in the sample submission; the left merge
# keeps orders for which no product was predicted.
sbmt_df = pd.read_csv(
    "/content/drive/My Drive/Grocery/sample_submission.csv",
    usecols=["order_id"],
)
sbmt_df = sbmt_df.merge(test_set_df, how="left", on="order_id")

When an order has no predictions, fill it with "None".

In [38]:
# Orders without any predicted product are NaN after the left merge; write the
# literal string "None" for them.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call acts on a temporary object under
# pandas Copy-on-Write and would leave the NaNs in the submission.
sbmt_df["products"] = sbmt_df["products"].fillna("None")

In [39]:
# Preview the submission: one row per order_id with its space-separated product ids.
sbmt_df.head()

Unnamed: 0,order_id,products
0,17,13107 21463
1,34,47766 2596 13176 16083 39180 39475 44632 44663...
2,137,5134 2326 23794 24852 25890 29594 41787
3,182,21903 39275 9337 27104 5479 13629 32109 33000 ...
4,257,39475 24852 27104 49235 27966 29837 30233 3573...


In [40]:
# Write the submission to the current working directory; index=False keeps the
# row index out of the file so it contains only order_id and products.
sbmt_df.to_csv("xgb_submission_file.csv", index=False)