In [1]:
import pandas as pd

# Load the three raw tables: article metadata, customer metadata and the
# transaction log. Only the transaction date column needs datetime parsing.
articles_df, customers_df = (
    pd.read_csv(csv_path) for csv_path in ("articles.csv", "customers.csv")
)
trans_df = pd.read_csv("transactions_train.csv", parse_dates=["t_dat"])

In [2]:
# Preview the first three article rows to see which columns are available.
articles_df.head(n=3)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [3]:
# Preview the first five customer rows; note the missing values in FN / Active.
customers_df.head(n=5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [4]:
# Preview the first five transaction rows (one row per purchased article).
trans_df.head(n=5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [5]:
# Report the number of rows in the transaction table, formatted with
# thousands separators.
print(f"There are {len(trans_df):,} transactions in total.")

There are 31,788,324 transactions in total.


We can see that we have a large dataset. For the sake of the tutorial, we will use a small subset of it, which we generate by sampling 100,000 customers and keeping only their transactions.

In [6]:
# Sample users.

N_USERS = 100_000

# Fixed seed so the same customers are drawn on every run.
customer_subset_df = customers_df.sample(N_USERS, random_state=27)

# In the customers preview, FN and Active are either 1.0 or missing. A blanket
# dropna() would therefore discard every transaction of a customer without these
# flags, so missing flags are encoded as 0.0 before incomplete rows are dropped.
# NOTE(review): assumes a missing FN/Active means "flag not set" -- confirm
# against the dataset description.
customer_features_df = customer_subset_df.fillna({"FN": 0.0, "Active": 0.0})

# Inner joins keep only transactions of the sampled customers and attach the
# article attributes. Built as one chain (no inplace mutation) so the cell can
# be re-run safely.
df = (
    trans_df
    .merge(customer_features_df, on="customer_id")
    .merge(articles_df, on="article_id")
    .dropna()  # E.g. age has null values.
    .rename(columns={"customer_id": "user_id", "article_id": "item_id"})
)

We also want to encode the time of year at which the purchase was made, as there should be strong seasonality associated with purchases. We'll use the month, but encode it using sine and cosine, since it is cyclical (December is adjacent to January).

In [7]:
import numpy as np

# Encode month using sin and cos, i.e. as a point on the unit circle, so that
# December and January end up next to each other.
MONTHS_PER_YEAR = 12
# Vectorised datetime accessor instead of a per-row Python lambda.
month = df["t_dat"].dt.month - 1  # Map month to range [0,11].
C = 2 * np.pi / MONTHS_PER_YEAR  # Radians per month.
df["month_sin"] = np.sin(month * C)
df["month_cos"] = np.cos(month * C)

Here we do a simple random split. In a real-life setting, a chronological split would make more sense, since we would be concerned with what the customer will buy in the future.

In [8]:
from sklearn.model_selection import train_test_split

# Random 70/30 row-level split; the fixed seed keeps the partition reproducible.
VAL_FRACTION = 0.3
SPLIT_SEED = 27
train_df, val_df = train_test_split(
    df,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)

In [9]:
# Persist both splits as CSV files, leaving out the DataFrame index.
for split_df, csv_name in ((train_df, "train_df.csv"), (val_df, "val_df.csv")):
    split_df.to_csv(csv_name, index=False)