# Preprocess data

## Initialize

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
data_path = '../Data/Raw/'


## Load data

In [3]:
articles_df = pd.read_csv(data_path+'articles.csv')
customer_df = pd.read_csv(data_path+'customers.csv')
transactions_df = pd.read_csv(data_path+'transactions_train.csv')

## Explore data

In [4]:
# Preprocess customer dataframe
# check for NaN and make a subset with relevant columns
percent_c = (customer_df.isnull().sum()/customer_df.isnull().count()*100).sort_values(ascending = False)
df_c = customer_df[['customer_id','age', 'club_member_status']]
df_c.dropna(subset=['age'])



Unnamed: 0,customer_id,age,club_member_status
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,49.0,ACTIVE
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,25.0,ACTIVE
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,24.0,ACTIVE
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,54.0,ACTIVE
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,52.0,ACTIVE
...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,24.0,ACTIVE
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,21.0,ACTIVE
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,21.0,ACTIVE
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,18.0,ACTIVE


In [5]:
# Preprocess article dataframe
# subset relevant columns
df_a = articles_df[['article_id','product_type_no', 'prod_name', 'product_type_name', 'product_group_name', 'colour_group_name', 'department_name', 'section_name']]
# check for NaN
percent_a = (df_a.isnull().sum()/df_a.isnull().count()*100).sort_values(ascending = False)

## Preprocess Data

In [6]:
# Preprocess transaction train dataframe
#datetime and create a month column
transactions_df.t_dat = pd.to_datetime(transactions_df.t_dat)
transactions_df['month'] =  pd.DatetimeIndex(transactions_df['t_dat']).month
transactions_df['year'] =  pd.DatetimeIndex(transactions_df['t_dat']).year
transactions_df['day'] =  pd.DatetimeIndex(transactions_df['t_dat']).day



transactions_df.loc[(transactions_df['month']>= 1) & (transactions_df['month'] <=2), 'season'] = 'Winter'
transactions_df.loc[(transactions_df['month'] == 12), 'season'] = 'Winter' 
transactions_df.loc[(transactions_df['month'] >= 3) & (transactions_df['month'] <=5), 'season'] = 'Spring' 
transactions_df.loc[(transactions_df['month'] >= 6) & (transactions_df['month'] <=8),'season'] = 'Summer' 
transactions_df.loc[(transactions_df['month'] >= 9) & (transactions_df['month'] <=11), 'season'] = 'Autumn' 

In [7]:
transactions_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,month,year,day,season
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,9,2018,20,Autumn
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,9,2018,20,Autumn
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,9,2018,20,Autumn
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,9,2018,20,Autumn
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,9,2018,20,Autumn


In [8]:
df_a.head()

Unnamed: 0,article_id,product_type_no,prod_name,product_type_name,product_group_name,colour_group_name,department_name,section_name
0,108775015,253,Strap top,Vest top,Garment Upper body,Black,Jersey Basic,Womens Everyday Basics
1,108775044,253,Strap top,Vest top,Garment Upper body,White,Jersey Basic,Womens Everyday Basics
2,108775051,253,Strap top (1),Vest top,Garment Upper body,Off White,Jersey Basic,Womens Everyday Basics
3,110065001,306,OP T-shirt (Idro),Bra,Underwear,Black,Clean Lingerie,Womens Lingerie
4,110065002,306,OP T-shirt (Idro),Bra,Underwear,White,Clean Lingerie,Womens Lingerie


In [9]:
X = transactions_df.iloc[0:20000].merge(df_a, how = "left", on = "article_id")

In [10]:
X.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,month,year,day,season,product_type_no,prod_name,product_type_name,product_group_name,colour_group_name,department_name,section_name
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,9,2018,20,Autumn,283,Atlanta Push Body Harlow,Underwear body,Underwear,Black,Expressive Lingerie,Womens Lingerie
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,9,2018,20,Autumn,306,Rae Push (Melbourne) 2p,Bra,Underwear,Light Pink,Casual Lingerie,Womens Lingerie
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,9,2018,20,Autumn,252,Inca Jumper,Sweater,Garment Upper body,Pink,Tops Knitwear DS,Divided Selected
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,9,2018,20,Autumn,252,W YODA KNIT OL OFFER,Sweater,Garment Upper body,Pink,Campaigns,Womens Everyday Collection
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,9,2018,20,Autumn,252,W YODA KNIT OL OFFER,Sweater,Garment Upper body,Dark Green,Campaigns,Womens Everyday Collection


In [11]:
X['product_type_name'].nunique()

82

In [12]:
X['product_group_name'].nunique()

13

In [13]:
X['colour_group_name'].nunique()

50

In [14]:
X['department_name'].nunique()

209

In [15]:
X['section_name'].nunique()

54

In [16]:
X = X.drop(['t_dat','product_type_no','prod_name','colour_group_name','department_name'], axis=1)

In [17]:
X.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,month,year,day,season,product_type_name,product_group_name,section_name
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,9,2018,20,Autumn,Underwear body,Underwear,Womens Lingerie
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,9,2018,20,Autumn,Bra,Underwear,Womens Lingerie
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,9,2018,20,Autumn,Sweater,Garment Upper body,Divided Selected
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,9,2018,20,Autumn,Sweater,Garment Upper body,Womens Everyday Collection
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,9,2018,20,Autumn,Sweater,Garment Upper body,Womens Everyday Collection


In [18]:
X = pd.get_dummies(X, columns = ['season','product_type_name','product_group_name','section_name'])

In [19]:
X.shape

(20000, 157)

In [20]:
X = X.merge(df_c[['customer_id','age']], how = 'left', on = 'customer_id')

In [21]:
X['age']

0        24.0
1        24.0
2        32.0
3        32.0
4        32.0
         ... 
19995    23.0
19996    23.0
19997    23.0
19998    52.0
19999    52.0
Name: age, Length: 20000, dtype: float64

In [22]:
splitrange = round(0.75*len(transactions_df['customer_id']))
splitrange2 = round(0.95*len(transactions_df['customer_id']))

train = X.iloc[:splitrange]
valid = X.iloc[splitrange+1:splitrange2]
test = X.iloc[splitrange2:]

## Now we need to define customer og articles so i match with the features:

In [25]:
#df_a_sub = df_a.drop([])
articles_sub = articles_df[['article_id']].values.flatten()
customers_sub = customer_df[['customer_id']].values.flatten()

In [26]:
class SimpleRecommender(tf.keras.Model):
    def __init__(self, customers_sub, articles_sub, embedding_dim):
        super(SimpleRecommender, self).__init__()
        self.articles = tf.constant(articles_sub, dtype=tf.int32)
        self.customers = tf.constant(customers_sub, dtype=tf.string)

        self.article_table = tf.lookup.StaticHashTable(tf.lookup.KeyValueTensorInitializer(self.articles, range(len(articles_sub))),-1)
        self.customer_table = tf.lookup.StaticHashTable(tf.lookup.KeyValueTensorInitializer(self.customers, range(len(customers_sub))),-1)

        self.customer_embed = tf.keras.layers.Embedding(len(customers_sub),embedding_dim)

        self.articles_embed = tf.keras.layers.Embedding(len(articles_sub),embedding_dim)

        self.dot = tf.keras.layers.Dot(axes=-1)

    def call(self, inputs):
        user = inputs[0]
        article = inputs[1]

        customer_embedding_index = self.customer_table.lookup(user)
        article_embedding_index = self.article_table.lookup(article)

        customer_embbeding_values = self.customer_embed(customer_embedding_index)
        article_embedding_values = self.articles_embed(article_embedding_index)

        return tf.squeeze(self.dot([customer_embbeding_values, article_embedding_values]),1)
    
    def call_item_item(self, article):
        article_x = self.article_table.lookup(article)
        article_embeddings = tf.expand_dims(self.articles_embed(article_x),0)
        all_articles_embeddings = tf.expand_dims(self.articles_embed.embeddings,0)
        scores = tf.reshape(self.dot([article_embeddings, all_articles_embeddings]), [-1])

        top_scores, top_indeces = tf.math.top_k(scores, k = 100)
        top_ids = tf.gather(self.articles, top_indeces)
        return top_ids, top_scores
    def Customer_recommendation(self, customer, k):
        customer_x = self.customer_table.lookup(customer)
        customer_embeddings = tf.expand_dims(self.customer_embed(customer_x),0)
        all_articles_embeddings = tf.expand_dims(self.articles_embed.embeddings,0)
        scores = tf.reshape(self.dot([customer_embeddings, all_articles_embeddings]), [-1])

        top_scores, top_indeces = tf.math.top_k(scores, k = k)
        top_ids = tf.gather(self.articles, top_indeces)
        return top_ids, top_scores

In [27]:
class Mapper():
    def __init__(self, possible_articles, num_negative_articles):
        self.num_possible_articles = len(possible_articles)
        self.possible_articles_tensor = tf.constant(possible_articles, dtype=tf.int32)
        self.num_negative_articles = num_negative_articles
        self.y = tf.one_hot(0, num_negative_articles+1)
    def __call__(self, customer, article):
        random_negatives_indexes  = tf.random.uniform((self.num_negative_articles,),minval = 0, maxval = self.num_possible_articles , dtype = tf.int32) 
        negative_products =  tf.gather(self.possible_articles_tensor, random_negatives_indexes)
        candidates = tf.concat([article, negative_products], axis = 0)
        return (customer, candidates), self.y


In [29]:
def get_dataset(df, articles, number_negative_articles):
    dummy_customer_tensor = tf.constant(df[['customer_id']].values, dtype =tf.string)
    article_tensor = tf.constant(df[['article_id']].values,dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices((dummy_customer_tensor,article_tensor))
    dataset = dataset.map(Mapper(articles, number_negative_articles)) 
    dataset = dataset.batch(1024)
    return dataset 

for (customer, candidate), y in get_dataset(train,articles_sub,4):
    print(customer)
    print(candidate)
    print(y)
    break


tf.Tensor(
[[b'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318']
 [b'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318']
 [b'00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2']
 ...
 [b'05d0c63e8a3ff46f9519e38f1af70007d474650975ef40f002a4c060a99b530d']
 [b'05da4959e37b89f92911a98e4b5a94be73a77e6d64a19440b96b05c754da115b']
 [b'05da4959e37b89f92911a98e4b5a94be73a77e6d64a19440b96b05c754da115b']], shape=(1024, 1), dtype=string)
tf.Tensor(
[[663713001 594581039 667772006 696944015 564787008]
 [541518023 711158003 612789001 832830001 806137005]
 [505221004 648254002 291957009 686720005 842056002]
 ...
 [474138001 635392003 747138002 752554001 839310001]
 [625480002 557546005 711727010 648932002 855935005]
 [522398050 598655008 612077005 609655003 508935032]], shape=(1024, 5), dtype=int32)
tf.Tensor(
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]], shape=(1024, 5), dtype=float32)


In [32]:
model = SimpleRecommender(customers_sub, articles_sub, 15)
model.compile(loss= tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
            optimizer=tf.keras.optimizers.SGD(learning_rate = 100.), 
            metrics=[tf.keras.metrics.CategoricalAccuracy()])

model.fit(get_dataset(train, articles_sub, 100), validation_data = get_dataset(valid, articles_sub,100), epochs =5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1af00524130>

In [41]:
model.Customer_recommendation(customers_sub[10000],5)

AttributeError: 'str' object has no attribute 'dtype'