In [1]:
import numpy as np
import pandas as pd
import pickle
from tqdm.notebook import tqdm
tqdm.pandas()

from pathlib import Path

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

# import recmetrics

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
main_dir = "../input/h-and-m-personalized-fashion-recommendations"
images_dir = main_dir+"/images/" 
customers = pd.read_csv(main_dir+"/customers.csv")
articles = pd.read_csv(main_dir+"/articles.csv", dtype={'article_id': str})
sample_submission = pd.read_csv(main_dir+"/sample_submission.csv", dtype={'article_id': str})

train = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',  
                    dtype={'article_id': str}, parse_dates=['t_dat'],
                   low_memory=True)

In [3]:
train['article_id'].nunique(), train['customer_id'].nunique()

(104547, 1362281)

In [4]:
top_customers = train['customer_id'].value_counts()[train['customer_id'].value_counts()>200].index.tolist()
# top_customers = train['customer_id'].value_counts()[train['customer_id'].value_counts()>50].index.tolist()

len(top_customers)

11293

In [5]:
df = train[train['customer_id'].isin(top_customers)]
df.shape

(3248790, 5)

In [6]:
train.shape

(31788324, 5)

In [7]:
print(train.isna().sum()[train.isna().sum()>0],
customers.isna().sum()[customers.isna().sum()>0],
articles.isna().sum()[articles.isna().sum()>0], sep='\n')

Series([], dtype: int64)
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16009
age                        15861
dtype: int64
detail_desc    416
dtype: int64


In [8]:
customers.columns

Index(['customer_id', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')

In [9]:
customers['Active'].value_counts()

1.0    464404
Name: Active, dtype: int64

In [10]:
sample_submission

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0706016002 0372860001 0610776002 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0706016001 0706016002 0372860001 0610776002 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0706016002 0372860001 0610776002 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0706016002 0372860001 0610776002 07...


In [11]:
train.shape, customers.shape, articles.shape

((31788324, 5), (1371980, 7), (105542, 25))

In [12]:
train.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2


In [13]:
customers.head(2)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...


In [14]:
customers['postal_code'].value_counts()

2c29ae653a9282cce4151bd87643c907644e09541abc28ae87dea0d1f6603b1c    120303
cc4ed85e30f4977dae47662ddc468cd2eec11472de6fac5ec985080fd92243c8       261
714976379549eb90aae4a71bca6c7402cc646ae7c40f6c1cb91d4b5a18623fc1       159
7c1fa3b0ec1d37ce2c3f34f63bd792f3b4494f324b6be5d1e4ba6a75456b96a7       157
5b7eb31eabebd3277de632b82267286d847fd5d44287ee150bb4206b48439145       156
                                                                     ...  
16dd7e391f305b54c01ffed87c189e33de83158d0a808d1b346222703742f638         1
5ce1c699380b3c13655347c9636043f5e64ec2538a1ee7d42dd253a584577630         1
afdedc642d01efeefec6fc8d3394724a71888ebd0ff0311eff6f34946589479d         1
4c8838d278d23029c920b9684fa4cf30a532428a445afb300eeb288685cf00e1         1
0a1a03306fb2f62164c2a439b38c0caa64b40deaae868799ccb93ac69444b79d         1
Name: postal_code, Length: 352899, dtype: int64

In [15]:
articles.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [16]:
articles['product_code'].value_counts()

783707    75
684021    70
699923    52
699755    49
685604    46
          ..
761368     1
761366     1
761360     1
761350     1
959461     1
Name: product_code, Length: 47224, dtype: int64

In [17]:
articles['prod_name'].value_counts()

Dragonfly dress                98
Mike tee                       72
Wow printed tee 6.99           70
1pk Fun                        55
TP Paddington Sweater          54
                               ..
W MARCIE DRESS CNY              1
W NAPOLI SKIRT CNY              1
BEANIE JERSEY FLEECED LINED     1
H-string multicolour            1
Lounge dress                    1
Name: prod_name, Length: 45875, dtype: int64

In [18]:
articles.columns

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

In [19]:
articles.isna().sum()[articles.isna().sum()>0]

detail_desc    416
dtype: int64

In [20]:
articles['article_id'].nunique(), articles['product_code'].nunique()

(105542, 47224)

In [21]:
train['sales_channel_id'].value_counts()

2    22379862
1     9408462
Name: sales_channel_id, dtype: int64

In [22]:
customers['age'].fillna(value=-1, inplace=True)

In [23]:
df['price'].describe()

count    3.248790e+06
mean     3.006132e-02
std      2.080211e-02
min      1.525424e-04
25%      1.693220e-02
50%      2.540678e-02
75%      3.550847e-02
max      5.915254e-01
Name: price, dtype: float64

In [24]:
(df['price']*590).describe()

count    3.248790e+06
mean     1.773618e+01
std      1.227325e+01
min      9.000000e-02
25%      9.990000e+00
50%      1.499000e+01
75%      2.095000e+01
max      3.490000e+02
Name: price, dtype: float64

In [25]:
((df['price']*590).astype(int)).describe()

count    3.248790e+06
mean     1.686325e+01
std      1.225915e+01
min      0.000000e+00
25%      9.000000e+00
50%      1.400000e+01
75%      2.000000e+01
max      3.490000e+02
Name: price, dtype: float64

In [26]:
df.reset_index(drop=True, inplace=True)
df['price'] = (df['price']*590).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
class DataProc():
    def __init__(self, interaction_df, client_df, product_df, filter=None, test_size=0.1):
        self.interaction_df = interaction_df[interaction_df['customer_id'].isin(client_df['customer_id'].tolist())]
        self.client_df = client_df
        self.product_df = product_df   
        self.test_size = test_size

    def split_data(self, finalize=False):
        sales = self.interaction_df
        sales = sales[sales['price']>0].copy()
        sales = sales.sort_values(by='t_dat').reset_index(drop=True).copy()
        if finalize==True:
            train_df = sales
            test_df = sales
        else:
            test_df = sales.tail(int(len(sales)*self.test_size))
            train_df = sales.head(len(sales)-len(test_df))
        df = train_df[['customer_id', 'article_id', 'price']].groupby(['customer_id', 'article_id']).agg('sum')
        test = test_df[['customer_id', 'article_id', 'price']].groupby(['customer_id', 'article_id']).agg('sum')
        df.reset_index(level=0, inplace=True)
        df.reset_index(level=0, inplace=True)
        test.reset_index(level=0, inplace=True)
        test.reset_index(level=0, inplace=True)
        df.rename(columns={'article_id': 'itemID', 
                            'customer_id': 'userID', 
                            'price': 'count'}, inplace=True)
        test.rename(columns={'article_id': 'itemID', 
                            'customer_id': 'userID', 
                            'price': 'count'}, inplace=True)
        if type(filter) is list:
            df = df[~df['itemID'].isin(filter)].copy()
        print('TRAIN SHAPE:', train_df.shape)
        print('TEST SHAPE:', test_df.shape)
        
        return df, test

    def preprocess_interaction(self, df):


        # add metadata
        df = df.merge(self.product_df[['article_id', 'colour_group_code', 'product_type_no', 'product_code', 
                                       'section_no', 'department_no', 'graphical_appearance_no', 'index_group_no',
                                       'garment_group_no']],
                how='left',
                left_on='itemID',
                right_on='article_id').merge(self.client_df[['customer_id', 'age']],
                                                    how='left', 
                                                    left_on='userID', 
                                                    right_on='customer_id').copy()
        _df = df.drop(['article_id', 'customer_id'], axis=1)

        # swap columns cuz LightFM needs this format
        cols = list(_df.columns)
        cols.remove('userID')
        _df = _df[['userID', *cols]].copy()

        _df.dropna(inplace=True)
        
        # smooth count
        _df[['count']]=_df[['count']].applymap(lambda x: np.log2(x)+1)
        return _df

    def get_user_item_dic(self):
        # convert userID & itemID to id
        id_to_user = dict(enumerate(self.client_df['customer_id'].unique()))
        id_to_item = dict(enumerate(self.product_df['article_id'].unique()))
        user_to_id = dict((v,k) for k,v in id_to_user.items())
        item_to_id = dict((v,k) for k,v in id_to_item.items())
        return {'id_to_user': id_to_user,
                'id_to_item': id_to_item,
                'user_to_id': user_to_id,
                'item_to_id': item_to_id}
    
    def create_dataset(self):
        dataset = Dataset()
        
        item_features = self.product_df['colour_group_code'].unique().tolist()+self.product_df['product_type_no'].unique().tolist()\
        +self.product_df['product_code'].unique().tolist()+self.product_df['section_no'].unique().tolist()\
        +self.product_df['department_no'].unique().tolist()+self.product_df['graphical_appearance_no'].unique().tolist()\
        +self.product_df['index_group_no'].unique().tolist()+self.product_df['garment_group_no'].unique().tolist()

        dataset.fit(users=self.client_df['customer_id'].unique(), 
                    items=self.product_df['article_id'].unique(), 
                    user_features=self.client_df['age'].unique().tolist()+\
                    self.client_df['postal_code'].unique().tolist(),
                    item_features= item_features,
#                     item_features=self.product_df['colour_group_code'].unique().tolist(),
                   )
        
        return dataset

    def create_sparse_matrix(self, df):       
#         return self.create_dataset().build_interactions(df.values)
#         return self.create_dataset().build_interactions(df.iloc[:, 0:3].values)
        return self.create_dataset().build_interactions(df.iloc[:, 0:2].values)



    def create_user_item_features(self):
        item_features = self.create_dataset().build_item_features(
            (x, y) for x,y in zip(self.product_df['article_id'], 
                                  self.product_df[['colour_group_code', 'product_type_no', 'product_code', 
                                       'section_no', 'department_no', 'graphical_appearance_no', 'index_group_no',
                                       'garment_group_no']].values))
#             (x, y) for x,y in zip(self.product_df['article_id'], self.product_df[['colour_group_code']].values))
            

        user_features = self.create_dataset().build_user_features(
#             (x, y) for x,y in zip(self.client_df['customer_id'], self.client_df[['age']].values))
            
            (x, y) for x,y in zip(self.client_df['customer_id'], self.client_df[['age', 'postal_code']].values))
        
        return {'item_features': item_features, 'user_features': user_features}

    def process_data(self, finalize=False):
        train_df, test_df = self.split_data(finalize=finalize)
        df = self.preprocess_interaction(train_df)
        dic = self.get_user_item_dic()
        spm, weights = self.create_sparse_matrix(df)
        features = self.create_user_item_features()
        result = {'interaction': spm,
                'weights': weights,
                'test_df': test_df,
                **dic,
                **features}
        print(f'Use {list(result.keys())} keys to access data')
        return result
    
class RecModel():
    def __init__(self, loss, lr, n_component, n_epoch, n_thread, i_alpha, u_alpha, seed=42, verbose=True):
        self.loss = loss
        self.lr = lr
        # no of latent factors
        self.n_component = n_component
        self.n_epoch = n_epoch
        # no of threads to fit model
        self.n_thread = n_thread
        # regularisation for both user and item features
        self.i_alpha = i_alpha
        self.u_alpha = u_alpha
        self.seed = seed
        self.verbose = verbose
#         self.n=n,
#         self.k=k,
#         self.max_sampled = max_sampled,

        self.model = LightFM(loss=self.loss, 
                             no_components=self.n_component, 
                             learning_rate=self.lr, 
                             item_alpha = self.i_alpha,
                             user_alpha = self.u_alpha,
#                              max_sampled = self.max_sampled,
#                              n = self.n,
#                              k = self.k,
#                              learning_schedule = self.learning_schedule,
#                              epsilon = self.epsilon,
#                              rho = self.rho,
                             random_state=np.random.RandomState(self.seed),
                            )
        
    def summary(self):
        return pd.DataFrame({'Loss': self.loss,
                             'Learning Rate': self.lr,
                             'Latent Factors': self.n_component,
                             'Epochs': self.n_epoch,
                             'Threads': self.n_thread,
                             'Item Regularisation': self.i_alpha,
                             'User Regularisation': self.u_alpha,
                             'Seed': self.seed,
                             'Verbose': str(self.verbose)}, index=['Detail']).transpose()
        
    def fit_model(self, interactions, weights, item_features, user_features):
        md = self.model
        md.fit(interactions=interactions,
                         epochs=self.n_epoch, 
                         num_threads = self.n_thread,
                         item_features=item_features,
                         user_features=user_features,
                         sample_weight=weights,
                         verbose=self.verbose)
          
        return md

    def recommend_with_user_id(self, model, user_id, user_to_id, item_features, user_features, 
                               id_to_item, n_items=90000, k=12):
        
        pred = pd.Series(model.predict(user_ids=user_to_id[user_id],
                item_ids=np.arange(n_items),
                item_features=item_features,
                user_features=user_features)).sort_values(ascending=False).head(k).index.tolist()
        return [id_to_item[id] for id in pred]

        
    def get_scores(self, test_df, model, user_to_id, id_to_item, item_features, 
                   user_features, calculate_personalization=False, k=12):
        true_vals = []
        preds = []
        for i in tqdm(range(len(test_df['userID'].unique()))):
            user = test_df['userID'].unique()[i]
            true_vals.append(test_df[test_df['userID']==user]['itemID'].tolist())
            preds.append(self.recommend_with_user_id(model=model, 
                                                    user_id=user, 
                                                    user_to_id=user_to_id, 
                                                    item_features=item_features, 
                                                    user_features=user_features, 
                                                    id_to_item=id_to_item, 
                                                    k=k))
        precision = recmetrics.recommender_precision(predicted=preds, actual=true_vals)
        recall = recmetrics.recommender_recall(predicted=preds, actual=true_vals)
        mark = recmetrics.mark(predicted=preds, actual=true_vals, k=k)
        catalog = list(id_to_item.values())
        coverage = recmetrics.prediction_coverage(predicted=preds, catalog=catalog)
        result = pd.DataFrame(columns=['precision', 'recall', 'mark', 'coverage'], 
                                index=[f'{model.loss.upper()} Loss'])
        result['precision'] = precision
        result['recall'] = recall
        result['mark'] = mark
        result['coverage'] = coverage
        if calculate_personalization:
            personalization = recmetrics.personalization(predicted=preds)
            result['personalization'] = personalization
        return result

In [28]:
dp = DataProc(df, customers, articles)
dp

<__main__.DataProc at 0x7fbc56df1c50>

In [29]:
%%time
result = dp.process_data(finalize=True)


TRAIN SHAPE: (3244558, 5)
TEST SHAPE: (3244558, 5)
Use ['interaction', 'weights', 'test_df', 'id_to_user', 'id_to_item', 'user_to_id', 'item_to_id', 'item_features', 'user_features'] keys to access data
CPU times: user 56.6 s, sys: 2.12 s, total: 58.7 s
Wall time: 58.7 s


In [30]:
# result['test_df']

In [31]:
# a = pd.DataFrame(result['item_features'])
# a

In [32]:
# print(len(a.iloc[0].values))
# print(a.iloc[0])
# print(a.iloc[1])

In [33]:
# b = pd.DataFrame(result['user_features'])
# b

In [34]:
rm = RecModel(loss='warp', 
              lr=0.05, 
              n_component=50, 
              n_epoch=200, #40
              n_thread=32, 
              i_alpha=1e-5, 
              u_alpha=1e-5,
#               n=10,
#               k=500,
#               max_sampled=5,
#               learning_schedule='adadelta',
#               epsilon=0.08,
#               rho=0.85,
             )

rm.summary()

Unnamed: 0,Detail
Loss,warp
Learning Rate,0.05
Latent Factors,50
Epochs,200
Threads,32
Item Regularisation,0.00001
User Regularisation,0.00001
Seed,42
Verbose,True


In [35]:
model = rm.fit_model(interactions=result['interaction'],
                     weights=result['weights'],
                     item_features=result['item_features'],
                     user_features=result['user_features'],
                    )

Epoch: 100%|██████████| 200/200 [48:43<00:00, 14.62s/it]


In [36]:
def rec(x):
    pred = rm.recommend_with_user_id(model=model, 
                                     user_id=x, 
                                     user_to_id=result['user_to_id'], 
                                     item_features=result['item_features'], 
                                     user_features=result['user_features'], 
                                     id_to_item=result['id_to_item'])
    return ' '.join(i for i in pred)

In [37]:
pred_df = sample_submission.copy()
pred_df['prediction'] = pred_df['customer_id'].progress_apply(lambda x: rec(x))

  0%|          | 0/1371980 [00:00<?, ?it/s]

In [38]:
pred_df.to_csv('submission.csv', index=False)

In [39]:
pred_df.isna().sum()

customer_id    0
prediction     0
dtype: int64

In [40]:
import joblib
joblib.dump(model, 'rec_model.joblib')

['rec_model.joblib']

In [41]:
# lightfm.evaluation.precision_at_k(model, test_interactions, train_interactions=None, k=10, user_features=None, item_features=None, preserve_rows=False, num_threads=1, check_intersections=True)