In [1]:
#Code by Morris Zieve

import pandas as pd

user_logs = pd.read_csv('Dataset/user_log.csv')
user_info = pd.read_csv('Dataset/user_info.csv')
train = pd.read_csv('Dataset/train.csv')

In [2]:
# Merge Data Frames
df = user_logs.merge(user_info, on='user_id')

In [3]:
train.head()

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0


In [4]:
# Total Number of Purchases
user_actions = df.groupby(['user_id', 'action_type']).size().unstack(fill_value=0)
user_actions.columns = ['clicks', 'add_to_cart', 'purchases', 'add_to_favorite']
user_actions.drop(columns=['clicks', 'add_to_cart', 'add_to_favorite'], inplace=True)

print(user_actions)

         purchases
user_id           
313              0
502              0
521              0
648              0
740              0
...            ...
423539           0
423630           0
424044           0
424102           0
424116           1

[3929 rows x 1 columns]


In [5]:
# User's favorite merchant, category, and brand
favorite_merchant = user_logs.groupby('user_id')['seller_id'].agg(lambda x: x.value_counts().index[0]).reset_index(name='favorite_merchant')
favorite_category = user_logs.groupby('user_id')['cat_id'].agg(lambda x: x.value_counts().index[0]).reset_index(name='favorite_category')

def get_favorite_brand(x):
    if x.value_counts().size == 0:
        return None
    return x.value_counts().index[0]

favorite_brand = user_logs.groupby('user_id')['brand_id'].agg(get_favorite_brand).reset_index(name='favorite_brand')

In [6]:
favorite_brand.head()

Unnamed: 0,user_id,favorite_brand
0,313,7090.0
1,502,6109.0
2,521,5890.0
3,648,5890.0
4,740,1214.0


In [7]:
# Average Interactions Per Merchant
average_interactions_per_merchant = user_logs.groupby('seller_id')['action_type'].count() / user_logs['seller_id'].nunique()

average_interactions_per_merchant.head(50)

#Investigate further.

seller_id
20       0.020548
28       0.102740
59       0.006849
99       0.006849
121      0.082192
152      0.089041
158      1.102740
167      0.102740
191      0.226027
215      0.027397
235      0.342466
262      0.239726
332      0.034247
333      0.006849
339      0.171233
442      0.417808
492      0.013699
501      0.246575
548      0.054795
628      0.006849
641      0.116438
651      3.534247
669      0.034247
677      0.020548
727      0.726027
776      0.006849
778      0.006849
795      0.041096
801     16.904110
808      0.109589
823      0.212329
830      0.123288
833      0.006849
849      0.013699
852      0.041096
900      0.006849
957      0.876712
962      0.205479
995      0.047945
1056     2.863014
1102     0.760274
1129     0.321918
1147     0.020548
1162     0.061644
1214     0.006849
1218     0.061644
1221     0.239726
1241     0.006849
1294     0.034247
1350     0.123288
Name: action_type, dtype: float64

In [9]:
# User Category Interaction Features: This involves creating features based on the user's interaction with the merchant, 
# such as the number of clicks, number of times added to cart, and number of purchases made from a particular merchant.

# count the total number of interactions between each user and 
total_merchant_interactions = user_logs.groupby(['user_id', 'seller_id', 'action_type']).size().reset_index(name='merchant_interactions')

# count the number of times each user has performed each action type for each merchant
counts = df.groupby(['user_id', 'seller_id', 'action_type']).size().reset_index().pivot_table(index=['user_id', 'seller_id'], columns='action_type', values=0, fill_value=0)
counts = counts.rename(columns={0: 'clicks', 1: 'add_to_cart', 2: 'purchases', 3: 'add_to_favourites'})

# merge the counts with the total interactions data
user_merchant = total_merchant_interactions.merge(counts, on=['user_id', 'seller_id'])

# display the results
user_merchant.head(50)
print(user_merchant.shape)

(4692, 8)


In [10]:
# Unqiue Item, Category, and Brand Interactions
unique_items = user_logs.groupby('user_id')['item_id'].nunique().reset_index(name='unique_items_interacted')
unique_categories = user_logs.groupby('user_id')['cat_id'].nunique().reset_index(name='unique_categories_interacted')
unique_brands = user_logs.groupby('user_id')['brand_id'].nunique().reset_index(name='unique_brands_interacted')

unique_interactions = unique_items
unique_interactions = unique_interactions.merge(unique_categories, on='user_id')
unique_interactions = unique_interactions.merge(unique_brands, on='user_id')

unique_interactions.head(150)

Unnamed: 0,user_id,unique_items_interacted,unique_categories_interacted,unique_brands_interacted
0,313,1,1,1
1,502,1,1,1
2,521,1,1,1
3,648,1,1,1
4,740,1,1,1
...,...,...,...,...
145,17280,1,1,1
146,17404,1,1,1
147,17496,1,1,1
148,17766,1,1,1


In [13]:
# Days Since First Interaction
user_logs['time_stamp'] = pd.to_datetime(user_logs['time_stamp'], format='%m%d')
first_interaction = user_logs.groupby('user_id')['time_stamp'].min().reset_index(name='first_interaction')
#first_seller = user_logs.groupby('user_id')['time_stamp'].min()
max_date = user_logs['time_stamp'].max()
first_interaction['days_since_first_interaction'] = (max_date - first_interaction['first_interaction']).dt.days

first_interaction.head(50)

# Insert info about which merchant this first interaction was with, so that we know if the user would be a repeat buyer of that merchant.

Unnamed: 0,user_id,first_interaction,days_since_first_interaction
0,313,1900-05-23,172
1,502,1900-11-07,4
2,521,1900-10-16,26
3,648,1900-09-23,49
4,740,1900-06-25,139
5,768,1900-09-13,59
6,776,1900-11-11,0
7,823,1900-09-22,50
8,997,1900-08-29,74
9,1045,1900-09-25,47


# Demographic Features

This involves creating features based on the user's demographic information, such as age range and gender. This can help identify patterns in user behavior based on their demographic characteristics.

Merchant: male to female ratio, age to merchant, combine age and gender per merchant. 

In [14]:
# create age group feature
user_info['age_group'] = pd.cut(user_info['age_range'], bins=[0, 18, 24, 39, float('inf')], labels=['young', 'young adult', 'adult', 'senior'])

# create gender feature
user_info['is_female'] = (user_info['gender'] == 0).astype(int)

# create age and gender interaction feature
user_info['female_young_adult'] = ((user_info['gender'] == 0) & (user_info['age_range'] == 2)).astype(int)
user_info['male_adult'] = ((user_info['gender'] == 1) & (user_info['age_range'] == 4)).astype(int)

# create age range frequency feature
age_range_frequency = user_info['age_range'].value_counts().to_dict()
user_info['age_range_frequency'] = user_info['age_range'].map(age_range_frequency)

# create gender frequency feature
gender_frequency = user_info['gender'].value_counts().to_dict()
user_info['gender_frequency'] = user_info['gender'].map(gender_frequency)

# create age and gender frequency feature
age_gender_frequency = user_info.groupby(['age_range', 'gender']).size().reset_index(name='frequency')
user_profile = pd.merge(user_info, age_gender_frequency, on=['age_range', 'gender'], how='left')

user_info.head(50)

Unnamed: 0,user_id,age_range,gender,age_group,is_female,female_young_adult,male_adult,age_range_frequency,gender_frequency
0,376517,6.0,1.0,young,0,0,0,35464.0,121670.0
1,234512,5.0,0.0,young,1,0,0,40777.0,285638.0
2,344532,5.0,0.0,young,1,0,0,40777.0,285638.0
3,186135,5.0,0.0,young,1,0,0,40777.0,285638.0
4,30230,5.0,0.0,young,1,0,0,40777.0,285638.0
5,272389,6.0,1.0,young,0,0,0,35464.0,121670.0
6,281071,4.0,0.0,young,1,0,0,79991.0,285638.0
7,139859,7.0,0.0,young,1,0,0,6992.0,285638.0
8,198411,5.0,1.0,young,0,0,0,40777.0,121670.0
9,67037,4.0,1.0,young,0,0,1,79991.0,121670.0
