# Feature Engineering

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

user_logs = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/PR/Final/user_log.csv')
user_info = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/PR/Final/user_info.csv')
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/PR/Final/train.csv')

In [4]:
user_logs = user_logs.rename(columns={'seller_id': 'merchant_id'})
# Merge Data Frames
df = user_logs.merge(user_info, on='user_id')

In [5]:
user_logs.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,219316,472,737,801,5890.0,1018,0
1,69247,471,1266,4614,4331.0,619,2
2,69247,471,1266,4614,4331.0,619,0
3,216952,352,991,3527,869.0,626,0
4,216952,352,991,3527,869.0,626,0


## User Behavior Features

In [6]:
# Number of unique categories the user has interacted with
unique_categories = user_logs.groupby('user_id')['cat_id'].nunique()

# Number of unique merchants the user has interacted with
unique_merchants = user_logs.groupby('user_id')['merchant_id'].nunique()

# Number of unique brands the user has interacted with
unique_brands = user_logs.groupby('user_id')['brand_id'].nunique()

# Click-to-purchase ratio for each user
clicks = user_logs[user_logs['action_type'] == 0].groupby('user_id').size()
purchases = user_logs[user_logs['action_type'] == 2].groupby('user_id').size()
click_to_purchase_ratio = purchases / clicks

# Combine the results into the user_summary DataFrame
user_summary = pd.concat([unique_categories, unique_merchants, unique_brands, click_to_purchase_ratio], axis=1)

user_summary.columns = ['unique_categories', 'unique_merchants', 'unique_brands', 'click_to_purchase_ratio']

# Reset the index and fill NaN values with 0
user_summary.reset_index(inplace=True)
user_summary.fillna(0, inplace=True)

In [7]:
user_summary.head(500)

Unnamed: 0,user_id,unique_categories,unique_merchants,unique_brands,click_to_purchase_ratio
0,313,1,1,1,0.000000
1,502,1,1,1,0.000000
2,521,1,1,1,0.000000
3,648,1,1,1,0.000000
4,740,1,1,1,0.000000
...,...,...,...,...,...
495,53998,1,1,1,0.000000
496,54272,1,1,1,0.000000
497,54559,1,1,1,0.000000
498,54779,1,1,1,0.000000


In [8]:
print(user_summary.shape)

(3929, 5)


## Merchant-Related Features

In [9]:
# Number of unique users that have interacted with the merchant
unique_users = user_logs.groupby('merchant_id')['user_id'].nunique()

# Merchant's overall click-to-purchase ratio
merchant_purchases = user_logs[user_logs['action_type'] == 2].groupby('merchant_id').size()

# Merchant's overall add-to-cart-to-purchase ratio
merchant_add_to_cart = user_logs[user_logs['action_type'] == 1].groupby('merchant_id').size()
merchant_add_to_cart_to_purchase_ratio = merchant_purchases / merchant_add_to_cart

# Merchant's overall add-to-favorite-to-purchase ratio
merchant_add_to_favorites = user_logs[user_logs['action_type'] == 3].groupby('merchant_id').size()
merchant_add_to_favorites_to_purchase_ratio = merchant_purchases / merchant_add_to_favorites

# Combine the results into a new DataFrame
merchant_summary = pd.DataFrame({'unique_users': unique_users, 
                                 'add_to_cart_to_purchase_ratio': merchant_add_to_cart_to_purchase_ratio,
                                 'add_to_favorites_to_purchase_ratio': merchant_add_to_favorites_to_purchase_ratio}).fillna(0)

# Reset the index
merchant_summary.reset_index(inplace=True)

In [10]:
merchant_summary.head(100)

Unnamed: 0,merchant_id,unique_users,add_to_cart_to_purchase_ratio,add_to_favorites_to_purchase_ratio
0,20,3,0.0,0.00
1,28,10,0.0,0.00
2,59,1,0.0,0.00
3,99,1,0.0,0.00
4,121,9,0.0,0.00
...,...,...,...,...
95,3202,5,0.0,0.00
96,3324,5,0.0,0.00
97,3334,11,0.0,0.00
98,3432,254,0.0,1.08


## Interaction Features

In [11]:
# Convert time_stamp to day of the year
user_logs['day_of_year'] = user_logs['time_stamp'].apply(lambda x: pd.to_datetime(x, format='%m%d').timetuple().tm_yday)

# Number of interactions between each user and merchant
interaction_count = user_logs.groupby(['user_id', 'merchant_id']).size().reset_index(name='interaction_count')

# Time since the last interaction between each user and merchant
last_interaction = user_logs.groupby(['user_id', 'merchant_id'])['day_of_year'].max().reset_index(name='last_interaction')

# Time since the first interaction between each user and merchant
first_interaction = user_logs.groupby(['user_id', 'merchant_id'])['day_of_year'].min().reset_index(name='first_interaction')

# Average time between interactions for each user and merchant
average_time_between_interactions = user_logs.groupby(['user_id', 'merchant_id'])['day_of_year'].apply(lambda x: x.sort_values().diff().mean()).fillna(0).reset_index(name='average_time_between_interactions')

# Number of interactions in each action type between each user and merchant
action_counts = user_logs.groupby(['user_id', 'merchant_id', 'action_type']).size().unstack(fill_value=0).reset_index().rename(columns={0: 'clicks', 1: 'add_to_cart', 2: 'purchases', 3: 'add_to_favorites'})

# Combine the results into a new DataFrame
interaction_summary = interaction_count.merge(last_interaction, on=['user_id', 'merchant_id'])
interaction_summary = interaction_summary.merge(first_interaction, on=['user_id', 'merchant_id'])
interaction_summary = interaction_summary.merge(average_time_between_interactions, on=['user_id', 'merchant_id'])
interaction_summary = interaction_summary.merge(action_counts, on=['user_id', 'merchant_id'])

# Calculate time since the last and first interaction (in days)
interaction_summary['days_since_last_interaction'] = interaction_summary['last_interaction'].max() - interaction_summary['last_interaction']
interaction_summary['days_since_first_interaction'] = interaction_summary['last_interaction'].max() - interaction_summary['first_interaction']

# Drop unnecessary columns
interaction_summary.drop(['last_interaction', 'first_interaction'], axis=1, inplace=True)

In [12]:
interaction_summary.head(100)

Unnamed: 0,user_id,merchant_id,interaction_count,average_time_between_interactions,clicks,add_to_cart,purchases,add_to_favorites,days_since_last_interaction,days_since_first_interaction
0,313,442,2,0.000000,2,0,0,0,172,172
1,502,1963,1,0.000000,1,0,0,0,4,4
2,521,801,2,0.000000,2,0,0,0,26,26
3,648,801,1,0.000000,1,0,0,0,49,49
4,740,1102,4,8.666667,3,0,0,1,113,139
...,...,...,...,...,...,...,...,...,...,...
95,11572,3432,2,0.000000,2,0,0,0,1,1
96,11791,651,2,2.000000,2,0,0,0,0,2
97,11852,1713,1,0.000000,1,0,0,0,173,173
98,11952,651,7,0.833333,7,0,0,0,0,5


## Demographic Features

In [30]:
# Create a dictionary to map age_range to its corresponding age group
age_range_map = {1: '<18', 2: '18-24', 3: '25-29', 4: '30-34', 5: '35-39', 6: '40-49', 7: '>50', 8: '>50', 0: 'unknown'}

# Create a dictionary to map gender to its corresponding gender group
gender_map = {0: 'Female', 1: 'Male', 2: 'Unknown'}

# Combine age_range and gender into a new categorical feature
user_info['demographic_group'] = user_info['age_range'].map(age_range_map) + '_' + user_info['gender'].map(gender_map)

# Merge user_info with user_logs
user_logs_demographic = user_logs.merge(user_info[['user_id', 'age_range', 'gender']], on='user_id')

# Calculate the number of interactions for each demographic group per merchant
demographic_interactions = user_logs_demographic.groupby('user_id').size().reset_index(name='age_range_to_gender_interaction_count')

In [31]:
demographic_interactions.head(100)

Unnamed: 0,user_id,age_range_to_gender_interaction_count
0,313,1
1,502,1
2,521,1
3,648,1
4,740,16
...,...,...
95,11572,1
96,11791,4
97,11852,1
98,11952,4


## Temporal Features

In [15]:
import datetime

def extract_day_of_week(time_stamp):
    time_stamp_str = str(time_stamp).zfill(4)
    month, day = int(time_stamp_str[:2]), int(time_stamp_str[2:])
    return datetime.datetime(year=1900, month=month, day=day).weekday()

def extract_month(time_stamp):
    time_stamp_str = str(time_stamp).zfill(4)
    return int(time_stamp_str[:2])

user_logs['day_of_week'] = user_logs['time_stamp'].apply(extract_day_of_week)
user_logs['month'] = user_logs['time_stamp'].apply(extract_month)

# Calculate the number of interactions (click, add-to-cart, purchase, add-to-favorite) by each user per day of the week and month.
user_interactions_day_of_week = user_logs.groupby(['user_id', 'day_of_week', 'action_type']).size().unstack(fill_value=0).reset_index()
user_interactions_month = user_logs.groupby(['user_id', 'month', 'action_type']).size().unstack(fill_value=0).reset_index()

In [16]:
user_interactions_day_of_week.head(100)

action_type,user_id,day_of_week,0,1,2,3
0,313,2,2,0,0,0
1,502,2,1,0,0,0
2,521,1,2,0,0,0
3,648,6,1,0,0,0
4,740,0,1,0,0,0
...,...,...,...,...,...,...
95,10094,1,1,0,0,0
96,10160,1,2,0,0,1
97,10160,2,1,0,0,0
98,10234,6,1,0,0,0


In [37]:
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/PR/Final/train.csv')
user_logs = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/PR/Final/user_log.csv')
user_logs = user_logs.rename(columns={'seller_id': 'merchant_id'})

pd.set_option('display.max_columns', 31)

user_logs = user_logs.merge(user_summary, on='user_id', how='left')
user_logs = user_logs.merge(user_info, on='user_id', how='left')
user_logs = user_logs.merge(merchant_summary, on='merchant_id', how='left')
user_logs = user_logs.merge(interaction_summary, on=['user_id', 'merchant_id'], how='left')
user_logs = user_logs.merge(user_interactions_day_of_week, on='user_id', how='left')
user_logs = user_logs.merge(demographic_interactions, on='user_id', how='left')
user_logs = user_logs.drop('demographic_group', axis=1)
train = train.drop('merchant_id', axis=1)
user_logs = user_logs.merge(train, on='user_id', how='left')
user_logs = user_logs.drop_duplicates()
user_logs.fillna(0, inplace=True)

In [38]:
print(user_logs.head(10))

    user_id  item_id  cat_id  merchant_id  brand_id  time_stamp  action_type  \
0    219316      472     737          801    5890.0        1018            0   
1     69247      471    1266         4614    4331.0         619            2   
3     69247      471    1266         4614    4331.0         619            0   
5    216952      352     991         3527     869.0         626            0   
8     55440      432    1228         1365    2297.0         527            0   
10   265624      428     946         1963    6109.0        1019            0   
11    53257      467     151         1056    1439.0        1111            0   
12   211279      362      36         3334    6352.0        1030            0   
13   218826      467     151         1056    1439.0         618            0   
14   218826      467     151         1056    1439.0         618            0   

    unique_categories  unique_merchants  unique_brands  \
0                   1                 1              1   
1  

In [36]:
print(user_logs.shape)

(8016, 31)


In [39]:
# Convert the DataFrame to a CSV file
user_logs.to_csv('updated_train.csv', index=False)