In [15]:
import pandas as pd

path = "recommender-system-2022-challenge-polimi-data/interactions_and_impressions.csv"

URM_all = pd.read_csv(filepath_or_buffer=path,
                                           sep=",",
                                           header=1,
                                           engine='python',
                                           names=['UserId', 'ItemId', 'ImpressionList', 'Data'])

content_type = pd.read_csv("recommender-system-2022-challenge-polimi-data/data_ICM_type.csv")
content_length = pd.read_csv("recommender-system-2022-challenge-polimi-data/data_ICM_length.csv")

In [16]:
URM_all.head(15)

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,21,,0
1,0,21,,0
2,0,21,20212223242526272829,0
3,0,21,,1
4,0,21,,1
5,0,21,,1
6,0,21,,1
7,0,21,,1
8,0,21,,1
9,0,21,,1


In [17]:
content_type

Unnamed: 0,item_id,feature_id,data
0,0,1,1
1,1,3,1
2,2,4,1
3,3,1,1
4,4,3,1
...,...,...,...
23086,27963,1,1
23087,27964,2,1
23088,27965,1,1
23089,27966,1,1


In [18]:
content_length

Unnamed: 0,item_id,feature_id,data
0,0,0,1
1,1,0,1
2,2,0,21
3,3,0,1
4,4,0,1
...,...,...,...
23086,27963,0,1
23087,27964,0,1
23088,27965,0,1
23089,27966,0,1


In [19]:
print("Unique items in ICM_length are {}".format(len(content_length.item_id.unique())))
print("Unique items in ICM_content are {}".format(len(content_type.item_id.unique())))

content_type = content_type.drop(["data"], axis=1) # drop the data column on type
content_length = content_length.drop(["feature_id"], axis=1) # drop the featureId column on type

ICM_all = content_type.merge(content_length, on="item_id", how="left")
ICM_all.rename(columns={'data': 'length', "feature_id": "content_type"}, inplace=True)


Unique items in ICM_length are 23091
Unique items in ICM_content are 23091


In [20]:
ICM_all

Unnamed: 0,item_id,content_type,length
0,0,1,1
1,1,3,1
2,2,4,21
3,3,1,1
4,4,3,1
...,...,...,...
23086,27963,1,1
23087,27964,2,1
23088,27965,1,1
23089,27966,1,1


## Mapping

In [21]:
# UserIDs mapping
mapped_id, original_id = pd.factorize(URM_all["UserId"].unique())
user_original_Id_to_index = pd.Series(mapped_id, index=original_id)
user_original_Id_to_index   # remains the same

0            0
1            1
2            2
3            3
4            4
         ...  
41624    41624
41625    41625
41626    41626
41627    41627
41628    41628
Length: 41629, dtype: int64

In [22]:
# ItemIDs mapping
mapped_id, original_id = pd.factorize(ICM_all.item_id.unique())
item_original_ID_to_index = pd.Series(mapped_id, index=original_id)
item_original_ID_to_index

0            0
1            1
2            2
3            3
4            4
         ...  
27963    23086
27964    23087
27965    23088
27966    23089
27967    23090
Length: 23091, dtype: int64

In [23]:
URM_all["UserId"] = URM_all["UserId"].map(user_original_Id_to_index)
URM_all

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,21,,0
1,0,21,,0
2,0,21,20212223242526272829,0
3,0,21,,1
4,0,21,,1
...,...,...,...,...
5826500,41628,20448,,0
5826501,41628,20896,,1
5826502,41628,21506,,1
5826503,41628,22882,,0


In [24]:
URM_all["ItemId"] = URM_all["ItemId"].map(item_original_ID_to_index)
URM_all

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,19.0,,0
1,0,19.0,,0
2,0,19.0,20212223242526272829,0
3,0,19.0,,1
4,0,19.0,,1
...,...,...,...,...
5826500,41628,16364.0,,0
5826501,41628,,,1
5826502,41628,,,1
5826503,41628,18304.0,,0


In [25]:
ICM_all_mapped = ICM_all.copy()
ICM_all_mapped["item_id"] = ICM_all_mapped["item_id"].map(item_original_ID_to_index)
ICM_all_mapped

Unnamed: 0,item_id,content_type,length
0,0,1,1
1,1,3,1
2,2,4,21
3,3,1,1
4,4,3,1
...,...,...,...
23086,23086,1,1
23087,23087,2,1
23088,23088,1,1
23089,23089,1,1


#### Create a submission

In [26]:
submission_df = pd.DataFrame({"user_id": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1], "item_id": [34, 55, 77, 1, 43, 43, 9, 1888, 90, 11111]})
submission_df

Unnamed: 0,user_id,item_id
0,0,34
1,0,55
2,0,77
3,0,1
4,0,43
5,1,43
6,1,9
7,1,1888
8,1,90
9,1,11111


In [27]:
rich_df = submission_df.merge(ICM_all_mapped, how="left")
rich_df

Unnamed: 0,user_id,item_id,content_type,length
0,0,34,3,1
1,0,55,1,1
2,0,77,1,1
3,0,1,3,1
4,0,43,4,9
5,1,43,4,9
6,1,9,1,3
7,1,1888,1,1
8,1,90,1,1
9,1,11111,4,33


In [36]:
new_item_id = 1888
old_item_id = item_original_ID_to_index[item_original_ID_to_index==new_item_id].index.item()
print("ItemId which is now {}, was once {}".format(new_item_id, old_item_id))
print("Current ItemId {}, in the ICM has length {} and type {}".format(new_item_id, ICM_all[ICM_all["item_id"]==old_item_id].length.item(), ICM_all[ICM_all["item_id"]==old_item_id].content_type.item()))

ItemId which is now 1888, was once 2355
Current ItemId 1888, in the ICM has length 1 and type 1


#### Then get the old IDs as before

-----------------