### Read Me First

This notebook examines the main table used by the embedding code in the MovieLens example (`ratings.csv`). This file is also known as the user-item matrix, which is a core input for recommendation systems.

The purpose of examining the ratings table is to understand it better, so that we can come up with a similar construct for restaurant menus.

In [1]:
import numpy as np
import pandas as pd
import os

from pathlib import Path

In [4]:
# Parent of the current working directory; the data folder is looked up relative to it.
RLRS_root_location = Path.cwd() / "../"

# Load the MovieLens ratings table.
ratings_csv_path = f'{RLRS_root_location}/data/ml-1m-csv/ratings.csv'
ratings = pd.read_csv(ratings_csv_path)

In [7]:
# List the column names of the ratings table.
ratings.columns

Index(['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype='object')

In [18]:
# Build a "<UserID>-<MovieID>" key for every rating row.
ratings['pair_id'] = ratings['UserID'].astype(str).str.cat(ratings['MovieID'].astype(str), sep="-")

In [20]:
# Show the user-movie key column.
ratings['pair_id']

0             1-1193
1              1-661
2              1-914
3             1-3408
4             1-2355
             ...    
1000204    6040-1091
1000205    6040-1094
1000206     6040-562
1000207    6040-1096
1000208    6040-1097
Name: pair_id, Length: 1000209, dtype: object

In [21]:
# Number of distinct user-movie keys (compare with the row count to detect duplicate ratings).
ratings['pair_id'].nunique(dropna=False)

1000209

In [22]:
# Total number of rating rows.
ratings.shape[0]

1000209

In [23]:
# Number of distinct movies that appear in the ratings table.
ratings['MovieID'].nunique(dropna=False)

3706

In [24]:
# Number of distinct users that appear in the ratings table.
ratings['UserID'].nunique(dropna=False)

6040

In [25]:
# Mean number of ratings per user: total rows divided by distinct users
# (each row is one rating of one item).
ratings.shape[0] / ratings['UserID'].nunique(dropna=False)

165.5975165562914

In [28]:
# Largest rating value present in the table.
ratings['Rating'].max()

5

In [29]:
# Smallest rating value present in the table.
ratings['Rating'].min()

1

### Note: the cells below record my experiments with ChatGPT to obtain code for generating a usable user-item matrix/table

Below is an example from ChatGPT that generates a long-format user-item table of order frequencies.

In [32]:
# Toy interaction data: {user: {item: order frequency}}
user_item_interactions = {
    'user1': {'item1': 3, 'item2': 1, 'item3': 0},
    'user2': {'item1': 0, 'item2': 2, 'item3': 1},
    'user3': {'item1': 1, 'item2': 0, 'item3': 4},
}

# Flatten the nested mapping into three parallel columns,
# one entry per (user, item) pair, in insertion order.
user_ids, item_ids, frequencies = [], [], []
for user, item_counts in user_item_interactions.items():
    user_ids.extend([user] * len(item_counts))
    item_ids.extend(item_counts.keys())
    frequencies.extend(item_counts.values())

# Assemble the long-format table from the parallel columns
long_format_columns = {
    'user_id': user_ids,
    'item_id': item_ids,
    'frequency': frequencies,
}
df = pd.DataFrame(long_format_columns)

print("User-Item Matrix (Order Frequency):", df, sep="\n")

User-Item Matrix (Order Frequency):
  user_id item_id  frequency
0   user1   item1          3
1   user1   item2          1
2   user1   item3          0
3   user2   item1          0
4   user2   item2          2
5   user2   item3          1
6   user3   item1          1
7   user3   item2          0
8   user3   item3          4


In [33]:
import pandas as pd
from datetime import datetime

# Toy interaction data: every (user, item) pair carries an order frequency and a
# timestamp string; pairs without a timestamp store None.
user_item_interactions = {
    'user1': {
        'item1': {'frequency': 3, 'timestamp': '2024-03-15 10:30:00'},
        'item2': {'frequency': 1, 'timestamp': '2024-03-15 11:45:00'},
        'item3': {'frequency': 0, 'timestamp': None},
    },
    'user2': {
        'item1': {'frequency': 0, 'timestamp': '2024-03-15 09:00:00'},
        'item2': {'frequency': 2, 'timestamp': '2024-03-15 10:00:00'},
        'item3': {'frequency': 1, 'timestamp': '2024-03-15 12:00:00'},
    },
    'user3': {
        'item1': {'frequency': 1, 'timestamp': '2024-03-15 11:00:00'},
        'item2': {'frequency': 0, 'timestamp': None},
        'item3': {'frequency': 4, 'timestamp': '2024-03-15 13:30:00'},
    },
}

# Flatten the nested mapping into four parallel columns,
# one entry per (user, item) pair, in insertion order.
user_ids, item_ids, frequencies, timestamps = [], [], [], []
for user, interactions in user_item_interactions.items():
    user_ids.extend([user] * len(interactions))
    item_ids.extend(interactions.keys())
    frequencies.extend(details['frequency'] for details in interactions.values())
    timestamps.extend(details['timestamp'] for details in interactions.values())

# Assemble the long-format table, then parse the timestamp strings
# (None becomes NaT) so the column has a datetime dtype.
df = pd.DataFrame({
    'user_id': user_ids,
    'item_id': item_ids,
    'frequency': frequencies,
    'timestamp': timestamps,
}).assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))

print("User-Item Matrix with Timestamps:", df, sep="\n")


User-Item Matrix with Timestamps:
  user_id item_id  frequency           timestamp
0   user1   item1          3 2024-03-15 10:30:00
1   user1   item2          1 2024-03-15 11:45:00
2   user1   item3          0                 NaT
3   user2   item1          0 2024-03-15 09:00:00
4   user2   item2          2 2024-03-15 10:00:00
5   user2   item3          1 2024-03-15 12:00:00
6   user3   item1          1 2024-03-15 11:00:00
7   user3   item2          0                 NaT
8   user3   item3          4 2024-03-15 13:30:00


In [34]:
import pandas as pd
import numpy as np
from datetime import datetime

# Toy interaction data: every (user, item) pair carries an order frequency and a
# timestamp string; only the frequency is used in this cell.
user_item_interactions = {
    'user1': {
        'item1': {'frequency': 3, 'timestamp': '2024-03-15 10:30:00'},
        'item2': {'frequency': 1, 'timestamp': '2024-03-15 11:45:00'},
        'item3': {'frequency': 0, 'timestamp': None},
    },
    'user2': {
        'item1': {'frequency': 0, 'timestamp': '2024-03-15 09:00:00'},
        'item2': {'frequency': 2, 'timestamp': '2024-03-15 10:00:00'},
        'item3': {'frequency': 1, 'timestamp': '2024-03-15 12:00:00'},
    },
    'user3': {
        'item1': {'frequency': 1, 'timestamp': '2024-03-15 11:00:00'},
        'item2': {'frequency': 0, 'timestamp': None},
        'item3': {'frequency': 4, 'timestamp': '2024-03-15 13:30:00'},
    },
}

# Flatten the nested mapping into three parallel columns,
# one entry per (user, item) pair, in insertion order.
user_ids, item_ids, frequencies = [], [], []
for user, interactions in user_item_interactions.items():
    user_ids.extend([user] * len(interactions))
    item_ids.extend(interactions.keys())
    frequencies.extend(details['frequency'] for details in interactions.values())

# Long-format table: one row per (user, item) pair
df = pd.DataFrame({
    'user_id': user_ids,
    'item_id': item_ids,
    'frequency': frequencies,
})

# Wide user-item matrix: users as rows, items as columns, missing pairs filled with 0
user_item_matrix = df.pivot_table(
    index='user_id',
    columns='item_id',
    values='frequency',
    fill_value=0,
)

# Singular value decomposition of the user-item matrix
U, S, Vt = np.linalg.svd(user_item_matrix)

print("Left Singular Vectors (Users):", U, sep="\n")
print("\nSingular Values:", S, sep="\n")
print("\nRight Singular Vectors (Items):", Vt, sep="\n")


Left Singular Vectors (Users):
[[-0.34429397  0.92075408  0.18350363]
 [-0.29393239  0.0799174  -0.95247938]
 [-0.89166441 -0.38187056  0.24312437]]

Singular Values:
[4.41327018 2.98820697 1.89569654]

Right Singular Vectors (Items):
[[-0.43608169 -0.21121724 -0.87476856]
 [ 0.79659532  0.36161782 -0.4844259 ]
 [ 0.418651   -0.90808581  0.01055976]]


In [35]:
# Display the pivoted user-item frequency matrix built in the previous cell.
user_item_matrix

item_id,item1,item2,item3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
user1,3.0,1.0,0.0
user2,0.0,2.0,1.0
user3,1.0,0.0,4.0


In [36]:
# Example DataFrame with order_id and option_ids columns
data = {
    'order_id': [1, 2, 3],
    'option_ids': ['101|102|103', '201|202', '301']
}

df = pd.DataFrame(data)


def unnest_options(row):
    """Expand a single order row into one row per option id.

    Kept for callers that process one row at a time; flattening a whole
    frame should go through ``unnest_all_options`` instead.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'order_id'`` and a ``'|'``-separated string under
        ``'option_ids'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``order_id`` and ``option_id``, one row per option id, with
        the order id repeated on every row.
    """
    option_ids = row['option_ids'].split('|')
    return pd.DataFrame({'order_id': [row['order_id']] * len(option_ids),
                         'option_id': option_ids})


def unnest_all_options(orders):
    """Flatten every order in ``orders`` into one row per option id.

    Vectorised replacement for concatenating ``unnest_options`` results row by
    row: it avoids building one DataFrame per order and, unlike ``pd.concat``
    on an empty list, it also works when ``orders`` has no rows.

    Parameters
    ----------
    orders : pandas.DataFrame
        Must contain ``order_id`` and a ``'|'``-separated string column
        ``option_ids``.

    Returns
    -------
    pandas.DataFrame
        Columns ``order_id`` and ``option_id`` with a fresh 0..n-1 index.
    """
    return (
        orders
        # Turn each '|'-separated string into a list of option ids ...
        .assign(option_id=orders['option_ids'].str.split('|'))
        # ... and give every list element its own row.
        .explode('option_id')
        .loc[:, ['order_id', 'option_id']]
        .reset_index(drop=True)
    )


new_df = unnest_all_options(df)

print("Flattened DataFrame:")
print(new_df)


Flattened DataFrame:
   order_id option_id
0         1       101
1         1       102
2         1       103
3         2       201
4         2       202
5         3       301
