In [103]:
import pandas as pd

# ---------------------------------------------------------------------------
# Item features
# ---------------------------------------------------------------------------
# The item table is a header-less, tab-separated file; column names are
# supplied here in file order.
item_data_path = 'data/sushi3-2016/sushi3.idata'
item_columns = [
    'item_id',
    'name',
    'style',
    'major_group',
    'minor_group',
    'heaviness',
    'consumption_frequency',
    'normalized_price',
    'sell_frequency',
]
item_df = pd.read_csv(item_data_path, sep='\t', header=None, names=item_columns)

# Keep only the 10 items used in the paper.
# (To work with ids 0-99 instead, replace the list with `list(range(100))`.)
item_set_A_ids = [0, 1, 2, 3, 4, 6, 7, 8, 26, 29]
in_item_set_A = item_df['item_id'].isin(item_set_A_ids)
item_set_A_df = item_df[in_item_set_A]

# Columns to one-hot encode vs. columns passed through unchanged.
categorical_features = ['style', 'major_group', 'minor_group']
numerical_features = [
    'heaviness',
    'consumption_frequency',
    'normalized_price',
    'sell_frequency',
]

# Cast the categorical codes to str first: get_dummies leaves numeric columns
# untouched by default, so without the cast they would not be expanded.
item_categoricals_as_str = item_set_A_df[categorical_features].astype(str)
item_numericals = item_set_A_df[numerical_features]

# Final item matrix: one-hot columns first, then the numeric columns,
# aligned on the row index of the filtered frame.
item_features = pd.concat(
    [pd.get_dummies(item_categoricals_as_str), item_numericals],
    axis=1,
)

# Quick sanity check of the resulting matrix size.
print("Item features shape:", item_features.shape)

# ---------------------------------------------------------------------------
# User features
# ---------------------------------------------------------------------------
# The user table is a header-less, tab-separated file; column names are
# supplied here in file order.
user_data_path = 'data/sushi3-2016/sushi3.udata'
user_columns = [
    'user_id',
    'gender',
    'age',
    'total_time',
    'prefecture_longest',
    'region_longest',
    'east_west_longest',
    'prefecture_current',
    'region_current',
    'east_west_current',
    'prefecture_diff',
]
user_df = pd.read_csv(user_data_path, sep='\t', header=None, names=user_columns)

# Only these columns are turned into features; 'user_id', 'total_time' and
# 'prefecture_diff' are not part of the encoded matrix.
categorical_features_user = [
    'gender',
    'age',
    'prefecture_longest',
    'region_longest',
    'east_west_longest',
    'prefecture_current',
    'region_current',
    'east_west_current',
]

# Cast the categorical codes to str so get_dummies expands them.
# NOTE: this overwrites the columns inside user_df itself, so from here on
# these columns hold strings, not integers.
user_categoricals_as_str = user_df[categorical_features_user].astype(str)
user_df[categorical_features_user] = user_categoricals_as_str

# One indicator column per (feature, observed value) pair.
user_features = pd.get_dummies(user_df[categorical_features_user])

# Quick sanity check of the resulting matrix size.
print("User features shape:", user_features.shape)


Item features shape: (10, 16)
User features shape: (5000, 128)


# Carregando o arquivo de preferências e removendo as duas primeiras colunas de metadados

In [102]:
import pandas as pd

# ---------------------------------------------------------------------------
# Preference orders
# ---------------------------------------------------------------------------
# The ranking file is parsed by hand: after one header line, every row is a
# whitespace-separated list of integers. The first two fields of each row are
# treated as metadata and discarded; the remaining fields are the ranking.
preference_data_path = 'data/sushi3-2016/sushi3a.5000.10.order'
with open(preference_data_path, 'r') as file:
    lines = file.readlines()

# The first line of the file is metadata, not a ranking.
lines = lines[1:]

# Tokenise each row. Blank lines (e.g. stray empty lines at the end of the
# file) are skipped: they would otherwise become all-None rows and make the
# integer conversion below raise.
preference_data = [
    tokens
    for tokens in (line.split() for line in lines)
    if tokens
]

# Build the frame and convert every token to an integer in one step.
preference_df = pd.DataFrame(preference_data).astype(int)

# Drop the two leading metadata fields by position, then label the remaining
# columns pref_0 .. pref_{k-1} in rank order.
preference_df = preference_df.iloc[:, 2:]
preference_df.columns = [f'pref_{i}' for i in range(preference_df.shape[1])]

# Quick sanity check: one row per user, one column per ranked position.
print(preference_df.shape)

(5000, 10)
