<a href="https://colab.research.google.com/github/AbhijnaKalbhag/Mapping-Recommendations-to-A-star/blob/main/reccomendations_to_grid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162730 sha256=b7cfd88d4a7335e10cc84fb64b52605179f358eccbbd02d6342903a003a4bed5
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# Build a synthetic user x product ratings table, then fit Surprise's SVD on it.
np.random.seed(42)  # seed the global NumPy RNG before the random draw below

# Size of the synthetic catalogue
num_users = 100
num_products = 50

# Labels: User1..User100 and Product1..Product50
user_ids = [f'User{n}' for n in range(1, num_users + 1)]
product_ids = [f'Product{n}' for n in range(1, num_products + 1)]

# Every cell is drawn from the ratings 0-5 or NaN (NaN stands for "no rating")
rating_levels = [0, 1, 2, 3, 4, 5, np.nan]
ratings = np.random.choice(rating_levels, size=(num_users, num_products))

# Wide table: a leading 'User' column followed by one column per product
df = pd.DataFrame(ratings, columns=product_ids)
df.insert(0, 'User', user_ids)

print("Ratings Matrix:")
print(df)
print()

# Reshape to long (User, Item, Rating) rows and drop the unrated pairs before
# handing the table to Surprise
ratings_long = df.melt(id_vars=['User'], var_name='Item', value_name='Rating').dropna()
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_long, reader)

# Hold out 20% of the ratings for testing
trainset, testset = train_test_split(data, test_size=0.2)

# Fit SVD with its default settings on the training portion
algo = SVD()
algo.fit(trainset)

# Predictions for the held-out ratings.
# NOTE(review): these are stored but no accuracy metric is computed in this cell.
predictions = algo.test(testset)

# Example: estimated rating of a single (user, item) pair
user_id = 'User1'
item_id = 'Product1'
prediction = algo.predict(user_id, item_id)
print(f"Predicted rating for {user_id} on {item_id}: {prediction.est}")


Ratings Matrix:
       User  Product1  Product2  Product3  Product4  Product5  Product6  \
0     User1       NaN       3.0       4.0       NaN       2.0       4.0   
1     User2       4.0       2.0       NaN       4.0       0.0       NaN   
2     User3       5.0       NaN       3.0       0.0       5.0       4.0   
3     User4       3.0       5.0       2.0       2.0       0.0       2.0   
4     User5       5.0       0.0       4.0       5.0       3.0       3.0   
..      ...       ...       ...       ...       ...       ...       ...   
95   User96       3.0       1.0       3.0       2.0       5.0       4.0   
96   User97       NaN       3.0       NaN       3.0       1.0       3.0   
97   User98       5.0       1.0       1.0       0.0       1.0       4.0   
98   User99       3.0       5.0       0.0       3.0       2.0       3.0   
99  User100       5.0       4.0       5.0       3.0       0.0       4.0   

    Product7  Product8  Product9  ...  Product41  Product42  Product43  \
0        

In [None]:
import pandas as pd
import numpy as np

# Synthetic binary user-feature table (seeded so the draw is repeatable)
np.random.seed(42)

# Table dimensions
num_users = 100
num_features = 5  # example number of features

# Row labels User1..User100 and column labels Feature1..Feature5
user_ids = [f'User{n}' for n in range(1, num_users + 1)]
feature_names = [f'Feature{n}' for n in range(1, num_features + 1)]

# randint's upper bound is exclusive, so every entry is 0 or 1
feature_data = np.random.randint(0, 2, size=(num_users, num_features))

# Users on the index, one column per feature
user_feature_df = pd.DataFrame(feature_data, index=user_ids, columns=feature_names)

print("Users and Features Matrix:")
print(user_feature_df)


Users and Features Matrix:
         Feature1  Feature2  Feature3  Feature4  Feature5
User1           0         1         0         0         0
User2           1         0         0         0         1
User3           0         0         0         0         1
User4           0         1         1         1         0
User5           1         0         1         1         1
...           ...       ...       ...       ...       ...
User96          1         0         1         0         1
User97          1         1         0         1         0
User98          1         0         1         0         0
User99          1         0         0         1         0
User100         0         0         1         0         1

[100 rows x 5 columns]


In [None]:
import pandas as pd
import numpy as np

# Synthetic numeric product-attribute table (seeded so the draw is repeatable)
np.random.seed(42)

# Table dimensions
num_products = 50
num_attributes = 3  # example number of attributes

# Row labels Product1..Product50 and column labels Attribute1..Attribute3
product_ids = [f'Product{n}' for n in range(1, num_products + 1)]
attribute_names = [f'Attribute{n}' for n in range(1, num_attributes + 1)]

# Integer attributes in 0..99 (randint's upper bound is exclusive)
attribute_data = np.random.randint(0, 100, size=(num_products, num_attributes))

# Products on the index, one column per attribute
product_attribute_df = pd.DataFrame(attribute_data, index=product_ids, columns=attribute_names)

print("Products and Attributes Matrix:")
print(product_attribute_df)


Products and Attributes Matrix:
           Attribute1  Attribute2  Attribute3
Product1           51          92          14
Product2           71          60          20
Product3           82          86          74
Product4           74          87          99
Product5           23           2          21
Product6           52           1          87
Product7           29          37           1
Product8           63          59          20
Product9           32          75          57
Product10          21          88          48
Product11          90          58          41
Product12          91          59          79
Product13          14          61          61
Product14          46          61          50
Product15          54          63           2
Product16          50           6          20
Product17          72          38          17
Product18           3          88          59
Product19          13           8          89
Product20          52           1          83
Pr

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Three small hand-written tables used to demonstrate the merge below.

# One row per (user, product) pair with an example rating
user_product_df = pd.DataFrame(
    [
        ('User1', 'Product1', 5),
        ('User1', 'Product2', 4),
        ('User2', 'Product1', 3),
        ('User3', 'Product3', 2),
    ],
    columns=['User', 'Product', 'Rating'],
)

# One row per user with five example binary features
user_feature_df = pd.DataFrame(
    [
        ('User1', 1, 0, 1, 0, 1),
        ('User2', 0, 1, 1, 1, 0),
        ('User3', 1, 1, 0, 0, 0),
    ],
    columns=['User', 'Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'],
)

# One row per product with three example numerical attributes
product_attribute_df = pd.DataFrame(
    [
        ('Product1', 50, 80, 10),
        ('Product2', 30, 20, 90),
        ('Product3', 70, 50, 40),
    ],
    columns=['Product', 'Attribute1', 'Attribute2', 'Attribute3'],
)

# Attach the product attributes first (joined on 'Product'), then the user
# features (joined on 'User'), so each rating row carries both sets of columns
merged_df = (
    user_product_df
    .merge(product_attribute_df, on='Product')
    .merge(user_feature_df, on='User')
)

# Score one merged (user, product) row from its feature/attribute pairs
def calculate_rating(row):
    """Return an example rating for a single merged row.

    The score is the sum of Feature1*Attribute1, Feature2*Attribute2 and
    Feature3*Attribute3, divided by 100. Other columns of the row are ignored.

    Note that the division by 100 does not bound the result to [0, 1]: three
    attributes of 100 with all three features set to 1 give 3.0.

    Args:
        row: any mapping-like object indexable by the six column names above,
            for example a row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The weighted sum divided by 100.
    """
    # Example calculation (replace with your own method if needed)
    pairs = (
        ('Feature1', 'Attribute1'),
        ('Feature2', 'Attribute2'),
        ('Feature3', 'Attribute3'),
    )
    weighted_total = sum(row[feature] * row[attribute] for feature, attribute in pairs)
    return weighted_total / 100

# Overwrite the 'Rating' column with the score computed for each merged row
raw_scores = merged_df.apply(calculate_rating, axis=1)
merged_df['Rating'] = raw_scores

# Rescale the computed scores with MinMaxScaler, which expects a 2-D
# (n_samples, 1) array
scaler = MinMaxScaler()
score_column = merged_df['Rating'].values.reshape(-1, 1)
merged_df['Rating'] = scaler.fit_transform(score_column)

# Users on the rows, products on the columns.
# NOTE(review): fill_value=0 makes a (user, product) pair with no row
# indistinguishable from the pair that holds the minimum scaled rating (also 0).
user_product_matrix = merged_df.pivot_table(
    index='User',
    columns='Product',
    values='Rating',
    fill_value=0,
)

print("Normalized User-Product Matrix:")
print(user_product_matrix)


Normalized User-Product Matrix:
Product  Product1  Product2  Product3
User                                 
User1         0.0         1         0
User2         0.5         0         0
User3         0.0         0         1


In [None]:
import pandas as pd
import numpy as np

# Dimensions of the three synthetic tables
num_users = 100
num_products = 50
num_features = 5
num_attributes = 3

# Seed once; the three draws below then happen in a fixed order
# (features, attributes, ratings), which determines the generated values
np.random.seed(42)

# Labels: User1..User100 and Product1..Product50
user_ids = [f'User{n}' for n in range(1, num_users + 1)]
product_ids = [f'Product{n}' for n in range(1, num_products + 1)]

# Users x features, binary entries (randint's upper bound is exclusive)
feature_names = [f'Feature{n}' for n in range(1, num_features + 1)]
feature_data = np.random.randint(0, 2, size=(num_users, num_features))
user_feature_df = pd.DataFrame(feature_data, index=user_ids, columns=feature_names)

# Products x attributes, integer entries in 0..99
attribute_names = [f'Attribute{n}' for n in range(1, num_attributes + 1)]
attribute_data = np.random.randint(0, 100, size=(num_products, num_attributes))
product_attribute_df = pd.DataFrame(attribute_data, index=product_ids, columns=attribute_names)

# Users x products, ratings 0-5 with NaN standing for "no rating"
rating_levels = [0, 1, 2, 3, 4, 5, np.nan]
ratings = np.random.choice(rating_levels, size=(num_users, num_products))
user_product_df = pd.DataFrame(ratings, index=user_ids, columns=product_ids)

# Show the first rows of each table
previews = (
    ("Users and Products DataFrame:", user_product_df),
    ("\nUsers and Features DataFrame:", user_feature_df),
    ("\nProducts and Attributes DataFrame:", product_attribute_df),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())


Users and Products DataFrame:
       Product1  Product2  Product3  Product4  Product5  Product6  Product7  \
User1       1.0       2.0       1.0       0.0       4.0       4.0       3.0   
User2       0.0       NaN       3.0       1.0       4.0       1.0       4.0   
User3       0.0       2.0       3.0       0.0       NaN       3.0       5.0   
User4       5.0       4.0       1.0       0.0       1.0       2.0       1.0   
User5       NaN       0.0       2.0       4.0       3.0       5.0       0.0   

       Product8  Product9  Product10  ...  Product41  Product42  Product43  \
User1       NaN       1.0        0.0  ...        NaN        0.0        1.0   
User2       2.0       4.0        3.0  ...        3.0        0.0        5.0   
User3       NaN       3.0        2.0  ...        NaN        3.0        0.0   
User4       1.0       4.0        4.0  ...        4.0        1.0        3.0   
User5       NaN       3.0        0.0  ...        4.0        5.0        5.0   

       Product44  Product4

In [None]:
import pandas as pd
import numpy as np

# Table dimensions
num_users = 100
num_features = 5

# Seed the global NumPy RNG so the draw is repeatable
np.random.seed(42)

# Row labels User1..User100 and column labels Feature1..Feature5
user_ids = [f'User{n}' for n in range(1, num_users + 1)]
feature_names = [f'Feature{n}' for n in range(1, num_features + 1)]

# Integer features in 18..64 (an age-like range; randint's upper bound is
# exclusive). All five columns share this range.
feature_data = np.random.randint(18, 65, size=(num_users, num_features))
user_feature_df = pd.DataFrame(feature_data, index=user_ids, columns=feature_names)

print("Users and Features DataFrame:")
print(user_feature_df)


Users and Features DataFrame:
         Feature1  Feature2  Feature3  Feature4  Feature5
User1          56        46        32        60        25
User2          38        56        36        40        28
User3          28        41        53        57        41
User4          20        39        19        41        61
User5          47        55        19        38        50
...           ...       ...       ...       ...       ...
User96         40        27        22        53        51
User97         48        27        36        49        18
User98         22        62        21        33        41
User99         33        19        45        49        44
User100        37        41        29        52        50

[100 rows x 5 columns]


In [None]:
import pandas as pd
import numpy as np

# Dimensions (num_features / num_attributes are declared but not used below)
num_users = 100
num_products = 10
num_features = 5
num_attributes = 3

# Seed once; every draw below happens in a fixed order, which determines the
# generated values
np.random.seed(42)

# Labels: User1..User100 and Product1..Product10
user_ids = [f'User{n}' for n in range(1, num_users + 1)]
product_ids = [f'Product{n}' for n in range(1, num_products + 1)]

# User features: three categorical columns plus a numeric age in 18..64
gender = np.random.choice(['Male', 'Female'], size=num_users)
age = np.random.randint(18, 65, size=num_users)
location = np.random.choice(['Urban', 'Suburban', 'Rural'], size=num_users)
interest = np.random.choice(['Fashion', 'Technology', 'Food', 'Travel'], size=num_users)
user_features = {'Gender': gender, 'Age': age, 'Location': location, 'Interest': interest}
user_feature_df = pd.DataFrame(user_features, index=user_ids)

# Product attributes: two categorical columns plus a numeric price in 10..999
category = np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], size=num_products)
brand = np.random.choice(['Brand1', 'Brand2', 'Brand3'], size=num_products)
price = np.random.randint(10, 1000, size=num_products)
product_attributes = {'Category': category, 'Brand': brand, 'Price': price}
product_attribute_df = pd.DataFrame(product_attributes, index=product_ids)

# Users x products, ratings 0-5 with NaN standing for "no rating"
rating_levels = [0, 1, 2, 3, 4, 5, np.nan]
ratings = np.random.choice(rating_levels, size=(num_users, num_products))
user_product_df = pd.DataFrame(ratings, index=user_ids, columns=product_ids)

# Show the first rows of each table
previews = (
    ("Users and Products DataFrame:", user_product_df),
    ("\nUsers and Features DataFrame:", user_feature_df),
    ("\nProducts and Attributes DataFrame:", product_attribute_df),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())


Users and Products DataFrame:
       Product1  Product2  Product3  Product4  Product5  Product6  Product7  \
User1       NaN       0.0       2.0       1.0       0.0       1.0       NaN   
User2       5.0       1.0       NaN       5.0       NaN       1.0       NaN   
User3       1.0       0.0       0.0       0.0       2.0       5.0       NaN   
User4       1.0       2.0       1.0       0.0       4.0       3.0       1.0   
User5       4.0       3.0       0.0       5.0       5.0       3.0       2.0   

       Product8  Product9  Product10  
User1       5.0       1.0        2.0  
User2       2.0       1.0        1.0  
User3       4.0       1.0        NaN  
User4       NaN       0.0        3.0  
User5       NaN       NaN        3.0  

Users and Features DataFrame:
       Gender  Age  Location    Interest
User1    Male   35  Suburban        Food
User2  Female   43     Urban        Food
User3    Male   61     Urban      Travel
User4    Male   51  Suburban      Travel
User5    Male   27  Subur

In [None]:
import pandas as pd
import numpy as np

# Define number of users, products, features, and attributes
# (num_features / num_attributes are declared but not used below)
num_users = 100
num_products = 10
num_features = 5
num_attributes = 3

# Seed the global NumPy RNG so every draw below is repeatable
np.random.seed(42)  # for reproducibility

# Generate user IDs
user_ids = ['User' + str(i) for i in range(1, num_users + 1)]

# Generate product IDs
product_ids = ['Product' + str(i) for i in range(1, num_products + 1)]

# Generate user features with a combination of categorical and numerical values
user_features = {
    'Gender': np.random.choice(['Male', 'Female'], size=num_users),
    'Age': np.random.randint(18, 65, size=num_users),
    'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], size=num_users),
    'Interest': np.random.choice(['Fashion', 'Technology', 'Food', 'Travel'], size=num_users)
}
user_feature_df = pd.DataFrame(user_features, index=user_ids)

# Generate product attributes (two categorical columns and a numeric price)
product_attributes = {
    'Category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], size=num_products),
    'Brand': np.random.choice(['Brand1', 'Brand2', 'Brand3'], size=num_products),
    'Price': np.random.randint(10, 1000, size=num_products)
}
product_attribute_df = pd.DataFrame(product_attributes, index=product_ids)

# Generate random ratings matrix for user-product interactions (NaN = no rating)
ratings = np.random.choice([0, 1, 2, 3, 4, 5, np.nan], size=(num_users, num_products))
user_product_df = pd.DataFrame(ratings, columns=product_ids, index=user_ids)

# Print the generated dataframes
print("Users and Products DataFrame:")
print(user_product_df.head())

print("\nUsers and Features DataFrame:")
print(user_feature_df.head())

print("\nProducts and Attributes DataFrame:")
print(product_attribute_df.head())

# Min-max scale every product column to [0, 1]. DataFrame.min/max skip NaN, so
# unrated cells stay NaN; a column whose ratings are all equal becomes NaN (0/0).
column_min = user_product_df.min()
column_range = user_product_df.max() - column_min
user_item_interactions_df = (user_product_df - column_min) / column_range

# Weight applied to every normalised interaction
interaction_weight = 0.5

# Calculate item-user association (users x items, scaled by the weight)
item_user_association = user_item_interactions_df * interaction_weight

# Create the value grid: items on the rows, users on the columns. A transpose
# keeps the float dtype, where filling an empty frame row by row would leave
# every column as dtype object.
value_grid = item_user_association.T
print("\n*\n")
print("Value Grid \n")
print("*\n")
# Display the value grid
print(value_grid)

Users and Products DataFrame:
       Product1  Product2  Product3  Product4  Product5  Product6  Product7  \
User1       NaN       0.0       2.0       1.0       0.0       1.0       NaN   
User2       5.0       1.0       NaN       5.0       NaN       1.0       NaN   
User3       1.0       0.0       0.0       0.0       2.0       5.0       NaN   
User4       1.0       2.0       1.0       0.0       4.0       3.0       1.0   
User5       4.0       3.0       0.0       5.0       5.0       3.0       2.0   

       Product8  Product9  Product10  
User1       5.0       1.0        2.0  
User2       2.0       1.0        1.0  
User3       4.0       1.0        NaN  
User4       NaN       0.0        3.0  
User5       NaN       NaN        3.0  

Users and Features DataFrame:
       Gender  Age  Location    Interest
User1    Male   35  Suburban        Food
User2  Female   43     Urban        Food
User3    Male   61     Urban      Travel
User4    Male   51  Suburban      Travel
User5    Male   27  Subur

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Small hand-entered ratings table; None marks a missing rating
sample_ratings = {
    "User": ["User1", "User2", "User3", "User4", "User5"],
    "Product1": [None, 5, 1, 1, 4],
    "Product2": [0, 1, 0, 2, 3],
    "Product3": [2, None, 0, 1, 0],
    # Add other product columns...
}

# Missing ratings are treated as 0 for the similarity computation
users_products = pd.DataFrame(sample_ratings).fillna(0)

# Pairwise cosine similarity between the users' rating vectors
# (the 'User' label column is left out of the vectors)
rating_vectors = users_products.drop(columns="User")
cosine_sim = cosine_similarity(rating_vectors)

# Label both axes of the square similarity matrix with the user names
user_labels = users_products['User']
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_labels, columns=user_labels)

print(cosine_sim_df)


User      User1     User2     User3     User4     User5
User                                                   
User1  1.000000  0.000000  0.000000  0.408248  0.000000
User2  0.000000  1.000000  0.980581  0.560449  0.902134
User3  0.000000  0.980581  1.000000  0.408248  0.800000
User4  0.408248  0.560449  0.408248  1.000000  0.816497
User5  0.000000  0.902134  0.800000  0.816497  1.000000


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Sizes of the small demo tables
sample_users = 5
sample_products = 10

# Seed the global NumPy RNG so the random tables below are reproducible
np.random.seed(42)

# Shared labels: User1..User5 and Product1..Product10
user_labels = [f"User{i}" for i in range(1, sample_users + 1)]
product_labels = [f"Product{i}" for i in range(1, sample_products + 1)]

# Generate random data for Users and Products (integer ratings 0..5, no missing values)
users_products_df = pd.DataFrame(
    np.random.randint(0, 6, size=(sample_users, sample_products)),
    columns=product_labels,
)
users_products_df.insert(0, "User", user_labels)

# Generate random data for Users and Features
users_features_df = pd.DataFrame({
    "User": user_labels,
    "Gender": np.random.choice(["Male", "Female"], size=sample_users),
    "Age": np.random.randint(20, 70, size=sample_users),
    "Location": np.random.choice(["Urban", "Suburban", "Rural"], size=sample_users),
    "Interest": np.random.choice(["Food", "Travel", "Technology"], size=sample_users)
})

# Generate random data for Products and Attributes: one row for each product
# column of users_products_df, so the two tables describe the same products
products_attributes_df = pd.DataFrame({
    "Product": product_labels,
    "Category": np.random.choice(["Electronics", "Books", "Food", "Clothing"], size=sample_products),
    "Brand": np.random.choice(["Brand1", "Brand2", "Brand3"], size=sample_products),
    "Price": np.random.randint(50, 1000, size=sample_products)
})

# Merge Users and Products DataFrame with Users and Features DataFrame
merged_df = pd.merge(users_products_df, users_features_df, on="User")

# Replace each categorical column with integer codes; the one encoder instance
# is re-fitted for every column in turn
label_encoder = LabelEncoder()
for categorical_column in ('Gender', 'Location', 'Interest'):
    merged_df[categorical_column] = label_encoder.fit_transform(merged_df[categorical_column])

# Mean-impute the numeric columns; the 'User' label column is left out first
imputer = SimpleImputer(strategy='mean')
merged_df_numeric = merged_df.drop(columns=['User'])
imputed_values = imputer.fit_transform(merged_df_numeric)
merged_df_imputed = pd.DataFrame(imputed_values, columns=merged_df_numeric.columns)

# Pairwise cosine similarity between the users' rows.
# NOTE(review): the columns are not rescaled first, so larger-magnitude columns
# weigh more in the similarity.
customer_similarity = cosine_similarity(merged_df_imputed)

# Label both axes of the square similarity matrix with the user names
user_labels = merged_df['User']
customer_similarity_df = pd.DataFrame(customer_similarity, index=user_labels, columns=user_labels)

print(customer_similarity_df)


User      User1     User2     User3     User4     User5
User                                                   
User1  1.000000  0.956002  0.940127  0.978957  0.979518
User2  0.956002  1.000000  0.979891  0.963774  0.943902
User3  0.940127  0.979891  1.000000  0.958773  0.939586
User4  0.978957  0.963774  0.958773  1.000000  0.986245
User5  0.979518  0.943902  0.939586  0.986245  1.000000


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Define the number of users and products
num_users = 100
num_products = 10

# Seed the global NumPy RNG so every draw below is repeatable
np.random.seed(42)  # for reproducibility

# Generate user IDs
user_ids = ['User' + str(i) for i in range(1, num_users + 1)]

# Generate product IDs
product_ids = ['Product' + str(i) for i in range(1, num_products + 1)]

# Generate random ratings matrix for user-product interactions (NaN = no rating)
ratings = np.random.choice([0, 1, 2, 3, 4, 5, np.nan], size=(num_users, num_products))

# Convert ratings into a DataFrame (users on the rows, products on the columns)
ratings_df = pd.DataFrame(ratings, columns=product_ids, index=user_ids)

# Generate product attributes (two categorical columns and a numeric price)
product_attributes = {
    'Category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], size=num_products),
    'Brand': np.random.choice(['Brand1', 'Brand2', 'Brand3'], size=num_products),
    'Price': np.random.randint(10, 1000, size=num_products)
}

# Convert product attributes into a DataFrame (products on the rows)
product_attributes_df = pd.DataFrame(product_attributes, index=product_ids)

# Step 1: Calculate each user's average rating per category.
# ratings_df is indexed by user and product_attributes_df by product, so the
# ratings are transposed first to put products on the index; joining the two
# frames on their original indexes matches no rows and gives an empty result.
category_ratings = (
    ratings_df.T
    .join(product_attributes_df['Category'])
    .groupby('Category')
    .mean()
)

# Step 2: Map each product to its corresponding category
product_to_category = product_attributes_df['Category'].to_dict()

# Step 3: For each user, take the single highest-rated product (idxmax returns
# the first one on ties) and assign that product's category to the user.
# Users with no rating at all are excluded from idxmax and end up with NaN.
rated_users = ratings_df.dropna(how='all')
highest_rated_product = rated_users.idxmax(axis=1)
user_categories = highest_rated_product.map(product_to_category).reindex(ratings_df.index)
user_category_mapping = user_categories.to_dict()

# Step 4: Create a DataFrame of users and their corresponding mapped categories
user_category_df = user_categories.rename_axis('User').reset_index(name='Mapped_Category')

# Display the DataFrame
print("DataFrame of users and their corresponding mapped categories:")
print(user_category_df)


DataFrame of users and their corresponding mapped categories:
       User Mapped_Category
0     User1            Food
1     User2           Books
2     User3            Food
3     User4            Food
4     User5            Food
..      ...             ...
95   User96     Electronics
96   User97        Clothing
97   User98     Electronics
98   User99            Food
99  User100     Electronics

[100 rows x 2 columns]


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Define the number of users and products
num_users = 100
num_products = 10

# Seed the global NumPy RNG so every draw below is repeatable
np.random.seed(42)  # for reproducibility

# Generate user IDs
user_ids = ['User' + str(i) for i in range(1, num_users + 1)]

# Generate product IDs
product_ids = ['Product' + str(i) for i in range(1, num_products + 1)]

# Generate random ratings matrix for user-product interactions (NaN = no rating)
ratings = np.random.choice([0, 1, 2, 3, 4, 5, np.nan], size=(num_users, num_products))

# Convert ratings into a DataFrame (users on the rows, products on the columns)
ratings_df = pd.DataFrame(ratings, columns=product_ids, index=user_ids)

# Print the ratings DataFrame
print("Ratings DataFrame:")
print(ratings_df)

# Generate product attributes (two categorical columns and a numeric price)
product_attributes = {
    'Category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], size=num_products),
    'Brand': np.random.choice(['Brand1', 'Brand2', 'Brand3'], size=num_products),
    'Price': np.random.randint(10, 1000, size=num_products)
}

# Convert product attributes into a DataFrame (products on the rows)
product_attributes_df = pd.DataFrame(product_attributes, index=product_ids)

# Print the product attributes DataFrame
print("\nProduct Attributes DataFrame:")
print(product_attributes_df)

# Step 1: Calculate each user's average rating per category.
# ratings_df is indexed by user and product_attributes_df by product, so the
# ratings are transposed first to put products on the index; joining the two
# frames on their original indexes matches no rows and gives an empty result.
category_ratings = (
    ratings_df.T
    .join(product_attributes_df['Category'])
    .groupby('Category')
    .mean()
)

# Print the category ratings DataFrame
print("\nCategory Ratings DataFrame:")
print(category_ratings)

# Step 2: Map each product to its corresponding category
product_to_category = product_attributes_df['Category'].to_dict()

# Step 3: For each user, take the single highest-rated product (idxmax returns
# the first one on ties) and assign that product's category to the user.
# Users with no rating at all are excluded from idxmax and end up with NaN.
rated_users = ratings_df.dropna(how='all')
highest_rated_product = rated_users.idxmax(axis=1)
user_categories = highest_rated_product.map(product_to_category).reindex(ratings_df.index)
user_category_mapping = user_categories.to_dict()

# Step 4: Create a DataFrame of users and their corresponding mapped categories
user_category_df = user_categories.rename_axis('User').reset_index(name='Mapped_Category')

# Print the user category DataFrame
print("\nUser Mapped Category DataFrame:")
print(user_category_df)


Ratings DataFrame:
         Product1  Product2  Product3  Product4  Product5  Product6  Product7  \
User1         NaN       3.0       4.0       NaN       2.0       4.0       4.0   
User2         NaN       2.0       2.0       4.0       3.0       2.0       5.0   
User3         5.0       5.0       1.0       3.0       4.0       0.0       3.0   
User4         3.0       0.0       0.0       2.0       2.0       NaN       1.0   
User5         5.0       5.0       NaN       5.0       2.0       3.0       NaN   
...           ...       ...       ...       ...       ...       ...       ...   
User96        NaN       1.0       1.0       1.0       5.0       5.0       2.0   
User97        0.0       5.0       3.0       4.0       5.0       5.0       2.0   
User98        NaN       NaN       NaN       4.0       4.0       5.0       1.0   
User99        5.0       3.0       1.0       5.0       1.0       1.0       1.0   
User100       2.0       NaN       1.0       3.0       0.0       5.0       0.0   

        

In [None]:
import pandas as pd
import numpy as np

# Define number of users, products, features, and attributes
# (num_features / num_attributes are declared but not used below)
num_users = 100
num_products = 10
num_features = 5
num_attributes = 3

# Seed the global NumPy RNG so every draw below is repeatable
np.random.seed(42)  # for reproducibility

# Generate user IDs
user_ids = ['User' + str(i) for i in range(1, num_users + 1)]

# Generate product IDs
product_ids = ['Product' + str(i) for i in range(1, num_products + 1)]

# Generate user features with a combination of categorical and numerical values
user_features = {
    'Gender': np.random.choice(['Male', 'Female'], size=num_users),
    'Age': np.random.randint(18, 65, size=num_users),
    'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], size=num_users),
    'Interest': np.random.choice(['Fashion', 'Technology', 'Food', 'Travel'], size=num_users)
}
user_feature_df = pd.DataFrame(user_features, index=user_ids)

# Generate product attributes (two categorical columns and a numeric price)
product_attributes = {
    'Category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], size=num_products),
    'Brand': np.random.choice(['Brand1', 'Brand2', 'Brand3'], size=num_products),
    'Price': np.random.randint(10, 1000, size=num_products)
}
product_attribute_df = pd.DataFrame(product_attributes, index=product_ids)

# Generate random ratings matrix for user-product interactions (NaN = no rating)
ratings = np.random.choice([0, 1, 2, 3, 4, 5, np.nan], size=(num_users, num_products))
user_product_df = pd.DataFrame(ratings, columns=product_ids, index=user_ids)

# Print the generated dataframes
print("Users and Products DataFrame:")
print(user_product_df.head())

print("\nUsers and Features DataFrame:")
print(user_feature_df.head())

print("\nProducts and Attributes DataFrame:")
print(product_attribute_df.head())

# Step 1: Identify the highest-rated product for each user.
# idxmax yields exactly one product per user (the first one on ties). Users with
# no rating at all are left out of idxmax and re-added as NaN by the reindex.
rated_users = user_product_df.dropna(how='all')
highest_rated_products = rated_users.idxmax(axis=1).reindex(user_product_df.index)

# Step 2: Look up the category of that product for each user. With a single
# product per user, the "most common" category is simply that product's
# category, so a direct lookup replaces the per-user mode computation.
most_likely_category = highest_rated_products.map(product_attribute_df['Category'])
user_most_common_category = most_likely_category.to_dict()

# Step 3: Create a DataFrame for each user and their most likely category
user_category_df = most_likely_category.to_frame(name='Most_Likely_Category')

# Display the DataFrame
print("DataFrame for each user and their most likely category:")
print(user_category_df)


Users and Products DataFrame:
       Product1  Product2  Product3  Product4  Product5  Product6  Product7  \
User1       NaN       0.0       2.0       1.0       0.0       1.0       NaN   
User2       5.0       1.0       NaN       5.0       NaN       1.0       NaN   
User3       1.0       0.0       0.0       0.0       2.0       5.0       NaN   
User4       1.0       2.0       1.0       0.0       4.0       3.0       1.0   
User5       4.0       3.0       0.0       5.0       5.0       3.0       2.0   

       Product8  Product9  Product10  
User1       5.0       1.0        2.0  
User2       2.0       1.0        1.0  
User3       4.0       1.0        NaN  
User4       NaN       0.0        3.0  
User5       NaN       NaN        3.0  

Users and Features DataFrame:
       Gender  Age  Location    Interest
User1    Male   35  Suburban        Food
User2  Female   43     Urban        Food
User3    Male   61     Urban      Travel
User4    Male   51  Suburban      Travel
User5    Male   27  Subur

In [None]:
# Function to get the top categories (three by default) for a given user
def get_top_categories(user_id, n=3):
    """Return the categories of a user's ``n`` highest-rated products.

    Reads the notebook-level ``user_product_df`` (users x products ratings) and
    ``product_attribute_df`` (products with a 'Category' column).

    The ``n`` highest-rated products of the user are selected, their categories
    are counted, and the category labels are returned in ``value_counts`` order
    (most frequent first). Because the categories come from at most ``n``
    products, the result can hold fewer than ``n`` entries.

    Args:
        user_id: row label of the user in ``user_product_df``.
        n: number of top products to consider and maximum number of categories
            to return. Defaults to 3.

    Returns:
        A pandas Index of category labels, at most ``n`` long.

    Raises:
        KeyError: if ``user_id`` is not a row label of ``user_product_df``.
    """
    highest_rated_products = user_product_df.loc[user_id].nlargest(n).index
    category_counts = product_attribute_df.loc[highest_rated_products, 'Category'].value_counts()
    return category_counts.index[:n]

# Print one summary line per user with that user's top categories
for user_id in user_ids:
    top_categories = get_top_categories(user_id)
    joined_categories = ', '.join(top_categories)
    print(f"Top 3 categories for {user_id}: {joined_categories}")


Top 3 categories for User1: Books, Electronics
Top 3 categories for User2: Books, Electronics
Top 3 categories for User3: Books, Food
Top 3 categories for User4: Books, Food
Top 3 categories for User5: Books, Food, Electronics
Top 3 categories for User6: Books, Food
Top 3 categories for User7: Books, Food, Electronics
Top 3 categories for User8: Books, Electronics
Top 3 categories for User9: Books, Electronics
Top 3 categories for User10: Electronics, Food
Top 3 categories for User11: Books, Food
Top 3 categories for User12: Books, Electronics
Top 3 categories for User13: Electronics, Books
Top 3 categories for User14: Books, Food
Top 3 categories for User15: Food, Books
Top 3 categories for User16: Food, Books, Electronics
Top 3 categories for User17: Electronics, Books, Food
Top 3 categories for User18: Books, Food
Top 3 categories for User19: Electronics, Books
Top 3 categories for User20: Books, Electronics
Top 3 categories for User21: Books, Electronics
Top 3 categories for User22