In [1]:
# import pandas as pd

# # Load the dataset (change the separator depending on your file format)
# data = pd.read_csv('Electronics.txt', sep='\t', header=None)

# # Display the first 20 rows
# data.head(50)

import pandas as pd
import numpy as np
def parse_file(filename):
    data = []
    entry = {}

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # Empty line indicates end of one review
                if entry: 
                    data.append(entry)
                    entry = {}
            else:
                parts = line.split(': ', 1)
                if len(parts) == 2:
                    key, value = parts
                    entry[key] = value

    if entry:  
        data.append(entry)  # Add the last entry

    return pd.DataFrame(data)

# Load the data into a DataFrame
df = parse_file('electronics.txt')


In [2]:
# Step 1: Compute b_ij (Helpfulness Score)
def calculate_b_ij(helpfulness):
    try:
        helpful_votes, total_votes = map(int, helpfulness.split('/'))
        return (helpful_votes ** 2) / total_votes if total_votes > 0 else 0
    except ValueError:
        return 0

df['b_ij'] = df['review/helpfulness'].apply(calculate_b_ij)

# Compute sum of b_xj for each product (i.e., sum of helpfulness scores for all reviews of the same product)
df['sum_b_xj'] = df.groupby('product/productId')['b_ij'].transform('sum')

# Compute h_ij using the correct summation
df['h_ij'] = np.where(df['sum_b_xj'] > 0, df['b_ij'] / df['sum_b_xj'], 0)

# Step 2: Rank reviews by recency (most recent first)
df['rank'] = df.groupby('product/productId')['review/time'].rank(ascending=False)

# Step 3: Compute z_ij efficiently
df['z_ij'] = 1 / (df['rank'] ** 2)

# Step 4: Normalize most_ij
sum_z_xj = df.groupby('product/productId', as_index=False)['z_ij'].agg('sum').rename(columns={'z_ij': 'sum_z'})
df = df.merge(sum_z_xj, on='product/productId')
df['most_ij'] = df['z_ij'] / df['sum_z']

# Step 5: Compute q_ij, avoiding division by zero
# df['q_ij'] = np.where(df['b_ij'] > 0, (1 / df['b_ij']**2) * (len(df) - df['rank']), 0)
n_reviews_per_product = df.groupby('product/productId')['product/productId'].transform('count')
df['q_ij'] = np.where(df['b_ij'] > 0, (1 / df['b_ij']**2) * (n_reviews_per_product - df['rank']), 0)


# Step 6: Normalize top_ij
sum_q_xj = df.groupby('product/productId', as_index=False)['q_ij'].agg('sum').rename(columns={'q_ij': 'sum_q'})
df = df.merge(sum_q_xj, on='product/productId')
df['top_ij'] = df['q_ij'] / df['sum_q']

# Step 7: Combine scores with alpha
alpha = 0.5
df['d_ij'] = alpha * df['top_ij'] + (1 - alpha) * df['most_ij']

# Step 8: Calculate rh_ij
df['rh_ij'] = (df['d_ij'] + df['h_ij'])/2

# Display results
print(df[['product/title','product/productId', 'b_ij', 'z_ij', 'most_ij', 'q_ij', 'top_ij', 'd_ij','rh_ij']].head(30))

                                        product/title product/productId  \
0   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
1   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
2   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
3   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
4   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
5   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
6   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
7   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
8   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
9   Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
10  Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
11  Kodak Max K2000 Battery Charger with 4 NiMH AA...        B0000630MQ   
12  Kodak Max K2000 Batte

In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

# **Step 1: Filter Active Users**
user_interaction_counts = df['review/userId'].value_counts()
active_users = user_interaction_counts[user_interaction_counts >= 3].index

df = df[df['review/userId'].isin(active_users)]  # Keep only active users

# Encode users & items
users = df['review/userId'].unique()
items = df['product/productId'].unique()

user_map = {user: i for i, user in enumerate(users)}
item_map = {item: i for i, item in enumerate(items)}

num_users = len(users)
num_items = len(items)

# **Fix: More stable initialization**
def train_svd_sgd(df, num_users, num_items, k=50, lr=0.01, reg=0.1, epochs=20):
    """Train an SVD model using SGD with stability fixes"""
    
    # Initialize latent factors with better scaling
    U = np.random.normal(scale=1./np.sqrt(k), size=(num_users, k))
    V = np.random.normal(scale=1./np.sqrt(k), size=(num_items, k))

    for epoch in range(epochs):
        total_loss = 0
        count = 0  # Track valid updates

        with tqdm(total=len(df), desc=f"Epoch {epoch+1}/{epochs}") as pbar:
            for _, row in df.iterrows():
                i = user_map.get(row['review/userId'])
                j = item_map.get(row['product/productId'])
                true_rating = row['rh_ij']

                # **Fix: Skip invalid data**
                if i is None or j is None or np.isnan(true_rating):
                    continue

                # Predict rating
                pred = np.dot(U[i, :], V[j, :].T)

                # **Fix: Skip NaN or Inf predictions**
                if np.isnan(pred) or np.isinf(pred):
                    continue

                error = true_rating - pred

                # **Fix: More stable update**
                U[i, :] += lr * (error * V[j, :] - reg * U[i, :])
                V[j, :] += lr * (error * U[i, :] - reg * V[j, :])

                total_loss += error**2
                count += 1
                pbar.update(1)

        # **Fix: Prevent NaN RMSE**
        rmse = np.sqrt(total_loss / count) if count > 0 else float('nan')

        print(f"Epoch {epoch+1}: RMSE = {rmse:.4f}")

    return U, V

# **Fix: Drop missing values**
df = df.dropna(subset=['rh_ij'])

# Train SVD
U, V = train_svd_sgd(df, num_users, num_items, k=50, epochs=50)

Epoch 1/50: 100%|███████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 13030.09it/s]


Epoch 1: RMSE = 0.1660


Epoch 2/50: 100%|███████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12875.31it/s]


Epoch 2: RMSE = 0.1560


Epoch 3/50: 100%|███████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12609.70it/s]


Epoch 3: RMSE = 0.1489


Epoch 4/50: 100%|███████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12668.21it/s]


Epoch 4: RMSE = 0.1431


Epoch 5/50: 100%|███████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12939.79it/s]


Epoch 5: RMSE = 0.1381


Epoch 6/50: 100%|███████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 13056.68it/s]


Epoch 6: RMSE = 0.1338


Epoch 7/50: 100%|███████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12609.53it/s]


Epoch 7: RMSE = 0.1300


Epoch 8/50: 100%|███████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12767.20it/s]


Epoch 8: RMSE = 0.1265


Epoch 9/50: 100%|███████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12638.79it/s]


Epoch 9: RMSE = 0.1233


Epoch 10/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12892.76it/s]


Epoch 10: RMSE = 0.1205


Epoch 11/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12706.36it/s]


Epoch 11: RMSE = 0.1178


Epoch 12/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12912.57it/s]


Epoch 12: RMSE = 0.1154


Epoch 13/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12850.13it/s]


Epoch 13: RMSE = 0.1131


Epoch 14/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12575.38it/s]


Epoch 14: RMSE = 0.1110


Epoch 15/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12771.06it/s]


Epoch 15: RMSE = 0.1090


Epoch 16/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 13061.49it/s]


Epoch 16: RMSE = 0.1072


Epoch 17/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12822.28it/s]


Epoch 17: RMSE = 0.1054


Epoch 18/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12880.15it/s]


Epoch 18: RMSE = 0.1038


Epoch 19/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12802.65it/s]


Epoch 19: RMSE = 0.1023


Epoch 20/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12767.89it/s]


Epoch 20: RMSE = 0.1008


Epoch 21/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12791.83it/s]


Epoch 21: RMSE = 0.0995


Epoch 22/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12740.57it/s]


Epoch 22: RMSE = 0.0982


Epoch 23/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12784.28it/s]


Epoch 23: RMSE = 0.0970


Epoch 24/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12604.68it/s]


Epoch 24: RMSE = 0.0958


Epoch 25/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12458.62it/s]


Epoch 25: RMSE = 0.0947


Epoch 26/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12869.49it/s]


Epoch 26: RMSE = 0.0937


Epoch 27/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12888.57it/s]


Epoch 27: RMSE = 0.0927


Epoch 28/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12896.33it/s]


Epoch 28: RMSE = 0.0917


Epoch 29/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12692.97it/s]


Epoch 29: RMSE = 0.0909


Epoch 30/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12978.25it/s]


Epoch 30: RMSE = 0.0900


Epoch 31/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12823.23it/s]


Epoch 31: RMSE = 0.0892


Epoch 32/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12919.45it/s]


Epoch 32: RMSE = 0.0884


Epoch 33/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12815.69it/s]


Epoch 33: RMSE = 0.0877


Epoch 34/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 13003.99it/s]


Epoch 34: RMSE = 0.0870


Epoch 35/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12696.09it/s]


Epoch 35: RMSE = 0.0864


Epoch 36/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12661.46it/s]


Epoch 36: RMSE = 0.0857


Epoch 37/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12791.99it/s]


Epoch 37: RMSE = 0.0851


Epoch 38/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12743.99it/s]


Epoch 38: RMSE = 0.0845


Epoch 39/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12732.14it/s]


Epoch 39: RMSE = 0.0840


Epoch 40/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12857.04it/s]


Epoch 40: RMSE = 0.0835


Epoch 41/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12960.96it/s]


Epoch 41: RMSE = 0.0830


Epoch 42/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:33<00:00, 11184.93it/s]


Epoch 42: RMSE = 0.0825


Epoch 43/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12726.44it/s]


Epoch 43: RMSE = 0.0820


Epoch 44/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12985.14it/s]


Epoch 44: RMSE = 0.0816


Epoch 45/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12582.93it/s]


Epoch 45: RMSE = 0.0812


Epoch 46/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12585.43it/s]


Epoch 46: RMSE = 0.0808


Epoch 47/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12719.00it/s]


Epoch 47: RMSE = 0.0804


Epoch 48/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12708.17it/s]


Epoch 48: RMSE = 0.0800


Epoch 49/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:29<00:00, 12646.52it/s]


Epoch 49: RMSE = 0.0797


Epoch 50/50: 100%|██████████████████████████████████████████████████████████| 372260/372260 [00:28<00:00, 12841.77it/s]

Epoch 50: RMSE = 0.0794





In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Reverse item mapping for fast lookup
rev_item_map = {v: k for k, v in item_map.items()}

def compare_actual_vs_predicted(U, V, df, user_map, rev_item_map, k=10, sample_size=5000):
    """Compare actual vs. predicted recommendations and compute Hit Ratio @K (with Sampling)"""
    
    # 🚀 **Randomly sample users for faster computation**
    sampled_users = np.random.choice(list(user_map.keys()), size=min(sample_size, len(user_map)), replace=False)

    hits = 0
    total_users = 0

    for user in tqdm(sampled_users, desc="Computing Hit Ratio"):
        i = user_map[user]

        # Get actual top-K items based on rh_ij
        user_interactions = df[df['review/userId'] == user]
        actual_items = set(user_interactions.nlargest(k, 'rh_ij')['product/productId'])

        if len(actual_items) == 0:
            continue  # Skip users with no interactions

        total_users += 1

        # **🚀 Compute Predictions Faster**
        predicted_scores = np.dot(U[i, :], V.T)

        # **Efficient Top-K Selection**
        top_k_indices = np.argsort(predicted_scores)[-k:][::-1]  # Sorted top-K
        top_predicted_items = {rev_item_map[idx] for idx in top_k_indices if idx in rev_item_map}

        # **Hit Calculation**
        hits += len(actual_items & top_predicted_items) > 0

    return hits / total_users if total_users > 0 else 0  # Hit Ratio @K

# **🚀 Compute Hit Ratio @10 on a Sample**
hit_ratio_10 = compare_actual_vs_predicted(U, V, df, user_map, rev_item_map, k=10, sample_size=5000)
print(f"\n🎯 Hit Ratio @10 (Sampled 5000 Users): {hit_ratio_10:.4f}")


Computing Hit Ratio: 100%|█████████████████████████████████████████████████████████| 5000/5000 [05:31<00:00, 15.08it/s]


🎯 Hit Ratio @10 (Sampled 5000 Users): 0.0126





In [5]:
num_ratings = len(df)  # Total number of ratings in the dataset
num_users = len(df['review/userId'].unique())  # Total unique users
num_items = len(df['product/productId'].unique())  # Total unique items

# Total possible interactions (user-item matrix size)
total_possible_ratings = num_users * num_items  

# Compute sparsity
sparsity = 1 - (num_ratings / total_possible_ratings)

print(f"📉 Data Sparsity: {sparsity:.4f} ({sparsity * 100:.2f}% missing)")


📉 Data Sparsity: 0.9999 (99.99% missing)


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Select relevant features
features = ['b_ij','h_ij', 'most_ij', 'top_ij', 'd_ij']
target = 'rh_ij'  # The value we want to predict

# Drop NaN values (important for MLP)
df = df.dropna(subset=[target] + features)

# Extract X (features) and y (target)
X = df[features].values
y = df[target].values

# Split into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features for better MLP training
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [7]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define MLP model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # First hidden layer
    Dense(32, activation='relu'),  # Second hidden layer
    Dense(1, activation='linear')  # Output layer (regression)
])

# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m29567/29567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2ms/step - loss: 7.3933e-04 - mae: 0.0039 - val_loss: 2.7626e-07 - val_mae: 2.3795e-04
Epoch 2/50
[1m29567/29567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 2ms/step - loss: 1.4131e-06 - mae: 4.3777e-04 - val_loss: 8.2440e-08 - val_mae: 1.1929e-04
Epoch 3/50
[1m29567/29567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 2ms/step - loss: 6.1451e-07 - mae: 2.7937e-04 - val_loss: 5.1139e-08 - val_mae: 1.1367e-04
Epoch 4/50
[1m29567/29567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2ms/step - loss: 4.9198e-07 - mae: 2.4818e-04 - val_loss: 2.1173e-07 - val_mae: 3.5383e-04
Epoch 5/50
[1m29567/29567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2ms/step - loss: 6.9896e-07 - mae: 1.9864e-04 - val_loss: 1.0970e-07 - val_mae: 3.2123e-05
Epoch 6/50
[1m29567/29567[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2ms/step - loss: 4.4649e-07 - mae: 1.6982e-04 - val_loss:

In [13]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Evaluate the model
loss, mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {mae}")

# Predict rh_ij for test set
y_pred = model.predict(X_test)

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {rmse}")

# # Add predictions to the DataFrame
# df_test = df.iloc[y_test.index].copy()
# df_test['predicted_rh_ij'] = y_pred
# df_test['rmse'] = rmse  # Store RMSE in dataframe if needed


[1m7392/7392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 1.1557e-13 - mae: 2.0467e-08
Test MAE: 2.0657433097426292e-08
[1m7392/7392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step
Test RMSE: 3.8679513430250546e-07


In [15]:
y_pred = model.predict(X_test)


[1m7392/7392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step


In [10]:
import numpy as np

# Convert to numpy arrays
y_test_np = np.array(y_test).flatten()
y_pred_np = y_pred.flatten()

# Define K (e.g., Top 10)
K = 10

# Get indices of the top-K predicted values
top_k_pred_indices = np.argsort(y_pred_np)[-K:]

# Get indices of the top-K actual values
top_k_actual_indices = np.argsort(y_test_np)[-K:]

# Compute hits (intersection between top-K predicted and actual)
hits = np.intersect1d(top_k_pred_indices, top_k_actual_indices)

# Compute Hit Ratio
hit_ratio = len(hits) / K
print(f"Hit Ratio @ {K}: {hit_ratio:.4f}")


Hit Ratio @ 10: 0.4000


In [12]:
# Check how many values are non-zero
num_users = df['review/userId'].nunique()
num_items = df['product/productId'].nunique()
num_interactions = len(df)  # Each row is an interaction

# Calculate sparsity
total_possible_interactions = num_users * num_items
sparsity = 1 - (num_interactions / total_possible_interactions)

print(f"Sparsity: {sparsity:.4f} (or {sparsity * 100:.2f}%)")


Sparsity: 0.9999 (or 99.99%)


In [15]:
def hybrid_recommend(user_id, df, svd_model, mlp_model, num_recommendations=5):
    all_products = df['product/productId'].unique()
    
    # Predict rh_ij using SVD
    svd_predictions = [svd_model.predict(user_id, product_id) for product_id in all_products]
    svd_predictions.sort(key=lambda x: x.est, reverse=True)
    top_products_svd = [pred.iid for pred in svd_predictions[:num_recommendations]]
    
    # Use MLP model to refine ranking
    top_products_mlp = df[df['product/productId'].isin(top_products_svd)].copy()
    X_mlp = scaler.transform(top_products_mlp[['b_ij', 'h_ij', 'most_ij', 'top_ij', 'd_ij']].values)
    top_products_mlp['mlp_rhij'] = mlp_model.predict(X_mlp)
    
    # Sort by combined score (SVD + MLP)
    top_products_mlp = top_products_mlp.sort_values(by='mlp_rhij', ascending=False)
    
    return top_products_mlp[['product/productId', 'product/title', 'mlp_rhij']].head(num_recommendations)


In [18]:
pd.set_option('display.max_colwidth', None)  # Prevent truncation
print(df['product/title'])


0           Kodak Max K2000 Battery Charger with 4 NiMH AA Batteries
1           Kodak Max K2000 Battery Charger with 4 NiMH AA Batteries
3           Kodak Max K2000 Battery Charger with 4 NiMH AA Batteries
5           Kodak Max K2000 Battery Charger with 4 NiMH AA Batteries
6           Kodak Max K2000 Battery Charger with 4 NiMH AA Batteries
                                     ...                            
1241769                                    3Com Audrey Web Appliance
1241770                                    3Com Audrey Web Appliance
1241772                                    3Com Audrey Web Appliance
1241774                                    3Com Audrey Web Appliance
1241777    Sony ACCCSP Starter Kit for DSCP3 & DSCP5 Digital Cameras
Name: product/title, Length: 372260, dtype: object


In [20]:
import pandas as pd
import numpy as np

df= pd.read_csv('output_sentiment.csv.gz', compression='gzip')

# Display the first 10 rows
# print(df.head(50))
print(df.columns.values)

['product/productId' 'product/title' 'product/price' 'review/userId'
 'review/profileName' 'review/helpfulness' 'review/score' 'review/time'
 'review/summary' 'review/text' 'b_ij' 'sum_b_xj' 'h_ij' 'rank' 'z_ij'
 'sum_z' 'most_ij' 'q_ij' 'sum_q' 'top_ij' 'd_ij' 'rh_ij'
 'sentiment_score']


In [21]:
df = df.dropna(subset=['rh_ij'])


In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from scipy.sparse import dok_matrix

# **Step 1: Filter Active Users**
user_interaction_counts = df['review/userId'].value_counts()
active_users = user_interaction_counts[user_interaction_counts >= 3].index

df = df[df['review/userId'].isin(active_users)]  # Keep only active users

# Encode users & items
users = df['review/userId'].unique()
items = df['product/productId'].unique()

user_map = {user: i for i, user in enumerate(users)}
item_map = {item: i for i, item in enumerate(items)}

num_users = len(users)
num_items = len(items)

# **Convert DataFrame to Numpy Array for Faster Access**
ratings = df[['review/userId', 'product/productId', 'rh_ij']].to_numpy()

# **SVD with Optimized SGD**
def train_svd_sgd(df, num_users, num_items, k=50, lr=0.01, reg=0.1, epochs=20, batch_size=128):
    """Train an SVD model using MiniBatch SGD with stability fixes."""
    
    # Initialize latent factors
    U = np.random.normal(scale=1./np.sqrt(k), size=(num_users, k))
    V = np.random.normal(scale=1./np.sqrt(k), size=(num_items, k))

    # Min & Max Ratings (for Clipping Predictions)
    min_rating = df['rh_ij'].min()
    max_rating = df['rh_ij'].max()

    for epoch in range(epochs):
        np.random.shuffle(ratings)  # Shuffle dataset per epoch

        total_loss = 0
        count = 0  # Track valid updates

        with tqdm(total=len(ratings), desc=f"Epoch {epoch+1}/{epochs}") as pbar:
            for i in range(0, len(ratings), batch_size):
                batch = ratings[i:i+batch_size]

                for row in batch:
                    user_id, item_id, true_rating = row
                    i = user_map.get(user_id)
                    j = item_map.get(item_id)

                    if i is None or j is None or np.isnan(true_rating):
                        continue  # Skip invalid entries

                    # Predict rating
                    pred = np.dot(U[i, :], V[j, :].T)
                    pred = np.clip(pred, min_rating, max_rating)  # Constrain predictions

                    # Compute error
                    error = true_rating - pred

                    # Update latent factors
                    U[i, :] += lr * (error * V[j, :] - reg * U[i, :])
                    V[j, :] += lr * (error * U[i, :] - reg * V[j, :])

                    total_loss += error**2
                    count += 1
                    pbar.update(1)

        # **Decay Learning Rate**
        lr *= 0.95

        # **Compute RMSE (Handle Edge Cases)**
        rmse = np.sqrt(total_loss / count) if count > 0 else float('nan')

        print(f"Epoch {epoch+1}: RMSE = {rmse:.4f}")

    return U, V

# **Fix: Drop missing values**
df = df.dropna(subset=['rh_ij'])

# Train SVD
U, V = train_svd_sgd(df, num_users, num_items, k=50, epochs=50, batch_size=512)


Epoch 1/50:  95%|████████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 31815.66it/s]


Epoch 1: RMSE = 0.1255


Epoch 2/50:  95%|████████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 32786.44it/s]


Epoch 2: RMSE = 0.1204


Epoch 3/50:  95%|████████████████████████████████████████████████████████   | 372260/391886 [00:12<00:00, 30242.15it/s]


Epoch 3: RMSE = 0.1169


Epoch 4/50:  95%|████████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 32599.16it/s]


Epoch 4: RMSE = 0.1143


Epoch 5/50:  95%|████████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 31659.96it/s]


Epoch 5: RMSE = 0.1122


Epoch 6/50:  95%|████████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 34822.34it/s]


Epoch 6: RMSE = 0.1104


Epoch 7/50:  95%|████████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33178.22it/s]


Epoch 7: RMSE = 0.1089


Epoch 8/50:  95%|████████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 36446.17it/s]


Epoch 8: RMSE = 0.1076


Epoch 9/50:  95%|████████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 36847.44it/s]


Epoch 9: RMSE = 0.1064


Epoch 10/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 34496.74it/s]


Epoch 10: RMSE = 0.1054


Epoch 11/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33156.70it/s]


Epoch 11: RMSE = 0.1045


Epoch 12/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33495.63it/s]


Epoch 12: RMSE = 0.1036


Epoch 13/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33108.13it/s]


Epoch 13: RMSE = 0.1029


Epoch 14/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33539.91it/s]


Epoch 14: RMSE = 0.1022


Epoch 15/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 33944.40it/s]


Epoch 15: RMSE = 0.1015


Epoch 16/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33223.59it/s]


Epoch 16: RMSE = 0.1009


Epoch 17/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33425.07it/s]


Epoch 17: RMSE = 0.1004


Epoch 18/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 34067.55it/s]


Epoch 18: RMSE = 0.0999


Epoch 19/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33541.43it/s]


Epoch 19: RMSE = 0.0994


Epoch 20/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33454.63it/s]


Epoch 20: RMSE = 0.0990


Epoch 21/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33012.97it/s]


Epoch 21: RMSE = 0.0986


Epoch 22/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 32962.19it/s]


Epoch 22: RMSE = 0.0982


Epoch 23/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33574.67it/s]


Epoch 23: RMSE = 0.0978


Epoch 24/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33239.73it/s]


Epoch 24: RMSE = 0.0975


Epoch 25/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33220.34it/s]


Epoch 25: RMSE = 0.0972


Epoch 26/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 34308.08it/s]


Epoch 26: RMSE = 0.0969


Epoch 27/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 36651.79it/s]


Epoch 27: RMSE = 0.0966


Epoch 28/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 36936.15it/s]


Epoch 28: RMSE = 0.0964


Epoch 29/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 37023.47it/s]


Epoch 29: RMSE = 0.0962


Epoch 30/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 35924.98it/s]


Epoch 30: RMSE = 0.0959


Epoch 31/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33639.18it/s]


Epoch 31: RMSE = 0.0957


Epoch 32/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33368.14it/s]


Epoch 32: RMSE = 0.0955


Epoch 33/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33271.81it/s]


Epoch 33: RMSE = 0.0953


Epoch 34/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 32083.50it/s]


Epoch 34: RMSE = 0.0952


Epoch 35/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:12<00:00, 30858.20it/s]


Epoch 35: RMSE = 0.0950


Epoch 36/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 33904.45it/s]


Epoch 36: RMSE = 0.0949


Epoch 37/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 33894.10it/s]


Epoch 37: RMSE = 0.0947


Epoch 38/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33108.01it/s]


Epoch 38: RMSE = 0.0946


Epoch 39/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33044.39it/s]


Epoch 39: RMSE = 0.0945


Epoch 40/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:12<00:00, 30973.33it/s]


Epoch 40: RMSE = 0.0943


Epoch 41/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 32336.73it/s]


Epoch 41: RMSE = 0.0942


Epoch 42/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33816.58it/s]


Epoch 42: RMSE = 0.0941


Epoch 43/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33622.50it/s]


Epoch 43: RMSE = 0.0940


Epoch 44/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33274.31it/s]


Epoch 44: RMSE = 0.0939


Epoch 45/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:11<00:00, 33390.67it/s]


Epoch 45: RMSE = 0.0938


Epoch 46/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 34022.55it/s]


Epoch 46: RMSE = 0.0937


Epoch 47/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 35553.43it/s]


Epoch 47: RMSE = 0.0937


Epoch 48/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 36360.08it/s]


Epoch 48: RMSE = 0.0936


Epoch 49/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 35324.51it/s]


Epoch 49: RMSE = 0.0935


Epoch 50/50:  95%|███████████████████████████████████████████████████████   | 372260/391886 [00:10<00:00, 34523.51it/s]

Epoch 50: RMSE = 0.0934





In [20]:
def compute_hit_ratio_svd(U, V, df, user_map, item_map, k=10):
    """Compute Hit Ratio @K using per-user processing to reduce memory usage."""
    
    rev_user_map = {v: k for k, v in user_map.items()}
    rev_item_map = {v: k for k, v in item_map.items()}
    
    unique_users = list(user_map.values())  # Get all user indices
    hits = 0
    total_users = 0

    for user in tqdm(unique_users, desc="Processing Users"):
        actual_items = set(df[df['review/userId'] == rev_user_map[user]]
                           .nlargest(k, 'rh_ij')['product/productId'])
        
        if len(actual_items) == 0:
            continue  # Skip users with no interactions

        total_users += 1

        # **Compute predictions only for this user**
        predicted_scores = np.dot(U[user, :], V.T)
        top_k_indices = np.argpartition(predicted_scores, -k)[-k:]  # Faster than full sort
        top_predicted_items = {rev_item_map[idx] for idx in top_k_indices if idx in rev_item_map}

        hits += len(actual_items & top_predicted_items) > 0

    return hits / total_users if total_users > 0 else 0

# 🚀 **Run the Single-User Optimized Hit Ratio Calculation**
hit_ratio_10 = compute_hit_ratio_svd(U, V, df, user_map, item_map, k=10)
print(f"\n🎯 Hit Ratio @10: {hit_ratio_10:.4f}")


Processing Users: 100%|████████████████████████████████████████████████████████| 70090/70090 [1:20:50<00:00, 14.45it/s]


🎯 Hit Ratio @10: 0.0003



