In [2]:
import numpy as np
import pandas as pd



In [3]:
# Loading the dataset
# TODO

train = pd.read_csv("data/amazon_train.csv")
valid = pd.read_csv('data/amazon_valid.csv')
test = pd.read_csv('data/amazon_test.csv')
train.head()

Unnamed: 0,UserID,ProductID,Rating,Timestamp
0,A3HICVLF4PFFMN,594481813,5.0,2014-05-05
1,A2QBZA4S1ROX9Q,594481813,3.0,2013-05-25
2,AT09WGFUM934H,594481813,3.0,2013-08-31
3,AGAKHE014LQFU,594481813,3.0,2013-09-18
4,A1S6B5QFWGVL5U,594481813,4.0,2013-06-27


### Preprocessing 

In [4]:
from imblearn.over_sampling import RandomOverSampler

sampling_strategy = {1: round(train['Rating'].value_counts()[1]*2.5),
                     2: round(train['Rating'].value_counts()[2]*3),
                     3: round(train['Rating'].value_counts()[3]*2)}

ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(train[['UserID', 'ProductID']], train['Rating'])

train_ros = pd.DataFrame(X_train_ros, columns=['UserID', 'ProductID'])
train_ros['Rating'] = y_train_ros

train_ros.head()

Unnamed: 0,UserID,ProductID,Rating
0,A3HICVLF4PFFMN,594481813,5.0
1,A2QBZA4S1ROX9Q,594481813,3.0
2,AT09WGFUM934H,594481813,3.0
3,AGAKHE014LQFU,594481813,3.0
4,A1S6B5QFWGVL5U,594481813,4.0


In [5]:
train = train_ros

from sklearn.preprocessing import LabelEncoder

le_user = LabelEncoder()
train['UserEnc'] = le_user.fit_transform(train['UserID'].values)

le_product = LabelEncoder()
train['ProductEnc'] = le_product.fit_transform(train['ProductID'].values)

valid['UserEnc'] = le_user.transform(valid['UserID'].values)
valid['ProductEnc'] = le_product.transform(valid['ProductID'].values)

test['UserEnc'] = le_user.transform(test['UserID'].values)
test['ProductEnc'] = le_product.transform(test['ProductID'].values)

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Scaling the Rating column
# TODO

scaler = MinMaxScaler()
train['Rating'] = scaler.fit_transform(train['Rating'].values.reshape(-1, 1))
valid['Rating'] = scaler.transform(valid['Rating'].values.reshape(-1, 1))

### Model 

In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense



num_users = train['UserEnc'].nunique()
num_products = train['ProductEnc'].nunique()
embedding_size = 200

# User embedding input and layers
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, name='user_embedding')(user_input)
user_vector = Flatten(name='user_vector')(user_embedding)

# Product embedding input and layers
product_input = Input(shape=(1,), name='product_input')
product_embedding = Embedding(input_dim=num_products, output_dim=embedding_size, name='product_embedding')(product_input)
product_vector = Flatten(name='product_vector')(product_embedding)

# Dot product of user and product vectors
dot_product = Dot(axes=1, name='dot_product')([user_vector, product_vector])

# Output layer with sigmoid activation
output = Dense(1, activation='sigmoid', name='output')(dot_product)

# Create the final model
model = Model(inputs=[user_input, product_input], outputs=output, name='user_product_model')



In [8]:
# Training phase
# TODO
# Compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


In [9]:
user_ids = train['UserEnc'].values
product_ids = train['ProductEnc'].values
ratings = train['Rating'].values


valid_user_ids = valid['UserEnc'].values
valid_product_ids = valid['ProductEnc'].values
valid_ratings = valid['Rating'].values

In [10]:
history = model.fit(
    [user_ids, product_ids],  
    ratings,
    validation_data=([valid_user_ids, valid_product_ids], valid_ratings),  
    epochs=10,
    batch_size=1024
)

Epoch 1/10
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 301ms/step - accuracy: 0.4719 - loss: 0.6473 - val_accuracy: 0.6138 - val_loss: 0.5144
Epoch 2/10
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m375s[0m 295ms/step - accuracy: 0.5791 - loss: 0.3749 - val_accuracy: 0.6123 - val_loss: 0.4853
Epoch 3/10
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m401s[0m 357ms/step - accuracy: 0.5871 - loss: 0.2781 - val_accuracy: 0.6121 - val_loss: 0.4864
Epoch 4/10
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m493s[0m 402ms/step - accuracy: 0.5866 - loss: 0.2786 - val_accuracy: 0.6122 - val_loss: 0.4869
Epoch 5/10
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m409s[0m 364ms/step - accuracy: 0.5861 - loss: 0.2764 - val_accuracy: 0.6129 - val_loss: 0.4870
Epoch 6/10
[1m1123/1123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m354s[0m 315ms/step - accuracy: 0.5875 - loss: 0.2696 - val_accuracy: 0.6124 - val_loss:

##### . Because the score evaluation score is precision the final result is different ,the final score on test data is 83%

In [11]:
user_values_test = test['UserEnc'].values
product_values_test = test['ProductEnc'].values

ID = test['ProductID']
User = test['UserID']

In [14]:
# Preparing submission file
Prediction = model.predict([user_values_test,product_values_test])
rescaled_Prediction = scaler.inverse_transform(Prediction)
if rescaled_Prediction.ndim > 1:
    rescaled_Prediction = rescaled_Prediction.reshape(-1)

# Check shapes of all components to debug
print(f"Shape of User: {User.shape}")
print(f"Shape of ID: {ID.shape}")
print(f"Shape of rescaled_Prediction: {rescaled_Prediction.shape}")

predictions = np.round(rescaled_Prediction)

submission = pd.DataFrame({'UserID':User,'ProductID':ID,'Rating':predictions})
submission

[1m2805/2805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step
Shape of User: (89732,)
Shape of ID: (89732,)
Shape of rescaled_Prediction: (89732,)


Unnamed: 0,UserID,ProductID,Rating
0,A2Y3A341VDK37H,B00HFI55N2,4.0
1,A240FRPD4MEXND,B00KIMX4EY,5.0
2,A3SBTW3WS4IQSN,B001IF252M,4.0
3,A1QCZQTJNK3MU,B00J46VVKE,4.0
4,A16JT7E5121OAB,B00JZC972Q,4.0
...,...,...,...
89727,A37Z65SZVT0TVB,B00005AR4L,4.0
89728,A3PLX6PTM2ERKL,B00004Z6PI,5.0
89729,A3TRPVAGT3NWBS,B00005NVPW,4.0
89730,AY8Q1X7G96HV5,B000059MSI,4.0


In [19]:
submission['Rating'].value_counts()

4.0    48445
5.0    39699
3.0     1491
2.0       95
1.0        2
Name: Rating, dtype: int64