# <center>retail eCommerce - Content Based Recommender system</center>

In [1]:
# Import Librairies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

## 1. Data Exploration & Preprocessing :

In [3]:
df_1 = pd.read_csv('../Documents/Data/item_properties_part1.csv')
df_2 = pd.read_csv('../Documents/Data/item_properties_part2.csv')

In [4]:
# merge the two dataframes
df = pd.concat([df_1, df_2])

In [5]:
df.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20275902 entries, 0 to 9275902
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   timestamp  int64 
 1   itemid     int64 
 2   property   object
 3   value      object
dtypes: int64(2), object(2)
memory usage: 773.5+ MB


### Information about Data representation :
To maintain data integrity and privacy, all values in the "item_properties.csv" file, excluding "categoryid"
and "available" properties, underwent a hashing process. The "categoryid" property contains item category
identifiers, while the "available" property indicates item availability, with "1" signifying availability
and "0" denoting unavailability. Numerical values were prefixed with "n" and displayed with three-digit
precision after the decimal point. For example, "5" becomes "n5.000," and "-3.67584" becomes "n-3.675."

Text values underwent normalization through a stemming procedure, following a process similar to stemming.
These normalized words were then hashed, and numbers were processed as described above. For instance,
the text "Hello world 2017!" would be transformed into "24214 44214 n2017.000" to ensure consistent and
standardized representation of text-based properties.

### Reduce Data Size :
We notice that this dataframe contains more than 20M row which too big, so to reduce the Dataset size and also the cost of training :
For each item's property,  we are going to keep just the last one based on the timestamp

In [7]:
N = len(df)
print('Initial Data length :',N)

Initial Data length : 20275902


In [8]:
# Sort the DataFrame by 'timestamp' in descending order
df.sort_values(by='timestamp', ascending=False, inplace=True)

# Group by 'itemid' and 'property' and get the most recent value for each group
df = df.groupby(['itemid', 'property'])['value'].first().reset_index()

In [9]:
# New df
df.head()

Unnamed: 0,itemid,property,value
0,0,1036,1276750
1,0,1056,n3.168 1144008
2,0,11,n15360.000 628176 n12288.000
3,0,112,679677
4,0,127,1168476


In [10]:
# Drop missing values
df = df.dropna()
df.dropna().isna().sum()

itemid      0
property    0
value       0
dtype: int64

In [11]:
# Amount of reduce :
n1 = len(df)
r = n1/N
print('The amount we kept from the original dataset is :',str(round(r*100,2))+'%')

The amount we kept from the original dataset is : 59.2%


In [12]:
# Number of items
print('Number of unique items :',len(df['itemid'].unique()))

Number of unique items : 417053


**417K** items is a big number, which won't allow us to work efficiently, so for demonstration and ressources reasons we are going to keep just **10K** item to build our recommendation system.

In [13]:
ids = pd.Series(df['itemid'].unique())
reduced_ids = ids.sample(10000, ignore_index=True)
reduced_ids.head()

0    348094
1     25834
2    364736
3     15700
4    123593
dtype: int64

We go back now to our dataframe to keep just properties for our random selected items

In [14]:
# Use .isin() to create a boolean mask
mask = df['itemid'].isin(reduced_ids)

# Apply the mask to filter the DataFrame
reduced_df = df[mask]

reduced_df.reset_index(inplace=True, drop=True)

reduced_df.head()

Unnamed: 0,itemid,property,value
0,24,1031,861995
1,24,112,679677
2,24,13,n96.000
3,24,159,519769
4,24,202,150169 119273 n96.000 571064 249204 1191692 19...


In [15]:
reduced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288172 entries, 0 to 288171
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   itemid    288172 non-null  int64 
 1   property  288172 non-null  object
 2   value     288172 non-null  object
dtypes: int64(1), object(2)
memory usage: 6.6+ MB


In [16]:
# New Amount of reduce :
n2 = len(reduced_df)
r = n2/N
print('The new amount we kept from the original dataset is :',str(round(r*100,2))+'%')

The new amount we kept from the original dataset is : 1.42%


### Preprocessing

- For numerical values, values that start with 'n', we will convert them to float values.
- For text values, we will create such embeddings vectors, for example :
"Hello world 2017!" would be transformed into "24214 44214 n2017.000",
then to : [24214, 44214] (numbers eliminated), finally we aggregate this values using mean value.

In [17]:
def preprocessing(x):
    # text data
    if len(x.split(' ')) > 1:
        return np.array([float(elt) for elt in x.split(' ') if not elt.startswith('n')]).mean()
    # numerical values
    else:
        if x.startswith('n'):
            return float(x[1:])
        else:
            return float(x)

In [18]:
reduced_df['value'] = reduced_df['value'].apply(preprocessing)

In [19]:
reduced_df.head()

Unnamed: 0,itemid,property,value
0,24,1031,861995.0
1,24,112,679677.0
2,24,13,96.0
3,24,159,519769.0
4,24,202,523210.428571


### Creating the DataFrame :
Now we will create a new Dataframe using properties as features.

In [20]:
numItems = len(reduced_df['itemid'].unique())
numCols = len(reduced_df['property'].unique())
print('Number of unique items :', numItems)
print('Number of unique properties :', numCols)

Number of unique items : 10000
Number of unique properties : 1000


In [21]:
# Pivot the DataFrame to reshape it
pivot_df = reduced_df.pivot(index='itemid', columns='property', values='value')

In [22]:
pivot_df.reset_index(inplace=True)
pivot_df = pivot_df.rename_axis(None, axis=1)

In [23]:
pivot_df.head()

Unnamed: 0,itemid,0,1,10,100,1000,1001,1002,1003,1004,...,992,993,994,995,996,997,998,999,available,categoryid
0,24,,,,,,,,,,...,,,412879.5,,,,,,0.0,1244.0
1,32,,,,,,,,,,...,,881499.0,,,,,,,0.0,1173.0
2,341,,,,,,,,,,...,,,,,,,,,0.0,1203.0
3,346,,,,,,,,,,...,,,,,,,,,1.0,806.0
4,374,,,,,,,,,,...,,,,,,,,,0.0,13.0


In [24]:
# Fill missing values with 0
pivot_df = pivot_df.fillna(0)

In [25]:
# We will remove the 'available' column since we know that it has no influence on user preferences
pivot_df.drop(columns=['available'], inplace=True)

In [26]:
pivot_df.head()

Unnamed: 0,itemid,0,1,10,100,1000,1001,1002,1003,1004,...,991,992,993,994,995,996,997,998,999,categoryid
0,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,412879.5,0.0,0.0,0.0,0.0,0.0,1244.0
1,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,881499.0,0.0,0.0,0.0,0.0,0.0,0.0,1173.0
2,341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1203.0
3,346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,806.0
4,374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0


In [27]:
pivot_df.shape

(10000, 1000)

In [28]:
pivot_df.describe()

Unnamed: 0,itemid,0,1,10,100,1000,1001,1002,1003,1004,...,991,992,993,994,995,996,997,998,999,categoryid
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,234675.3258,18582.81,13967.29,1675.04,364.7551,6532.44255,0.2664,446.6772,291.1062,7164.309,...,30.951,2240.0918,15456.01,24474.41,223.3386,87.4679,6633.668,769.0764,670.0158,836.2053
std,135217.658458,117072.6,127451.2,43219.05,17612.08,77224.819771,6.192905,22330.51,16805.343339,75128.1,...,564.265312,31730.169019,109045.6,136640.6,15791.63,8495.401601,73515.02,24308.927872,27346.44,473.855762
min,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,116205.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,374.0
50%,235716.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,844.0
75%,352417.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1250.0
max,466826.0,1250968.0,1200832.0,1116693.0,1133979.0,946867.5,168.0,1116693.0,970354.0,1155283.0,...,10317.0,769062.0,1246919.0,1336541.0,1116693.0,849515.0,1333885.0,769062.0,1116693.0,1694.0


We will apply PCA to this matrix in order to reduce dimensionnlaity, with a new Vector Space with 3 dim. 

In [29]:
X = pivot_df.drop(columns=['itemid']).to_numpy()

In [30]:
pca = PCA(n_components=4)
pca.fit(X)
print(pca.explained_variance_ratio_)

[9.99998849e-01 1.15142410e-06 6.88435667e-13 7.25498107e-17]


In [31]:
# We see that we didn't loose much information, 
# the first Principal Component itself preserve 0.9 of information and variance

In [32]:
new_df = pd.concat([pivot_df[['itemid']], pd.DataFrame(pca.transform(X))], axis=1)

In [33]:
new_df.head()

Unnamed: 0,itemid,0,1,2,3
0,24,-73431700000000.0,-952288100000.0,-126370000.0,-832108.347096
1,32,-73431700000000.0,-952288100000.0,-126436300.0,-949808.022292
2,341,-73431700000000.0,-952288100000.0,-126179800.0,-870708.315217
3,346,-73431700000000.0,-952288100000.0,-126198100.0,-831900.329934
4,374,-73431700000000.0,-952288100000.0,-126819000.0,-831703.790518


In [34]:
new_df.describe()

Unnamed: 0,itemid,0,1,2,3
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,234675.3258,-0.0048,-0.00041875,-1.281738e-07,-1.470089e-07
std,135217.658458,7117619000000000.0,7637525000000.0,5905636000.0,60625190.0
min,24.0,-73431700000000.0,-952288800000.0,-127110000.0,-35602650.0
25%,116205.25,-73431700000000.0,-952288100000.0,-126451700.0,-865477.4
50%,235716.5,-73431700000000.0,-952288100000.0,-126147300.0,-831895.4
75%,352417.25,-73431700000000.0,-952288100000.0,-125830800.0,-831570.6
max,466826.0,7.116131e+17,106173400000000.0,339452500000.0,5412398000.0


In [35]:
# We will now remove outliers values from our dataframe in order not to disturb our future model :
# Define a function to remove outliers using IQR
def remove_outliers_iqr(df, column, lower_percentile=0.25, upper_percentile=0.75, threshold=1.5):
    q1 = df[column].quantile(lower_percentile)
    q3 = df[column].quantile(upper_percentile)
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers for each column
for column in new_df.columns:
    new_df = remove_outliers_iqr(new_df, column)

In [36]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5870 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   itemid  5870 non-null   int64  
 1   0       5870 non-null   float64
 2   1       5870 non-null   float64
 3   2       5870 non-null   float64
 4   3       5870 non-null   float64
dtypes: float64(4), int64(1)
memory usage: 275.2 KB


In [37]:
new_df.reset_index(drop=True, inplace=True)

In [38]:
new_df

Unnamed: 0,itemid,0,1,2,3
0,24,-7.343170e+13,-9.522881e+11,-1.263700e+08,-832108.347096
1,32,-7.343170e+13,-9.522881e+11,-1.264363e+08,-949808.022292
2,341,-7.343170e+13,-9.522881e+11,-1.261798e+08,-870708.315217
3,346,-7.343170e+13,-9.522881e+11,-1.261981e+08,-831900.329934
4,374,-7.343170e+13,-9.522881e+11,-1.268190e+08,-831703.790518
...,...,...,...,...,...
5865,466559,-7.343170e+13,-9.522881e+11,-1.262370e+08,-878860.851106
5866,466597,-7.343170e+13,-9.522881e+11,-1.263649e+08,-832020.966816
5867,466720,-7.343170e+13,-9.522881e+11,-1.265088e+08,-828207.288759
5868,466809,-7.343170e+13,-9.522881e+11,-1.257019e+08,-831781.104478


In [39]:
# more than 2K row was deleted since they were considered as outliers
# We will apply a min max scaler so values will fall between 0 and 1

In [40]:
scaler = MinMaxScaler(feature_range=(0,1))

In [41]:
X = new_df.drop(columns=['itemid']).to_numpy()

In [42]:
std_X = scaler.fit_transform(X)

In [43]:
# since all columns were hashed, we don't need to recover columns names we will just oreder them starting from 1, ...
new_df = pd.concat([new_df[['itemid']], pd.DataFrame(std_X)], axis=1)

In [44]:
new_df.head()

Unnamed: 0,itemid,0,1,2,3
0,24,0.580582,0.6297,0.386993,0.622813
1,32,0.269411,0.625671,0.352327,0.037557
2,341,0.478068,0.522461,0.486456,0.430876
3,346,0.194144,0.501312,0.476841,0.623847
4,374,0.100824,0.797913,0.152209,0.624824


In [45]:
new_df.describe()

Unnamed: 0,itemid,0,1,2,3
count,5870.0,5870.0,5870.0,5870.0,5870.0
mean,233974.765588,0.324786,0.501108,0.49173,0.58235
std,135962.012024,0.183306,0.215261,0.238272,0.162371
min,24.0,0.0,0.0,0.0,0.0
25%,114986.5,0.177994,0.349258,0.330146,0.623173
50%,234153.0,0.328459,0.499176,0.49293,0.623975
75%,353066.5,0.470588,0.64592,0.65869,0.624908
max,466826.0,1.0,1.0,1.0,1.0


## 2. Model Building :

In [46]:
# Before creating a function, I create a pandas series called indices.
# Indices will match the index with the itemid
indices = pd.Series(np.arange(0, len(new_df)), index=new_df['itemid'])
indices

itemid
24           0
32           1
341          2
346          3
374          4
          ... 
466559    5865
466597    5866
466720    5867
466809    5868
466826    5869
Length: 5870, dtype: int32

In [47]:
# Create the matrix
matrix = new_df.drop(columns=['itemid']).to_numpy()

In [48]:
# Compute the cosine similarity between items
cosine_sim = cosine_similarity(matrix, matrix)

In [49]:
# The function get_recommendations will receive the itemid, cosine similarity matrix, and a number of 
# the recommended items as the inputs. Then, it will return a list of recommended items.

def get_recommendations(itemid, cosine_sim=cosine_sim, num_recommend = 5):
    idx = indices[itemid]
    # Get the pairwsie similarity scores of all items with that item
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the items based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 5 most similar items
    top_similar = sim_scores[1:num_recommend+1]
    # Get the item indices
    item_indices = [i[0] for i in top_similar]
    # Return the top 10 most similar movies
    return new_df['itemid'].iloc[item_indices].to_numpy()

In [50]:
# Demonstration
itemid = indices.index[0]
print('Top 5 items similar to item',itemid,':')
get_recommendations(itemid)

Top 5 items similar to item 24 :


array([328539,  41314,  62021, 157038, 306801], dtype=int64)

## 3. User Profile :

In [51]:
events = pd.read_csv('../Documents/Data/events.csv')
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [52]:
len(events['visitorid'].unique())

1407580

In [53]:
# We will first remove 'timestamp' and 'transactionid' columns, since we won't use them to build the user profile.
events.drop(columns=['timestamp', 'transactionid'], inplace=True)

In [54]:
events.head()

Unnamed: 0,visitorid,event,itemid
0,257597,view,355908
1,992329,view,248676
2,111016,view,318965
3,483717,view,253185
4,951259,view,367447


In [55]:
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   visitorid  int64 
 1   event      object
 2   itemid     int64 
dtypes: int64(2), object(1)
memory usage: 63.1+ MB


In [56]:
# Event types
events['event'].unique()

array(['view', 'addtocart', 'transaction'], dtype=object)

### Inspect user behavior :

In [57]:
events[events['event']=='transaction'].iloc[0]

visitorid         599528
event        transaction
itemid            356475
Name: 130, dtype: object

In [58]:
# So lets investigate how user 599528 has interacted with item 356475 before purchasing it
events[(events['visitorid']==599528) & (events['itemid']==356475)]

Unnamed: 0,visitorid,event,itemid
112,599528,addtocart,356475
130,599528,transaction,356475
16729,599528,view,356475
29505,599528,view,356475
114163,599528,view,356475
145498,599528,view,356475
171995,599528,view,356475
187497,599528,view,356475
192471,599528,view,356475
197272,599528,view,356475


**Conclusion :**

The user has interacted with this item multiple times before buying it

**Approach :**

We will use a rating scale from 1 to 5, view : 1, addToCart: 5, transaction: 10
We will keep just the highest rating for each pair (user, item) since we won't need the low values because each transaction follow these steps : view -> addToCart -> transaction

In [59]:
events['event'] = events['event'].map({'view': 1, 'addtocart': 5, 'transaction': 10})

As we are using the user interaction as a rating we will keep just the high value, for example, if a user has viewed an item before buying it, we will keep just the buy event which matches a 5 rating.
This will help us reducing the data size, and also working with accurate values.

In [60]:
# Sort the DataFrame by 'event' in descending order
events.sort_values(by='event', ascending=False, inplace=True)

# Group by 'itemid' and 'visitorid' and get the most recent value for each group
events = events.groupby(['visitorid', 'itemid'])['event'].first().reset_index()

In [61]:
events = events.rename(columns={'event': 'rating'})

In [62]:
events.head()

Unnamed: 0,visitorid,itemid,rating
0,0,67045,1
1,0,285930,1
2,0,357564,1
3,1,72028,1
4,2,216305,1


In [63]:
events.groupby('visitorid')['itemid'].count().sort_values(ascending=False)

visitorid
1150086    3814
530559     2209
892013     1738
895999     1641
152963     1622
           ... 
522671        1
522670        1
522669        1
522668        1
1407579       1
Name: itemid, Length: 1407580, dtype: int64

In [64]:
# We notice that there is some users who 's interacted with many products, otherwise, someones only interacted with few ones

In [65]:
# We will keep just interactions with items that we have their properties, the 5000 instances sample we picked

In [66]:
# Use .isin() to create a boolean mask
mask = events['itemid'].isin(new_df['itemid'].unique())

# Apply the mask to filter the DataFrame
events_2 = events[mask]

events_2.reset_index(inplace=True, drop=True)

events_2.head()

Unnamed: 0,visitorid,itemid,rating
0,10,248766,1
1,51,358388,1
2,53,217218,1
3,64,160984,1
4,151,48731,1


In [67]:
events_2.groupby('rating').count()['visitorid']

rating
1     28573
5       581
10      262
Name: visitorid, dtype: int64

**N.B :**
We notice that our Dataset is not balanced betwwen different rating values, which can add some bias to our model.

We will create a function that takes as input : visitorid, it generate his rating vector, then using the property-item matrix, it generate his User Profile, this function will serve as to predict the user rating for an item he didn't interact with.

In [75]:
def predict_rating(visitorid, itemid):
    user_ratings = [(row['itemid'], row['rating']) for index, row in events_2[events_2['visitorid']==visitorid].iterrows()]
    # print('Number of items rated by visitor',visitorid,':',len(user_ratings))

    rated_items = [i[1] for i in user_ratings]
    item_ids = [i[0] for i in user_ratings]
    
    mask = new_df['itemid'].isin(item_ids)
    items_props = new_df[mask].drop(columns=['itemid']).to_numpy()
    
    user_profile = np.dot(rated_items, items_props)
    user_profile = user_profile/user_profile.sum()
    # print('User Profile array :')
    # print(user_profile)
    
    new_item_props = new_df[new_df['itemid']==itemid].drop(columns=['itemid']).to_numpy()[0]
    # print('item',itemid,'properties :')
    # print(new_item_props)
    predicted_rating = np.dot(user_profile, new_item_props)
    # Scale the input value to the 1-10 range
    scaled_rating = (10 - 1) * predicted_rating + 1
    # scaled_rating = predicted_rating * 10
    # print('Predicted rating for item',itemid,'by visitor',visitorid,':')
    # print(round(scaled_rating,3))
    return round(scaled_rating,3)

In [76]:
print('Items preserved :')
indices

Items preserved :


itemid
24           0
32           1
341          2
346          3
374          4
          ... 
466559    5865
466597    5866
466720    5867
466809    5868
466826    5869
Length: 5870, dtype: int32

In [77]:
print('visitors preserved :')
events_2['visitorid']

visitors preserved :


0             10
1             51
2             53
3             64
4            151
          ...   
29411    1407335
29412    1407340
29413    1407374
29414    1407439
29415    1407451
Name: visitorid, Length: 29416, dtype: int64

In [78]:
# Demonstration
visitorid = events_2['visitorid'][0]
itemid = indices.index[0]
print('Predicted rating for item',itemid,'by visitor',visitorid,':',predict_rating(visitorid, itemid))

Predicted rating for item 24 by visitor 10 : 5.94


In [79]:
events_2['predicted_rating'] = events_2.apply(lambda row: predict_rating(row['visitorid'], row['itemid']), axis=1)
events_2.head()

Unnamed: 0,visitorid,itemid,rating,predicted_rating
0,10,248766,1,5.095
1,51,358388,1,5.705
2,53,217218,1,5.661
3,64,160984,1,5.822
4,151,48731,1,5.591


### Model Evaluation :

We will evaluate our model using the two famous regression metrics, RMSE(root mean squared error) and R2, Note that this evaluation will be applied only on training set, and that's our model wasn't following a trainig method, it was just based on some rapid calculations.

In [80]:
# RMSE
from sklearn.metrics import mean_squared_error

y_true = events_2['rating']
y_pred = events_2['predicted_rating']

rmse = mean_squared_error(y_true, y_pred, squared=False)
print('RMSE score :',rmse)

RMSE score : 5.143741470605461


In [81]:
# R_squared score
from sklearn.metrics import r2_score

r2 = r2_score(y_true, y_pred)
print('R2 score :',r2)

R2 score : -25.1410098481893


### Conclusion :
Our model is not that well due to many reasons, first our dataset size is not big enough to create such a model, we've reduced its size for ressources reasons, second, our dataset is not balanced, we do not have enough data about many users, the majority of users has just one rating, which adds a lot of bias to our model.