# Weighted Hybryd Recommendation System
- <b>A weighted recommendation system is an approach to creating recommendations where multiple different algorithms or data sources are used, and their results are combined using weighting coefficients. This system allows for considering the strength or importance of each of these sources to create more accurate and personalized recommendations.

In [1]:
# 0. imports
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np


# package for collaborative filtering users-items
from surprise import SVD, SVDpp # SVD-разложение
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.compose import ColumnTransformer # for commiting CountVectorizer on 2 column
from sklearn.metrics import mean_squared_error

In [2]:
# 1. load data
movies = pd.read_csv('dataset/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings = pd.read_csv('dataset/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
tags = pd.read_csv('dataset/tags.csv')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [5]:
# 3. prepare data
tags = tags.groupby(['movieId'])['tag'].apply(lambda x: ' '.join(x.str.replace(' ', '').str.lower()[:3]))
tags.head()

movieId
1                         pixar pixar fun
2    fantasy magicboardgame robinwilliams
3                               moldy old
5                        pregnancy remake
7                                  remake
Name: tag, dtype: object

In [6]:
len(ratings)

100836

In [7]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

movies_with_ratings = ratings.merge(movies, on='movieId')
movies_with_ratings = movies_with_ratings.merge(tags, how='left', on='movieId')

movies_with_ratings['genres'] = movies_with_ratings['genres'].apply(change_string)
movies_with_ratings['tag'] = movies_with_ratings['tag'].apply(lambda x: x if isinstance(x, str) else 'none')
movies_with_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,tag
0,1,1,4.0,964982703,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy Romance,moldy old
2,1,6,4.0,964982224,Heat (1995),Action Crime Thriller,none
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery Thriller,mystery twistending serialkiller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime Mystery Thriller,mindfuck suspense thriller


In [8]:
len(movies_with_ratings)

100836

In [9]:
# 4. split data.

# change order
movies_with_ratings = movies_with_ratings.sample(frac=1) 

# split data
ratings_train = movies_with_ratings.iloc[:50000, :]
ratings_test = movies_with_ratings.iloc[50000:90000, :]

ratings_for_final_test = movies_with_ratings.iloc[90000:, :]

# add a bool field in the Tags table
len(ratings_train), len(ratings_test)

(50000, 40000)

In [11]:
# 5. Create RS models 
# Step 1. Create a collaborative filtering rs model

# define dataset_train obj
dataset_train = pd.DataFrame({
    'uid': ratings_train.userId,
    'iid': ratings_train.movieId,
    'rating': ratings_train.rating
})

# define reader and data
reader = Reader(rating_scale=(0.5, 5.0))
train_dataset = Dataset.load_from_df(dataset_train, reader).build_full_trainset()

# init model and fit that
collaborative_filtering_model = SVD(n_factors=20, n_epochs=20)
collaborative_filtering_model.fit(train_dataset)

# Get prediction
test_dataset = list(ratings_test[['userId', 'movieId', 'rating']].itertuples(index=False, name=None))
collaborative_filtering_predictions = pd.DataFrame(collaborative_filtering_model.test(test_dataset))[['uid','iid','r_ui', 'est']]
collaborative_filtering_predictions.rename(columns={'est':'c_f'}, inplace=True)
collaborative_filtering_predictions.head()

Unnamed: 0,uid,iid,r_ui,c_f
0,234,1214,5.0,3.984707
1,602,204,4.0,2.66107
2,45,33495,3.0,3.805262
3,599,7030,2.5,2.626276
4,331,56782,5.0,3.971165


In [12]:
# get rmse for collaborative filtering
rmse = mean_squared_error(collaborative_filtering_predictions['r_ui'] , collaborative_filtering_predictions['c_f'], squared=False)
rmse

0.8808896503144193

In [13]:
# Step 2. Create content-based filtering rs model (on genres)
text_fetures = ['genres', 'tag']
num_features = ['userId', 'movieId', 'rating']


ct = ColumnTransformer([("cv1", CountVectorizer(), 'genres'), ("cv2", CountVectorizer(), 'tag')])
tfidf_vector = TfidfTransformer()


X_train = ct.fit_transform(ratings_train[text_fetures])
X_train = tfidf_vector.fit_transform(X_train)
X_train = pd.DataFrame(X_train.toarray(), columns=ct.get_feature_names_out()).join(ratings_train[num_features].reset_index())

X_test = ct.transform(ratings_test[text_fetures])
X_test = tfidf_vector.transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=ct.get_feature_names_out()).join(ratings_test[num_features].reset_index())

In [14]:
# cute inition 
X_train, X_test, y_train, y_test = X_train.drop(['rating'], axis=1), X_test.drop(['rating'], axis=1), X_train['rating'], X_test['rating']

In [15]:
### Step 3. Bayer's optimize using hyperopt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from hyperopt import hp, rand, fmin, tpe, Trials, STATUS_OK


list_n_estimators = list(range(1,30, 5))
list_criterion = [ 'squared_error'] 
list_max_depth = list(range(2,6,2))
list_min_samples_split = list(range(2,4,1))
list_min_samples_leaf = list(range(2,4,1))


#пространство поиска параметров для hyperopt
search_space = [
    hp.choice(label='n_estimators',options=list_n_estimators),
    hp.choice(label='criterion', options=list_criterion),
    hp.choice(label='max_depth', options=list_max_depth),
    hp.choice(label='min_samples_split', options=list_min_samples_split),
    hp.choice(label='min_samples_leaf', options=list_min_samples_leaf),
]

# sample позвоялет оценить пространство параметров.
from hyperopt.pyll.stochastic import sample

    
#необходимо создать функцию
def objective(params):
  svr = RandomForestRegressor(n_estimators=params[0], criterion=params[1], max_depth=params[2], min_samples_split=params[3], min_samples_leaf=params[4],  random_state=42) 
  scores = cross_val_score(svr, X_train, y_train, cv=3, scoring="neg_mean_squared_error")
  return scores.mean()


tpe_algo = tpe.suggest
tpe_trials = Trials()  

tpe_best = fmin(fn=objective, space=search_space, algo=tpe_algo, trials=tpe_trials, 
                max_evals=3)

print(tpe_best)

100%|██████████| 3/3 [00:10<00:00,  3.52s/trial, best loss: -1.0255175597614374]
{'criterion': 0, 'max_depth': 0, 'min_samples_leaf': 0, 'min_samples_split': 0, 'n_estimators': 0}


In [16]:
### Bagging
from sklearn.ensemble import  BaggingRegressor

bagging = BaggingRegressor(RandomForestRegressor( n_estimators=list_n_estimators[tpe_best.get('n_estimators')],
                                                  criterion=list_criterion[tpe_best.get('criterion')],
                                                  max_depth=list_max_depth[tpe_best.get('max_depth')],
                                                  min_samples_split=list_min_samples_split[tpe_best.get('min_samples_split')],
                                                  min_samples_leaf=list_min_samples_leaf[tpe_best.get('min_samples_leaf')],
                                                  random_state=42
                                                ),
                           n_estimators=10,
                           max_samples=0.5,
                           max_features=0.5,
                           random_state=10)


bagging.fit(X_train, y_train)
predict = bagging.predict(X_test)
mean_squared_error(predict, y_test)

1.0165782238533694

In [17]:
# 6. Create a feature matrix where each row corresponds to a user-item pair and each column corresponds to the prediction score 
# from a different recommender. The target variable will be the actual rating.
# c_b_f - content-based filtering prediction
# c_f - collaborative filtering prediction
# rating - target
res_content_based = pd.concat([X_test[['userId','movieId']], pd.DataFrame(predict), y_test], axis=1)
res_content_based.rename(columns={"userId": "uid", "movieId": "iid", 0: "c_b_f"}, inplace=True)
res_content_based = res_content_based.merge(collaborative_filtering_predictions[['uid', 'iid', 'c_f']], on=['uid','iid'])
res_content_based.head()

Unnamed: 0,uid,iid,c_b_f,rating,c_f
0,234,1214,3.630681,5.0,3.984707
1,602,204,3.281444,4.0,2.66107
2,45,33495,3.293446,3.0,3.805262
3,599,7030,3.545009,2.5,2.626276
4,331,56782,3.713856,5.0,3.971165


In [18]:
# 7. Train a regression model if you're predicting ratings
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor # KNN
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import RidgeCV

regressor = StackingRegressor(
    [
        ('LinearRegression', LinearRegression()),
        ('KNeighborsRegressor', KNeighborsRegressor()),
        ('DecisionTree', DecisionTreeRegressor())
    ], RidgeCV())

regressor.fit(res_content_based[['c_f', 'c_b_f']], res_content_based['rating'])
#regressor.score(X_train, y_train)

In [19]:
# 8. Get a new rmse

# set params (not use so far)
i = 10
userId = ratings_for_final_test['userId'].iloc[i]
movieId = ratings_for_final_test['movieId'].iloc[i]

# 0. set dataframe user-movies and real ratings also
predictions_features_matrix = ratings_for_final_test[['userId', 'movieId', 'rating']]


# 1. collaborative filtering predictions
c_f = []
for index, row in predictions_features_matrix.iterrows():
    c_f.append(collaborative_filtering_model.predict(uid=row['userId'], iid=row['movieId']).est)


# 2. content based
X_test = ct.transform(ratings_for_final_test[text_fetures])
X_test = tfidf_vector.transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=ct.get_feature_names_out()).join(ratings_for_final_test[num_features].reset_index())

c_b_predict = bagging.predict(X_test.drop(columns=['rating']))

# 3. merge everything in one dataframe
predictions_features_matrix['c_f'] = c_f
predictions_features_matrix['c_b_f'] = c_b_predict

# 4. final regressions model 
predictions_features_matrix['final_predict_rating'] = regressor.predict(predictions_features_matrix[['c_f', 'c_b_f']])

# 5. get rmse
rmse = mean_squared_error(predictions_features_matrix['rating'] , predictions_features_matrix['final_predict_rating'], squared=False)
rmse

0.8781910353852251