# Hybrid Recommender

In [2]:
!pip install sentence-transformers



In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# Load pre-trained embedding model (light and fast)
model = SentenceTransformer('all-MiniLM-L6-v2')


In [6]:
import os 
# Load the dataset
PATH = "/Users/agathecauhape/EMLyon 2024-25/Canada/Recommender System/projet/data/"

file_path = os.path.join(PATH, "text_clean.csv")
df = pd.read_csv(file_path)

In [8]:

# Make sure 'game_title_lower' column exists for matching
df['game_title_lower'] = df['game_title'].str.lower()

In [7]:
# Handle missing reviews gracefully
def get_embedding(text):
    if pd.isna(text) or text.strip() == "":
        return np.zeros(model.get_sentence_embedding_dimension())
    return model.encode(text)

# Apply to your dataframe
df['review_embedding'] = df['user_review_text'].apply(get_embedding)


In [9]:
exclude_cols = ['game_title', 'game_title_lower', 'user_review_text', 'review_embedding']
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.difference(exclude_cols)
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, normalize
# One-hot encode categorical variables
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

X_cat = ohe.fit_transform(df[categorical_cols])

In [14]:
# Scale numerical variables
scaler = StandardScaler()
X_num = scaler.fit_transform(df[numerical_cols])

# Combine numeric + categorical features
X_structured = np.hstack([X_num, X_cat])


In [15]:
# 5. Prepare embedding matrix
embeddings_matrix = np.vstack(df['review_embedding'].values)

# 6. Normalize features and embeddings
scaler_struct = StandardScaler()
X_struct_scaled = scaler_struct.fit_transform(X_structured)

X_emb_scaled = normalize(embeddings_matrix)

# 7. Concatenate structured + embeddings for hybrid representation
X_hybrid = np.hstack([X_struct_scaled, X_emb_scaled])

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# 8. Compute similarity matrix
print("Computing similarity matrix ...")
similarity_matrix = cosine_similarity(X_hybrid)

Computing similarity matrix ...


In [18]:
# 9. Recommendation function using hybrid features
def recommend_games_hybrid(game_title, top_n=5):
    game_title_clean = game_title.strip().lower()
    df_reset = df.reset_index(drop=True)

    # Partial / fuzzy match on game titles
    matches = df_reset[df_reset['game_title_lower'].str.contains(game_title_clean, case=False, na=False)]

    if matches.empty:
        print(f"'{game_title}' not found in dataset.")
        return pd.DataFrame()

    idx = matches.index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    recommended_indices = [i[0] for i in sim_scores[1:top_n+1]]
    return df_reset.iloc[recommended_indices][['game_title', 'genre', 'platform']]

In [19]:
# 10. Example usage
print("Recommendations for 'The Witcher 3':")
recommendations = recommend_games_hybrid("The Witcher 3", top_n=5)
print(recommendations)

Recommendations for 'The Witcher 3':
                             game_title     genre platform
19588                Tomb Raider (2013)  Fighting   Mobile
19241          The Witcher 3: Wild Hunt  Fighting     Xbox
36095                       Overwatch 2  Fighting     Xbox
26632  Pillars of Eternity II: Deadfire  Fighting     Xbox
33707                         Fall Guys  Fighting     Xbox
