Partie 3 : ModÃ©lisation
========================

Objectifs
---------
- ImplÃ©menter un modÃ¨le de filtrage collaboratif (User-Based)
- ImplÃ©menter un modÃ¨le basÃ© sur le contenu (Content-Based)
- ImplÃ©menter un modÃ¨le hybride (ML supervisÃ©)
- Comparer les trois approches


In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

import warnings
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)


Chargement des DonnÃ©es
----------------------


In [22]:
DATA_PATH = "../data/"

users_df = pd.read_csv(DATA_PATH + 'users_features.csv')
products_df = pd.read_csv(DATA_PATH + 'products_features.csv')
interactions_df = pd.read_csv(DATA_PATH + 'interactions.csv')

print(f"DonnÃ©es chargÃ©es : {len(users_df)} users, {len(products_df)} products, {len(interactions_df)} interactions")


DonnÃ©es chargÃ©es : 5000 users, 1000 products, 50000 interactions


3.1 - Filtrage Collaboratif (User-Based)
------------------------------------------

Recommander les produits apprÃ©ciÃ©s par des utilisateurs similaires.

Ã‰tapes :
1. CrÃ©er une matrice utilisateur-produit
2. Calculer la similaritÃ© entre utilisateurs
3. PrÃ©dire les scores pour les produits non vus
4. Recommander le top-K produits


In [23]:
# TODO : CrÃ©er la matrice utilisateur-produit


user_item_matrix = interactions_df.pivot_table(
    index = 'user_id',
    columns = 'product_id',
    values = 'interaction_type',
    aggfunc = 'count',
    fill_value = 0
)

print(f"Matrice user-item : {user_item_matrix.shape}")
print(f"SparsitÃ© : {(user_item_matrix == 0).sum().sum() / user_item_matrix.size * 100:.2f}%")


Matrice user-item : (4974, 1000)
SparsitÃ© : 99.01%


In [24]:
# TODO : Calculer la similaritÃ© entre utilisateurs

user_similarity = cosine_similarity(user_item_matrix)

user_similarity = pd.DataFrame(
    user_similarity, 
    index = user_item_matrix.index, 
    columns = user_item_matrix.index
)

print(f"Matrice de similaritÃ© : {user_similarity.shape}")
print("SimilaritÃ© calculÃ©e")


Matrice de similaritÃ© : (4974, 4974)
SimilaritÃ© calculÃ©e


In [32]:
# TODO : Fonction de recommandation

def recommend_user_based(user_id, user_item_matrix, user_similarity, k_neighbors=10, n_recommendations=5):
    """
    Recommande des produits basÃ©s sur le filtrage collaboratif
    
    Args:
        user_id: ID de l'utilisateur
        user_item_matrix: Matrice utilisateur-produit
        user_similarity: Matrice de similaritÃ©
        k_neighbors: Nombre de voisins Ã  considÃ©rer
        n_recommendations: Nombre de recommandations
    
    Returns:
        Liste des product_ids recommandÃ©s
    """
    if user_id not in user_item_matrix.index:
        return []

    sim_scores = user_similarity.loc[user_id]

    sim_scores = sim_scores.drop(user_id)
    top_users = sim_scores.sort_values(ascending=False).head(k_neighbors).index

    neighbor_items = user_item_matrix.loc[top_users]

    weighted_scores = neighbor_items.T.dot(sim_scores[top_users])

    user_items = user_item_matrix.loc[user_id]
    weighted_scores = weighted_scores[user_items == 0]

    recommended_products = weighted_scores.sort_values(ascending=False).head(n_recommendations).index.tolist()

    return recommended_products

# Test
test_user = users_df['user_id'].iloc[0]
recommendations_cf = recommend_user_based(test_user, user_item_matrix, user_similarity)
print(f"Recommandations pour l'utilisateur {test_user} :")
print(recommendations_cf)


Recommandations pour l'utilisateur 1 :
[185, 478, 298, 669, 480]


3.2 - Filtrage par Contenu (Content-Based)
--------------------------------------------

Recommander des produits similaires Ã  ceux dÃ©jÃ  apprÃ©ciÃ©s.

Ã‰tapes :
1. CrÃ©er des embeddings de produits (TF-IDF)
2. Calculer la similaritÃ© entre produits
3. Recommander des produits similaires


In [33]:
# TODO : CrÃ©er un corpus textuel pour chaque produit

products_df['content'] = (
    products_df['name'].fillna('').astype(str)
     +
    ' '
     +
    products_df['description'].fillna('').astype(str)
     +
    ' '
     +
    products_df['category'].fillna('').astype(str)
)

tfidf = TfidfVectorizer(max_features=200, stop_words='english')
product_vectors = tfidf.fit_transform(products_df['content'])

print(f"Vecteurs TF-IDF : {product_vectors.shape}")


Vecteurs TF-IDF : (1000, 89)


In [34]:
# TODO : Calculer la similaritÃ© entre produits

product_similarity = cosine_similarity(product_vectors)

product_similarity = pd.DataFrame(
    product_similarity, 
    index=products_df['product_id'], 
    columns=products_df['product_id']
)


print(f"Matrice de similaritÃ© produits : {product_similarity.shape}")
print("SimilaritÃ© calculÃ©e")

Matrice de similaritÃ© produits : (1000, 1000)
SimilaritÃ© calculÃ©e


In [35]:
# TODO : Fonction de recommandation content-based

def recommend_content_based(user_id, interactions_df, products_df, product_similarity, n_recommendations=5):
    """
    Recommande des produits basÃ©s sur le contenu
    
    Args:
        user_id: ID de l'utilisateur
        interactions_df: DataFrame des interactions
        products_df: DataFrame des produits
        product_similarity: Matrice de similaritÃ©
        n_recommendations: Nombre de recommandations
    
    Returns:
        Liste des product_ids recommandÃ©s
    """
    if user_id not in interactions_df['user_id'].unique():
        return []

    user_products = interactions_df[interactions_df['user_id'] == user_id]['product_id'].unique()

    if len(user_products) == 0:
        top_products = products_df['product_id'].value_counts().head(n_recommendations).index.tolist()
        return top_products

    scores = product_similarity.loc[user_products].sum(axis=0)

    scores = scores.drop(user_products, errors='ignore')

    recommended_products = scores.sort_values(ascending=False).head(n_recommendations).index.tolist()

    return recommended_products

# Test
recommendations_content = recommend_content_based(test_user, interactions_df, products_df, product_similarity)
print(f"Recommandations content-based pour l'utilisateur {test_user} :")
print(recommendations_content)


Recommandations content-based pour l'utilisateur 1 :
[540, 2, 285, 341, 348]


3.3 - ModÃ¨le Hybride (ML SupervisÃ©)
-------------------------------------

Combiner features utilisateurs et produits dans un modÃ¨le supervisÃ©.

Ã‰tapes :
1. CrÃ©er un dataset d'entraÃ®nement (user + product + target)
2. EntraÃ®ner un modÃ¨le (RandomForest)
3. PrÃ©dire les probabilitÃ©s d'interaction


In [46]:
# train/test
df = interactions_df.merge(users_df, on='user_id', how='left')
df = df.merge(products_df, on='product_id', how='left')

df['registration_date'] = pd.to_datetime(df['registration_date'])
df['added_date'] = pd.to_datetime(df['added_date'])
df['interaction_date'] = pd.to_datetime(df['interaction_date'])

df = df[df['rating'].notnull()]


df['session_duration'].fillna(df['session_duration'].median(), inplace=True)
df['avg_rating'].fillna(df['avg_rating'].median(), inplace=True)
df['activity'].fillna(df['activity'].median(), inplace=True)

features = [
    'age', 'gender', 'location', 'activity_level', 'activity', 
    'avg_price', 'favorite_category', 'days_since_last',
    
    'category', 'subcategory', 'price', 'stock', 'initial_rating',
    'price_range', 'popularity', 'conversion_rate', 'avg_rating',
    
    'session_duration'
]

target = 'rating'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


print("âœ… Dataset d'entraÃ®nement crÃ©Ã© avec succÃ¨s !")
print(f"X_train : {X_train.shape}")
print(f"X_test  : {X_test.shape}")
print(f"y_train positif : {y_train.sum()} / {len(y_train)}")
print(f"y_test  positif : {y_test.sum()} / {len(y_test)}")


âœ… Dataset d'entraÃ®nement crÃ©Ã© avec succÃ¨s !
X_train : (4000, 18)
X_test  : (1001, 18)
y_train positif : 16033.0 / 4000
y_test  positif : 4058.0 / 1001


In [57]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)

if len(set(y_test)) > 2:
    roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
else:
    roc_auc = roc_auc_score(y_test, y_proba[:, 1])

print("âœ… ModÃ¨le entraÃ®nÃ© !")
print(f"Accuracy : {accuracy:.4f}")
print(f"ROC AUC  : {roc_auc:.4f}")
print("\nClassification Report :\n", classification_report(y_test, y_pred))


âœ… ModÃ¨le entraÃ®nÃ© !
Accuracy : 0.4306
ROC AUC  : 0.6345

Classification Report :
               precision    recall  f1-score   support

         1.0       0.25      0.02      0.04        52
         2.0       0.20      0.04      0.07        50
         3.0       0.13      0.05      0.08       133
         4.0       0.36      0.46      0.40       323
         5.0       0.53      0.61      0.57       443

    accuracy                           0.43      1001
   macro avg       0.29      0.24      0.23      1001
weighted avg       0.39      0.43      0.40      1001



*comparaison des modÃ¨les*

In [63]:
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)

X_test = X_test.copy()  # Ã©viter les warnings
X_test['true_label'] = y_test.values
X_test['pred_label'] = y_pred
X_test['probability'] = y_proba[:, 1] if y_proba.shape[1] > 1 else y_proba[:, 0]

test_users = X_test['user_id'].drop_duplicates().sample(10, random_state=42)

results_comparison = []

for user_id in test_users:
    rec_cf = recommend_user_based(user_id, user_item_matrix, user_similarity, n_recommendations=5)
    
    rec_content = recommend_content_based(user_id, interactions_df, products_df, product_similarity, n_recommendations=5)
    
    user_products = X_test[X_test['user_id'] == user_id]
    top_rf = (
        user_products.sort_values('probability', ascending=False)
        .head(5)['product_id']
        .tolist()
    )
    
    results_comparison.append({
        'user_id': user_id,
        'CF_recommendations': rec_cf,
        'Content_recommendations': rec_content,
        'Model_recommendations': top_rf
    })

results_df = pd.DataFrame(results_comparison)
print("ðŸ”¹ AperÃ§u des recommandations :")
print(results_df.head())

def precision_at_k(user_id, recommended, X_test, k=5):
    actual = set(X_test[(X_test['user_id'] == user_id) & (X_test['true_label'] == 1)]['product_id'])
    if len(actual) == 0:
        return np.nan
    recommended = set(recommended[:k])
    return len(actual & recommended) / k

def evaluate_models(results_df, X_test):
    precisions = {'CF': [], 'Content': [], 'Model': []}
    for _, row in results_df.iterrows():
        user_id = row['user_id']
        precisions['CF'].append(precision_at_k(user_id, row['CF_recommendations'], X_test))
        precisions['Content'].append(precision_at_k(user_id, row['Content_recommendations'], X_test))
        precisions['Model'].append(precision_at_k(user_id, row['Model_recommendations'], X_test))
    return {m: np.nanmean(v) for m, v in precisions.items()}

scores = evaluate_models(results_df, X_test)
print("\nðŸ“Š Comparaison des modÃ¨les (Precision@5) :")
for m, s in scores.items():
    print(f"{m:10s} â†’ Precision@5 = {s:.4f}")

plt.figure(figsize=(7,5))
plt.bar(scores.keys(), scores.values(), color=['#4CAF50', '#2196F3', '#FFC107'])
plt.title("Comparaison des modÃ¨les de recommandation (Precision@5)")
plt.ylabel("Score moyen")
plt.ylim(0, 1)
plt.grid(axis='y', alpha=0.3)
plt.show()

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- pred_label
- probability
- true_label


Prochaines Ã‰tapes
-----------------

Passez au notebook 04_Evaluation_starter.ipynb pour Ã©valuer et comparer les trois modÃ¨les avec des mÃ©triques adaptÃ©es.
