# Exploration des Données - MovieLens 20M

**Auteur:** Dady Akrou Cyrille  
**Email:** cyrilledady0501@gmail.com  
**Date:** Décembre 2024

Ce notebook explore le dataset MovieLens 20M pour comprendre la structure des données et identifier les patterns pour notre système de recommandation.

In [None]:
# Imports nécessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from pathlib import Path
import yaml
from datetime import datetime

# Notebook-wide settings: silence warnings and select the plotting theme
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Default figure size and font size for every matplotlib chart
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

In [None]:
# Read the project configuration (YAML) that tells the notebook where the data lives
config_path = Path('..') / 'config' / 'config.yaml'

with config_path.open(mode='r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Data directory and the per-table file names, both taken from the "data" section
data_section = config['data']
data_path = Path(data_section['base_path'])
dataset_files = data_section['dataset_files']

print("Configuration charg√©e avec succ√®s!")
print(f"Chemin des donn√©es: {data_path}")

## 1. Chargement des Données

In [None]:
# Loader for the MovieLens CSV tables
def load_data(base_path=None, files=None):
    """Load the MovieLens CSV tables into a dict of DataFrames.

    The four core tables (ratings, movies, tags, links) are required. The two
    genome tables are optional: they are skipped, with a printed notice, when
    their file does not exist or when the file mapping has no entry for them.

    Args:
        base_path: Directory containing the CSV files. Defaults to the
            module-level ``data_path``.
        files: Mapping from table key (``'ratings'``, ``'movies'``, ...) to
            file name. Defaults to the module-level ``dataset_files``.

    Returns:
        A dict mapping table key to DataFrame, or ``None`` when a required
        table could not be loaded (the error is printed, not raised).
    """
    # (table key, name used in the progress message, label used in the summary)
    required_tables = (
        ('ratings', 'ratings', 'Ratings'),
        ('movies', 'films', 'Films'),
        ('tags', 'tags', 'Tags'),
        ('links', 'liens', 'Liens'),
    )
    # Same layout, plus the name used in the "not found" notice
    optional_tables = (
        ('genome_scores', 'genome scores', 'Genome Scores', 'Genome scores'),
        ('genome_tags', 'genome tags', 'Genome Tags', 'Genome tags'),
    )
    data = {}

    # Notebook-level boundary: any failure on a required table is reported
    # and turned into a None result so the following cells can test `data`.
    try:
        root = Path(data_path if base_path is None else base_path)
        file_names = dataset_files if files is None else files

        for key, progress_name, label in required_tables:
            print(f"Chargement des {progress_name}...")
            data[key] = pd.read_csv(root / file_names[key])
            print(f"‚úì {label}: {data[key].shape}")

        for key, progress_name, label, missing_name in optional_tables:
            try:
                print(f"Chargement des {progress_name}...")
                table = pd.read_csv(root / file_names[key])
            except (FileNotFoundError, KeyError):
                # KeyError: the table is not configured at all. An optional
                # table must never abort the load of the required ones.
                print(f"‚ö†Ô∏è {missing_name} non trouv√©")
            else:
                data[key] = table
                print(f"‚úì {label}: {table.shape}")

    except Exception as e:
        print(f"‚ùå Erreur lors du chargement: {e}")
        return None

    return data

# Load every dataset table; `data` is None when load_data() reported an error
data = load_data()

## 2. Exploration des Ratings

In [None]:
# Textual overview of the ratings table: volume, users, period, statistics
if data and 'ratings' in data:
    ratings = data['ratings']

    # Convert the epoch-second timestamps once and reuse them for both bounds
    rating_dates = pd.to_datetime(ratings['timestamp'], unit='s')

    print("=== ANALYSE DES RATINGS ===")
    print(f"Nombre total de ratings: {len(ratings):,}")
    print(f"Nombre d'utilisateurs uniques: {ratings['userId'].nunique():,}")
    print(f"P√©riode: {rating_dates.min()} √† {rating_dates.max()}")

    # Summary statistics of the rating values
    print("\n=== STATISTIQUES DES RATINGS ===")
    print(ratings['rating'].describe())

    # First rows of the table
    print("\n=== APER√áU DES DONN√âES ===")
    display(ratings.head(10))

    # Column summary. info() prints its own report and returns None, so it is
    # called directly (wrapping it in print() would add a stray "None" line),
    # and it stays inside the guard because `ratings` only exists here.
    print("\n=== INFORMATIONS SUR LES DONN√âES ===")
    ratings.info()

In [None]:
# 2x2 dashboard: rating values, per-user activity, per-movie popularity, yearly volume
if data and 'ratings' in data:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    (ax_values, ax_users), (ax_movies, ax_years) = axes

    # Top-left: histogram of the rating values
    ratings['rating'].hist(bins=10, ax=ax_values, edgecolor='black')
    ax_values.set_title('Distribution des Ratings')
    ax_values.set_xlabel('Rating')
    ax_values.set_ylabel('Fr√©quence')

    # Top-right and bottom-left: how many ratings each user gave / each movie
    # received, both drawn the same way on log-log axes
    user_counts = ratings['userId'].value_counts()
    movie_counts = ratings['movieId'].value_counts()
    count_panels = (
        (ax_users, user_counts,
         'Distribution du Nombre de Ratings par Utilisateur', "Nombre d'Utilisateurs"),
        (ax_movies, movie_counts,
         'Distribution du Nombre de Ratings par Film', 'Nombre de Films'),
    )
    for panel, counts, panel_title, panel_ylabel in count_panels:
        counts.hist(bins=50, ax=panel, edgecolor='black')
        panel.set_title(panel_title)
        panel.set_xlabel('Nombre de Ratings')
        panel.set_ylabel(panel_ylabel)
        panel.set_xscale('log')
        panel.set_yscale('log')

    # Bottom-right: number of ratings per calendar year. The derived 'date'
    # and 'year' columns are stored on `ratings` itself.
    ratings['date'] = pd.to_datetime(ratings['timestamp'], unit='s')
    ratings['year'] = ratings['date'].dt.year
    yearly_counts = ratings['year'].value_counts().sort_index()
    yearly_counts.plot(ax=ax_years)
    ax_years.set_title('√âvolution du Nombre de Ratings par Ann√©e')
    ax_years.set_xlabel('Ann√©e')
    ax_years.set_ylabel('Nombre de Ratings')

    plt.tight_layout()
    plt.show()

## 3. Exploration des Films

In [None]:
# Textual overview of the movies table: volume, release years, genres
if data and 'movies' in data:
    movies = data['movies']

    print("=== ANALYSE DES FILMS ===")
    print(f"Nombre total de films: {len(movies):,}")

    # The release year is the "(YYYY)" suffix of the title. Trailing
    # whitespace after the suffix is tolerated; titles without a suffix get NaN.
    year_suffix = r'\s*\((\d{4})\)\s*$'
    movies['year'] = pd.to_numeric(
        movies['title'].str.extract(year_suffix)[0], errors='coerce'
    )

    # Title with the year suffix removed
    movies['clean_title'] = movies['title'].str.replace(year_suffix, '', regex=True)

    print(f"P√©riode des films: {movies['year'].min():.0f} √† {movies['year'].max():.0f}")

    # First rows of the table
    print("\n=== APER√áU DES FILMS ===")
    display(movies.head(10))

    # Genre frequencies: 'genres' is a '|'-separated list, so split it and
    # count every occurrence (one row per movie/genre pair after explode)
    print("\n=== ANALYSE DES GENRES ===")
    genre_counts = (
        movies['genres'].dropna().str.split('|').explode().value_counts()
    )
    print(f"Nombre de genres uniques: {len(genre_counts)}")
    print("\nTop 10 des genres:")
    print(genre_counts.head(10))

In [None]:
# 2x2 dashboard about the movies: release years, genre frequencies,
# genre trends over time, number of genres per movie
if data and 'movies' in data:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Top-left: histogram of release years
    movies['year'].hist(bins=30, ax=axes[0,0], edgecolor='black')
    axes[0,0].set_title('Distribution des Films par Ann√©e')
    axes[0,0].set_xlabel('Ann√©e')
    axes[0,0].set_ylabel('Nombre de Films')

    # Top-right: the 15 most frequent genres
    genre_counts.head(15).plot(kind='barh', ax=axes[0,1])
    axes[0,1].set_title('Top 15 des Genres')
    axes[0,1].set_xlabel('Nombre de Films')

    # Bottom-left: films per year for the five most frequent genres.
    # The year axis is the same for every genre, so it is computed once.
    top_genres = genre_counts.head(5).index
    years = sorted(movies['year'].dropna().unique())
    for genre in top_genres:
        # regex=False: the genre name is matched literally as a substring
        has_genre = movies['genres'].str.contains(genre, na=False, regex=False)
        # One grouped count instead of filtering the frame once per year;
        # years without any film of this genre are filled with 0
        genre_by_year = (
            movies.loc[has_genre]
            .groupby('year')
            .size()
            .reindex(years, fill_value=0)
            .tolist()
        )
        axes[1,0].plot(years, genre_by_year, label=genre, marker='o', markersize=2)

    axes[1,0].set_title('√âvolution des Top 5 Genres dans le Temps')
    axes[1,0].set_xlabel('Ann√©e')
    axes[1,0].set_ylabel('Nombre de Films')
    axes[1,0].legend()

    # Bottom-right: number of genres per film = number of '|' separators + 1.
    # Raw string, because '\|' is not a valid escape in a normal literal.
    movies['genre_count'] = movies['genres'].str.count(r'\|') + 1
    movies['genre_count'].hist(bins=10, ax=axes[1,1], edgecolor='black')
    axes[1,1].set_title('Distribution du Nombre de Genres par Film')
    axes[1,1].set_xlabel('Nombre de Genres')
    axes[1,1].set_ylabel('Nombre de Films')

    plt.tight_layout()
    plt.show()

## 4. Analyse Croisée Ratings-Films

In [None]:
# Join ratings with movie metadata and rank movies by popularity and by score
if data and 'ratings' in data and 'movies' in data:
    # Both frames can carry a 'year' column (year of the rating vs. release
    # year). Without explicit suffixes the merge renames them to
    # year_x / year_y and 'year' disappears from the result. With these
    # suffixes the ratings column keeps its name and the movie one becomes
    # 'year_movie'.
    ratings_movies = ratings.merge(
        movies, on='movieId', how='left', suffixes=('', '_movie')
    )

    print("=== ANALYSE CROIS√âE ===")
    print(f"Donn√©es fusionn√©es: {len(ratings_movies):,} ratings")

    # Number of ratings and mean rating per movie
    popular_movies = ratings_movies.groupby(['movieId', 'title']).agg({
        'rating': ['count', 'mean']
    }).round(2)

    # Flatten the two-level column index produced by the aggregation
    popular_movies.columns = ['num_ratings', 'avg_rating']
    popular_movies = popular_movies.reset_index()

    # Keep only movies with at least 100 ratings so the means are meaningful
    popular_movies_filtered = popular_movies[popular_movies['num_ratings'] >= 100]

    print("\n=== TOP 10 FILMS LES PLUS POPULAIRES (>100 ratings) ===")
    top_popular = popular_movies_filtered.nlargest(10, 'num_ratings')
    display(top_popular)

    print("\n=== TOP 10 FILMS LES MIEUX NOT√âS (>100 ratings) ===")
    top_rated = popular_movies_filtered.nlargest(10, 'avg_rating')
    display(top_rated)

In [None]:
# Rating statistics per genre
if data and 'ratings' in data and 'movies' in data:
    # One row per (movie, genre) pair: split the '|'-separated genre list and
    # explode it. Movies without a genre string are left out.
    movie_genres = movies[['movieId', 'genres']].dropna(subset=['genres'])
    movie_genres = movie_genres.assign(
        genre=movie_genres['genres'].str.split('|')
    ).explode('genre')[['movieId', 'genre']]

    # One row per (rating, genre) pair through a single join, instead of
    # scanning the whole ratings table once for every movie
    genre_ratings_df = ratings[['movieId', 'rating']].merge(
        movie_genres, on='movieId', how='inner'
    )[['genre', 'rating']]

    # Count, mean and standard deviation per genre, best-rated genres first
    genre_stats = genre_ratings_df.groupby('genre')['rating'].agg(['count', 'mean', 'std']).round(2)
    genre_stats = genre_stats.sort_values('mean', ascending=False)

    print("=== STATISTIQUES PAR GENRE ===")
    display(genre_stats.head(15))

In [None]:
# 2x2 dashboard: popularity vs. score, mean score per genre,
# mean score over time, rating activity by weekday and hour
if data and 'ratings' in data and 'movies' in data:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Top-left: number of ratings (log scale) against mean rating, per movie
    axes[0,0].scatter(popular_movies['num_ratings'], popular_movies['avg_rating'], alpha=0.6)
    axes[0,0].set_xlabel('Nombre de Ratings')
    axes[0,0].set_ylabel('Note Moyenne')
    axes[0,0].set_title('Relation Popularit√© vs Note Moyenne')
    axes[0,0].set_xscale('log')

    # Top-right: mean rating of the ten best-rated genres
    top_genres_stats = genre_stats.head(10)
    top_genres_stats['mean'].plot(kind='barh', ax=axes[0,1])
    axes[0,1].set_title('Note Moyenne par Genre (Top 10)')
    axes[0,1].set_xlabel('Note Moyenne')

    # Bottom-left: mean rating per calendar month
    monthly_ratings = ratings_movies.groupby(ratings_movies['date'].dt.to_period('M'))['rating'].mean()
    monthly_ratings.plot(ax=axes[1,0])
    axes[1,0].set_title('√âvolution de la Note Moyenne dans le Temps')
    axes[1,0].set_xlabel('Date')
    axes[1,0].set_ylabel('Note Moyenne')
    axes[1,0].tick_params(axis='x', rotation=45)

    # Bottom-right: heatmap of the number of ratings per weekday and hour
    ratings_movies['hour'] = ratings_movies['date'].dt.hour
    ratings_movies['day_of_week'] = ratings_movies['date'].dt.day_name()

    hour_day_counts = ratings_movies.groupby(['day_of_week', 'hour']).size().unstack(fill_value=0)

    # Force a full 7 x 24 grid in calendar order. The tick labels below assume
    # that column i is hour i and row j is day_order[j], which only holds if
    # hours or days absent from the data are still present (filled with 0).
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    hour_day_counts = hour_day_counts.reindex(
        index=day_order, columns=range(24), fill_value=0
    )

    im = axes[1,1].imshow(hour_day_counts.values, cmap='YlOrRd', aspect='auto')
    axes[1,1].set_xticks(range(0, 24, 4))
    axes[1,1].set_xticklabels(range(0, 24, 4))
    axes[1,1].set_yticks(range(len(day_order)))
    axes[1,1].set_yticklabels(day_order)
    axes[1,1].set_title('Activit√© des Ratings par Heure et Jour')
    axes[1,1].set_xlabel('Heure')
    axes[1,1].set_ylabel('Jour de la Semaine')

    plt.tight_layout()
    plt.show()

## 5. Analyse de la Sparsité

In [None]:
# Sparsity of the user x movie rating matrix, plus activity/popularity figures
if data and 'ratings' in data:
    n_users = ratings['userId'].nunique()
    n_movies = ratings['movieId'].nunique()
    n_ratings = len(ratings)

    # Sparsity = share of (user, movie) cells that hold no rating
    total_possible_ratings = n_users * n_movies
    if total_possible_ratings:
        sparsity = (1 - (n_ratings / total_possible_ratings)) * 100
    else:
        # Empty ratings table: the ratio is undefined, report 0 instead of
        # failing with a division by zero
        sparsity = 0.0

    print("=== ANALYSE DE LA SPARSIT√â ===")
    print(f"Nombre d'utilisateurs: {n_users:,}")
    print(f"Nombre de films: {n_movies:,}")
    print(f"Nombre de ratings: {n_ratings:,}")
    print(f"Ratings possibles: {total_possible_ratings:,}")
    print(f"Sparsit√©: {sparsity:.2f}%")
    print(f"Densit√©: {100-sparsity:.2f}%")

    # Number of ratings given by each user
    user_activity = ratings['userId'].value_counts()

    print("\n=== ACTIVIT√â DES UTILISATEURS ===")
    print(f"Utilisateur le plus actif: {user_activity.max()} ratings")
    print(f"Utilisateur le moins actif: {user_activity.min()} ratings")
    print(f"Moyenne de ratings par utilisateur: {user_activity.mean():.1f}")
    print(f"M√©diane de ratings par utilisateur: {user_activity.median():.1f}")

    # Number of ratings received by each movie
    movie_popularity = ratings['movieId'].value_counts()

    print("\n=== POPULARIT√â DES FILMS ===")
    print(f"Film le plus populaire: {movie_popularity.max()} ratings")
    print(f"Film le moins populaire: {movie_popularity.min()} ratings")
    print(f"Moyenne de ratings par film: {movie_popularity.mean():.1f}")
    print(f"M√©diane de ratings par film: {movie_popularity.median():.1f}")

## 6. Recommandations pour le Système

In [None]:
# Printed summary of the modelling recommendations drawn from the exploration
print("=== RECOMMANDATIONS POUR LE SYST√àME ===")
print()

print("1. üìä DONN√âES:")
if data and 'ratings' in data:
    # Years covered by the ratings, read from the raw timestamps. The merged
    # frame cannot be used for this: its 'year' column is renamed by the merge
    # when both inputs have one, and it only exists when the movies loaded.
    first_year = pd.to_datetime(ratings['timestamp'].min(), unit='s').year
    last_year = pd.to_datetime(ratings['timestamp'].max(), unit='s').year
    print(f"   ‚Ä¢ Dataset volumineux: {len(ratings):,} ratings")
    print(f"   ‚Ä¢ Sparsit√© √©lev√©e: {sparsity:.1f}% - n√©cessite des techniques robustes")
    print(f"   ‚Ä¢ P√©riode: {first_year:.0f}-{last_year:.0f} - donn√©es historiques riches")

print("\n2. üéØ APPROCHES RECOMMAND√âES:")
print("   ‚Ä¢ Filtrage collaboratif: Exploiter les similarit√©s utilisateur-film")
print("   ‚Ä¢ Filtrage par contenu: Utiliser les genres et m√©tadonn√©es")
print("   ‚Ä¢ Syst√®me hybride: Combiner les deux approches")
print("   ‚Ä¢ Factorisation matricielle: SVD, NMF pour g√©rer la sparsit√©")

print("\n3. ‚ö†Ô∏è D√âFIS IDENTIFI√âS:")
print("   ‚Ä¢ Cold start: Nouveaux utilisateurs/films")
print("   ‚Ä¢ Sparsit√©: Peu d'interactions par utilisateur")
print("   ‚Ä¢ Scalabilit√©: Volume important de donn√©es")
print("   ‚Ä¢ Biais temporel: √âvolution des pr√©f√©rences")

print("\n4. üõ†Ô∏è TECHNIQUES √Ä IMPL√âMENTER:")
print("   ‚Ä¢ SVD (Singular Value Decomposition)")
print("   ‚Ä¢ NMF (Non-negative Matrix Factorization)")
print("   ‚Ä¢ KNN (K-Nearest Neighbors)")
print("   ‚Ä¢ TF-IDF pour le contenu textuel")
print("   ‚Ä¢ Similarit√© cosinus")

print("\n5. üìà M√âTRIQUES D'√âVALUATION:")
print("   ‚Ä¢ RMSE, MAE pour la pr√©cision")
print("   ‚Ä¢ Precision@K, Recall@K pour le ranking")
print("   ‚Ä¢ NDCG pour la qualit√© du classement")
print("   ‚Ä¢ Coverage, Diversity pour la vari√©t√©")

print("\n6. üîß OPTIMISATIONS:")
print("   ‚Ä¢ Filtrage des utilisateurs/films peu actifs")
print("   ‚Ä¢ Normalisation des ratings")
print("   ‚Ä¢ Validation temporelle")
print("   ‚Ä¢ Cache pour les recommandations fr√©quentes")

## 7. Sauvegarde des Insights

In [None]:
# Persist the key figures of the exploration as JSON for the modelling phase
import json

if data and 'ratings' in data:
    # Everything is recomputed from the raw columns so this cell does not
    # depend on variables or derived columns created by earlier cells.
    # Values are cast to built-in types because json cannot serialise every
    # NumPy scalar type.
    stats_users = int(ratings['userId'].nunique())
    stats_movies = int(ratings['movieId'].nunique())
    stats_cells = stats_users * stats_movies
    dataset_stats = {
        'total_ratings': len(ratings),
        'unique_users': stats_users,
        'unique_movies': stats_movies,
        # Share of empty (user, movie) cells; 0 when the table is empty
        'sparsity_percent': (1 - len(ratings) / stats_cells) * 100 if stats_cells else 0,
        'rating_range': [float(ratings['rating'].min()), float(ratings['rating'].max())],
        'time_range': [
            str(pd.to_datetime(ratings['timestamp'].min(), unit='s')),
            str(pd.to_datetime(ratings['timestamp'].max(), unit='s')),
        ],
    }
else:
    # No ratings available: keep the same keys with neutral values
    dataset_stats = {
        'total_ratings': 0,
        'unique_users': 0,
        'unique_movies': 0,
        'sparsity_percent': 0,
        'rating_range': [0, 0],
        'time_range': ['', ''],
    }

insights = {
    'dataset_stats': dataset_stats,
    'recommendations': {
        'filtering_thresholds': {
            'min_user_ratings': 20,
            'min_movie_ratings': 10
        },
        'models_to_implement': ['SVD', 'NMF', 'KNN', 'Content-based', 'Hybrid'],
        'evaluation_metrics': ['RMSE', 'MAE', 'Precision@K', 'Recall@K', 'NDCG@K']
    },
    'analysis_date': str(datetime.now())
}

# Write the file, creating the output directory first if it does not exist
insights_path = Path('../data/processed/data_insights.json')
insights_path.parent.mkdir(parents=True, exist_ok=True)

with open(insights_path, 'w', encoding='utf-8') as f:
    json.dump(insights, f, indent=2, ensure_ascii=False)

print("‚úÖ Insights sauvegard√©s dans data/processed/data_insights.json")
print("\nüìã R√âSUM√â DE L'EXPLORATION:")
print(f"   ‚Ä¢ {insights['dataset_stats']['total_ratings']:,} ratings analys√©s")
print(f"   ‚Ä¢ {insights['dataset_stats']['unique_users']:,} utilisateurs uniques")
print(f"   ‚Ä¢ {insights['dataset_stats']['unique_movies']:,} films uniques")
print(f"   ‚Ä¢ Sparsit√©: {insights['dataset_stats']['sparsity_percent']:.1f}%")
print("   ‚Ä¢ Pr√™t pour la phase de mod√©lisation! üöÄ")