In [9]:
import pandas as pd
import numpy as np
from datetime import datetime
import pytz
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the dataset
df = pd.read_csv('cousedataset.csv')

print("=" * 80)
print("1. EXPLORATORY DATA ANALYSIS")
print("=" * 80)

print("\nDataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nData Types:")
print(df.dtypes)
print("\nBasic Statistics:")
print(df.describe())

# ============================================================================
# 2. MISSING VALUES ANALYSIS
# ============================================================================
print("\n" + "=" * 80)
print("2. MISSING VALUES ANALYSIS")
print("=" * 80)

missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum(),
    'Missing_Percentage': (df.isnull().sum() / len(df) * 100).round(2)
})
missing_data = missing_data[missing_data['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)
print("\nMissing Values:")
print(missing_data if len(missing_data) > 0 else "No missing values found")

# ============================================================================
# 3. OUTLIERS DETECTION & REMOVAL
# ============================================================================
print("\n" + "=" * 80)
print("3. OUTLIERS DETECTION & HANDLING")
print("=" * 80)

def detect_outliers_iqr(data, column):
    """Detect outliers using IQR method"""
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] < lower_bound) | (data[column] > upper_bound)]

# Numeric columns to check for outliers
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

outliers_summary = {}
for col in numeric_cols:
    outliers = detect_outliers_iqr(df, col)
    if len(outliers) > 0:
        outliers_summary[col] = len(outliers)
        print(f"\n{col}: {len(outliers)} outliers detected ({len(outliers)/len(df)*100:.2f}%)")

# Remove extreme outliers (keep rows where all numeric values are within reasonable bounds)
df_clean = df.copy()

# For rating column: if exists, cap between 0-5
if 'rating' in df_clean.columns:
    df_clean['rating'] = df_clean['rating'].clip(0, 5)

# For subscriber count, reviews, etc.: remove extreme outliers
cols_to_check = ['num_reviews', 'num_subscribers', 'num_published_lectures']
for col in cols_to_check:
    if col in df_clean.columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 3 * IQR  # Use 3*IQR for removal (more conservative)
        upper_bound = Q3 + 3 * IQR
        initial_count = len(df_clean)
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
        print(f"Removed {initial_count - len(df_clean)} rows for {col}")

print(f"\nDataset shape after outlier removal: {df_clean.shape}")

# ============================================================================
# 4. FEATURE ENGINEERING
# ============================================================================
print("\n" + "=" * 80)
print("4. FEATURE ENGINEERING")
print("=" * 80)

# Convert datetime columns if they exist
date_cols = ['created', 'published_time']
for col in date_cols:
    if col in df_clean.columns:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

# Extract temporal features from created date
if 'created' in df_clean.columns:
    # Handle timezone-aware/naive datetime conversion
    now = pd.Timestamp.now(tz='UTC')
    if df_clean['created'].dt.tz is None:
        # If naive, use naive now
        now = datetime.now()
    df_clean['course_age_days'] = (now - df_clean['created']).dt.days
    df_clean['course_year'] = df_clean['created'].dt.year
    df_clean['course_month'] = df_clean['created'].dt.month
    print("\n‚úì Created temporal features: course_age_days, course_year, course_month")

# Engagement metrics
if 'num_reviews' in df_clean.columns and 'num_subscribers' in df_clean.columns:
    df_clean['review_rate'] = df_clean['num_reviews'] / (df_clean['num_subscribers'] + 1)
    df_clean['review_rate'] = df_clean['review_rate'].clip(0, 1)  # Cap at 100%
    print("‚úì Created: review_rate (reviews per subscriber)")

# Rating-based features
if 'rating' in df_clean.columns:
    df_clean['is_highly_rated'] = (df_clean['rating'] >= 4.5).astype(int)
    df_clean['rating_category'] = pd.cut(df_clean['rating'], 
                                         bins=[0, 3, 4, 4.5, 5], 
                                         labels=['Poor', 'Average', 'Good', 'Excellent'])
    print("‚úì Created: is_highly_rated, rating_category")

# Course popularity score (composite metric)
if 'num_subscribers' in df_clean.columns:
    scaler = MinMaxScaler()
    df_clean['popularity_score'] = scaler.fit_transform(
        df_clean[['num_subscribers']]
    )
    print("‚úì Created: popularity_score (normalized)")

# Price-related features
if 'discount_price_amount' in df_clean.columns:
    df_clean['has_discount'] = (df_clean['discount_price_amount'] > 0).astype(int)
    print("‚úì Created: has_discount")

# Content features
if 'num_published_lectures' in df_clean.columns:
    df_clean['content_density'] = df_clean['num_published_lectures'] / (df_clean['course_age_days'] + 1)
    print("‚úì Created: content_density (lectures per day)")

# Wishlist engagement
if 'is_wishlisted' in df_clean.columns:
    df_clean['wishlist_flag'] = df_clean['is_wishlisted'].astype(int)
    print("‚úì Created: wishlist_flag")

# ============================================================================
# 5. DATA VALIDATION & SUMMARY
# ============================================================================
print("\n" + "=" * 80)
print("5. FINAL DATA VALIDATION")
print("=" * 80)

print(f"\nFinal dataset shape: {df_clean.shape}")
print(f"Rows removed: {len(df) - len(df_clean)}")
print(f"\nNew features created:")
new_cols = set(df_clean.columns) - set(df.columns)
for col in sorted(new_cols):
    print(f"  - {col}")

# Check for any remaining issues
print(f"\nRemaining missing values: {df_clean.isnull().sum().sum()}")
print(f"Duplicate rows: {df_clean.duplicated().sum()}")

# Display cleaned dataset info
print("\nFinal dataset info:")
print(df_clean.dtypes)

# ============================================================================
# 6. SAVE CLEANED DATASET
# ============================================================================
output_file = 'cousedata_cleaned.csv'
df_clean.to_csv(output_file, index=False)
print(f"\n‚úì Cleaned dataset saved to: {output_file}")

# Display sample of engineered features
print("\nSample of engineered features:")
feature_cols = list(new_cols)
if feature_cols:
    print(df_clean[feature_cols].head(10))

1. EXPLORATORY DATA ANALYSIS

Dataset Shape: (13608, 20)

First few rows:
        id                                              title  \
0   762616  The Complete SQL Bootcamp 2020: Go from Zero t...   
1   937678  Tableau 2020 A-Z: Hands-On Tableau Training fo...   
2  1361790             PMP Exam Prep Seminar -  PMBOK Guide 6   
3   648826         The Complete Financial Analyst Course 2020   
4   637930  An Entire MBA in 1 Course:Award Winning Busine...   

                                                 url  is_paid  \
0                 /course/the-complete-sql-bootcamp/     True   
1                                 /course/tableau10/     True   
2                        /course/pmp-pmbok6-35-pdus/     True   
3     /course/the-complete-financial-analyst-course/     True   
4  /course/an-entire-mba-in-1-courseaward-winning...     True   

   num_subscribers  avg_rating  avg_rating_recent   rating  num_reviews  \
0           295509     4.66019            4.67874  4.67874        780

In [1]:
"""
D√âVELOPPEMENT MOD√àLE NLP COMPLET
================================
- Impl√©mentation TF-IDF vectorization
- Similarit√© cosinus et autres m√©triques
- Optimisation des param√®tres
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import time
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# SECTION 1: PR√âPARATION DES DONN√âES
# ============================================================================
print("=" * 90)
print("SECTION 1: PR√âPARATION DES DONN√âES")
print("=" * 90)

df = pd.read_csv('data_preprocessing.csv')
print(f"\n‚úì Dataset charg√©: {df.shape[0]} cours, {df.shape[1]} colonnes")

# Pr√©paration du texte
df['combined_text'] = df['title'].fillna('') + ' ' + df['url'].fillna('')
df = df[df['combined_text'].str.strip() != '']

print(f"‚úì Texte pr√©par√© pour {len(df)} cours")
print(f"\nExemples de texte combin√©:")
for i in range(3):
    print(f"   {i+1}. {df['combined_text'].iloc[i][:70]}...")

# ============================================================================
# SECTION 2: OPTIMISATION DES PARAM√àTRES TF-IDF
# ============================================================================
print("\n" + "=" * 90)
print("SECTION 2: OPTIMISATION DES PARAM√àTRES TF-IDF")
print("=" * 90)

# Grille de param√®tres √† tester
param_grid = {
    'max_features': [500, 1000, 2000],
    'max_df': [0.85, 0.90, 0.95],
    'min_df': [1, 2, 3],
    'ngram_range': [(1, 1), (1, 2), (2, 2)],
    'sublinear_tf': [True, False]
}

print("\nTesting des combinaisons de param√®tres...")
print(f"Total de combinaisons √† tester: {3 * 3 * 3 * 3 * 2} = 162")

results = []
best_score = -np.inf
best_vectorizer = None
best_params = {}

count = 0
start_time = time.time()

for max_features in param_grid['max_features']:
    for max_df in param_grid['max_df']:
        for min_df in param_grid['min_df']:
            for ngram in param_grid['ngram_range']:
                for sublinear in param_grid['sublinear_tf']:
                    count += 1
                    
                    try:
                        # Cr√©ation du vectorizer
                        vectorizer = TfidfVectorizer(
                            max_features=max_features,
                            max_df=max_df,
                            min_df=min_df,
                            ngram_range=ngram,
                            sublinear_tf=sublinear,
                            stop_words='english',
                            lowercase=True,
                            strip_accents='unicode'
                        )
                        
                        # Transformation
                        tfidf_matrix = vectorizer.fit_transform(df['combined_text'])
                        
                        # Calcul des m√©triques d'√©valuation
                        sparsity = 1 - (tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1]))
                        feature_density = tfidf_matrix.nnz / tfidf_matrix.shape[0]
                        avg_nonzero_per_doc = tfidf_matrix.nnz / tfidf_matrix.shape[0]
                        
                        # Score composite (√† maximiser)
                        score = feature_density * (1 - sparsity) / (1 + sparsity * 0.1)
                        
                        results.append({
                            'max_features': max_features,
                            'max_df': max_df,
                            'min_df': min_df,
                            'ngram': ngram,
                            'sublinear_tf': sublinear,
                            'n_features': tfidf_matrix.shape[1],
                            'sparsity': sparsity,
                            'feature_density': feature_density,
                            'avg_nonzero': avg_nonzero_per_doc,
                            'score': score
                        })
                        
                        # Mise √† jour du meilleur mod√®le
                        if score > best_score:
                            best_score = score
                            best_vectorizer = vectorizer
                            best_params = {
                                'max_features': max_features,
                                'max_df': max_df,
                                'min_df': min_df,
                                'ngram_range': ngram,
                                'sublinear_tf': sublinear
                            }
                    except Exception as e:
                        continue

elapsed_time = time.time() - start_time

results_df = pd.DataFrame(results).sort_values('score', ascending=False)

print(f"\n‚úì {count} combinaisons test√©es en {elapsed_time:.2f} secondes")
print(f"\nüìä TOP 10 CONFIGURATIONS:")
print(results_df[['max_features', 'max_df', 'min_df', 'ngram', 'sublinear_tf', 
                  'n_features', 'sparsity', 'score']].head(10).to_string(index=False))

print(f"\nüèÜ MEILLEURS PARAM√àTRES:")
for key, value in best_params.items():
    print(f"   ‚Ä¢ {key}: {value}")
print(f"   ‚Ä¢ Score d'optimisation: {best_score:.6f}")

# ============================================================================
# SECTION 3: CONSTRUCTION DU MOD√àLE TF-IDF OPTIMAL
# ============================================================================
print("\n" + "=" * 90)
print("SECTION 3: CONSTRUCTION DU MOD√àLE TF-IDF OPTIMAL")
print("=" * 90)

tfidf_vectorizer = TfidfVectorizer(
    max_features=best_params['max_features'],
    max_df=best_params['max_df'],
    min_df=best_params['min_df'],
    ngram_range=best_params['ngram_range'],
    sublinear_tf=best_params['sublinear_tf'],
    stop_words='english',
    lowercase=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}'
)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

print(f"\n‚úì Matrice TF-IDF cr√©√©e:")
print(f"   ‚Ä¢ Dimensions: {tfidf_matrix.shape[0]} documents √ó {tfidf_matrix.shape[1]} features")
print(f"   ‚Ä¢ Type: Sparse matrix (CSR)")
print(f"   ‚Ä¢ √âl√©ments non-z√©ro: {tfidf_matrix.nnz:,}")
print(f"   ‚Ä¢ Sparsit√©: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])):.2%}")
print(f"   ‚Ä¢ M√©moire estim√©e: {tfidf_matrix.data.nbytes / 1024 / 1024:.2f} MB")

# Features les plus importants
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
mean_tfidf = tfidf_matrix.toarray().mean(axis=0)
top_features_idx = np.argsort(mean_tfidf)[-20:]

print(f"\nüìù Top 20 features (par score TF-IDF moyen):")
top_features = feature_names[top_features_idx]
top_scores = mean_tfidf[top_features_idx]
for i, (feat, score) in enumerate(zip(top_features[::-1], top_scores[::-1]), 1):
    print(f"   {i:2d}. {feat:20s} ‚Üí {score:.6f}")

# ============================================================================
# SECTION 4: SIMILARIT√â COSINUS
# ============================================================================
print("\n" + "=" * 90)
print("SECTION 4: SIMILARIT√â COSINUS")
print("=" * 90)

# Calcul sur √©chantillon pour efficacit√©
sample_size = min(2000, len(df))
sample_indices = np.random.choice(len(df), sample_size, replace=False)
tfidf_sample = tfidf_matrix[sample_indices]

print(f"\nCalcul de la similarit√© cosinus pour {sample_size} cours...")
cosine_sim_matrix = cosine_similarity(tfidf_sample)

# Statistiques de similarit√©
upper_triangle = cosine_sim_matrix[np.triu_indices_from(cosine_sim_matrix, k=1)]

print(f"\nüìä Statistiques de similarit√© cosinus:")
print(f"   ‚Ä¢ Moyenne: {upper_triangle.mean():.6f}")
print(f"   ‚Ä¢ M√©diane: {np.median(upper_triangle):.6f}")
print(f"   ‚Ä¢ √âcart-type: {upper_triangle.std():.6f}")
print(f"   ‚Ä¢ Min: {upper_triangle.min():.6f}")
print(f"   ‚Ä¢ Max: {upper_triangle.max():.6f}")
print(f"   ‚Ä¢ Q1 (25%): {np.percentile(upper_triangle, 25):.6f}")
print(f"   ‚Ä¢ Q3 (75%): {np.percentile(upper_triangle, 75):.6f}")
print(f"   ‚Ä¢ 95e percentile: {np.percentile(upper_triangle, 95):.6f}")
print(f"   ‚Ä¢ 99e percentile: {np.percentile(upper_triangle, 99):.6f}")

# Recommandations bas√©es sur similarit√©
print(f"\nüéØ Exemple de recommandations:")
test_idx = np.random.randint(0, len(tfidf_sample))
similarities = cosine_sim_matrix[test_idx]
top_similar_idx = np.argsort(similarities)[::-1][1:6]  # Top 5 (exclure le cours lui-m√™me)

query_course = df.iloc[sample_indices[test_idx]]
print(f"\n   Cours de requ√™te: '{query_course['title'][:60]}...'")
print(f"   Rating: {query_course['rating']:.2f} | Subscribers: {query_course['num_subscribers']:,}")

print(f"\n   Top 5 cours similaires:")
for rank, idx in enumerate(top_similar_idx, 1):
    similar_course = df.iloc[sample_indices[idx]]
    sim_score = similarities[idx]
    print(f"   {rank}. {similar_course['title'][:55]}... (Similarit√©: {sim_score:.4f})")

# ============================================================================
# SECTION 5: AUTRES M√âTRIQUES DE SIMILARIT√â
# ============================================================================
print("\n" + "=" * 90)
print("SECTION 5: AUTRES M√âTRIQUES DE SIMILARIT√â")
print("=" * 90)

print(f"\nCalcul des m√©triques additionnelles...")

# Distance Euclidienne
euclidean_dist = euclidean_distances(tfidf_sample)
euclidean_upper = euclidean_dist[np.triu_indices_from(euclidean_dist, k=1)]

# Distance Manhattan
manhattan_dist = manhattan_distances(tfidf_sample)
manhattan_upper = manhattan_dist[np.triu_indices_from(manhattan_dist, k=1)]

# Jaccard Similarity
def compute_jaccard(matrix):
    """Calcul de la similarit√© Jaccard"""
    binary_matrix = (matrix > 0).astype(int)
    intersection = binary_matrix.dot(binary_matrix.T).toarray()
    union = (binary_matrix.sum(axis=1).reshape(-1, 1) + 
             binary_matrix.sum(axis=1).reshape(1, -1) - intersection)
    jaccard = intersection / (union + 1e-10)
    return jaccard

jaccard_sim = compute_jaccard(tfidf_sample)
jaccard_upper = jaccard_sim[np.triu_indices_from(jaccard_sim, k=1)]

print(f"\nüìä Comparaison des m√©triques:")
metrics_comparison = pd.DataFrame({
    'M√©trique': ['Cosine', 'Jaccard', 'Euclidean', 'Manhattan'],
    'Moyenne': [
        upper_triangle.mean(),
        jaccard_upper.mean(),
        euclidean_upper.mean(),
        manhattan_upper.mean()
    ],
    '√âcart-type': [
        upper_triangle.std(),
        jaccard_upper.std(),
        euclidean_upper.std(),
        manhattan_upper.std()
    ],
    'Min': [
        upper_triangle.min(),
        jaccard_upper.min(),
        euclidean_upper.min(),
        manhattan_upper.min()
    ],
    'Max': [
        upper_triangle.max(),
        jaccard_upper.max(),
        euclidean_upper.max(),
        manhattan_upper.max()
    ]
})

print(metrics_comparison.to_string(index=False))

# ============================================================================
# SECTION 6: TOPIC MODELING (BONUS)
# ============================================================================
print("\n" + "=" * 90)
print("SECTION 6: TOPIC MODELING AVEC LDA")
print("=" * 90)

print(f"\nEntra√Ænement du mod√®le LDA...")

count_vectorizer = CountVectorizer(
    max_features=1000,
    max_df=0.9,
    min_df=2,
    stop_words='english'
)

count_matrix = count_vectorizer.fit_transform(df['combined_text'])

lda_model = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
    max_iter=15,
    learning_method='online',
    n_jobs=-1
)

lda_model.fit(count_matrix)

feature_names_lda = np.array(count_vectorizer.get_feature_names_out())

print(f"\nüìö Th√®mes d√©couverts (5 topics):")
for topic_idx, topic in enumerate(lda_model.components_):
    top_indices = topic.argsort()[-8:][::-1]
    top_words = [feature_names_lda[i] for i in top_indices]
    weights = topic[top_indices]
    
    print(f"\n   Topic {topic_idx + 1}:")
    for word, weight in zip(top_words, weights):
        print(f"      ‚Ä¢ {word:20s} (poids: {weight:.3f})")

# ============================================================================
# SECTION 7: DIMENSIONALITY REDUCTION
# ============================================================================
print("\n" + "=" * 90)
print("SECTION 7: DIMENSIONALITY REDUCTION AVEC SVD")
print("=" * 90)

print(f"\nR√©duction de dimension avec Truncated SVD...")

svd = TruncatedSVD(n_components=100, random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_matrix)

print(f"‚úì Variance expliqu√©e: {svd.explained_variance_ratio_.sum():.4f} ({svd.explained_variance_ratio_.sum()*100:.2f}%)")
print(f"‚úì Forme originale: {tfidf_matrix.shape}")
print(f"‚úì Forme r√©duite: {tfidf_reduced.shape}")

# Top components
print(f"\nüìä Contribution des 10 premiers composants:")
for i, var in enumerate(svd.explained_variance_ratio_[:10], 1):
    print(f"   PC{i:2d}: {var*100:5.2f}% cumul: {svd.explained_variance_ratio_[:i].sum()*100:6.2f}%")

# ============================================================================
# SECTION 8: VISUALISATIONS
# ============================================================================
print("\n" + "=" * 90)
print("SECTION 8: G√âN√âRATION DES VISUALISATIONS")
print("=" * 90)

fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# 1. Distribution de similarit√© cosinus
ax1 = fig.add_subplot(gs[0, 0])
ax1.hist(upper_triangle, bins=60, color='skyblue', edgecolor='black', alpha=0.7)
ax1.axvline(upper_triangle.mean(), color='red', linestyle='--', linewidth=2, label=f'Moyenne: {upper_triangle.mean():.3f}')
ax1.axvline(np.median(upper_triangle), color='green', linestyle='--', linewidth=2, label=f'M√©diane: {np.median(upper_triangle):.3f}')
ax1.set_xlabel('Score de similarit√©')
ax1.set_ylabel('Fr√©quence')
ax1.set_title('Distribution Similarit√© Cosinus')
ax1.legend()
ax1.grid(alpha=0.3)

# 2. Distribution Jaccard
ax2 = fig.add_subplot(gs[0, 1])
ax2.hist(jaccard_upper, bins=60, color='coral', edgecolor='black', alpha=0.7)
ax2.axvline(jaccard_upper.mean(), color='red', linestyle='--', linewidth=2)
ax2.set_xlabel('Score Jaccard')
ax2.set_ylabel('Fr√©quence')
ax2.set_title('Distribution Similarit√© Jaccard')
ax2.grid(alpha=0.3)

# 3. Comparaison des m√©triques (boxplot)
ax3 = fig.add_subplot(gs[0, 2])
data_to_plot = [upper_triangle, jaccard_upper, 
                euclidean_upper / euclidean_upper.max(), 
                manhattan_upper / manhattan_upper.max()]
ax3.boxplot(data_to_plot, labels=['Cosine', 'Jaccard', 'Eucl.(norm)', 'Manh.(norm)'])
ax3.set_ylabel('Score (normalis√©)')
ax3.set_title('Comparaison des M√©triques')
ax3.grid(axis='y', alpha=0.3)

# 4. Sparsit√© vs Score
ax4 = fig.add_subplot(gs[1, 0])
scatter = ax4.scatter(results_df['sparsity'], results_df['score'], 
                     c=results_df['n_features'], cmap='viridis', s=80, alpha=0.6)
ax4.set_xlabel('Sparsit√©')
ax4.set_ylabel('Score d\'optimisation')
ax4.set_title('Sparsit√© vs Score')
plt.colorbar(scatter, ax=ax4, label='Nombre de features')
ax4.grid(alpha=0.3)

# 5. Feature Density vs Sparsity
ax5 = fig.add_subplot(gs[1, 1])
scatter = ax5.scatter(results_df['feature_density'], results_df['sparsity'], 
                     c=results_df['score'], cmap='plasma', s=80, alpha=0.6)
ax5.set_xlabel('Feature Density')
ax5.set_ylabel('Sparsit√©')
ax5.set_title('Feature Density vs Sparsit√©')
plt.colorbar(scatter, ax=ax5, label='Score')
ax5.grid(alpha=0.3)

# 6. Top Features TF-IDF
ax6 = fig.add_subplot(gs[1, 2])
top_15_idx = np.argsort(mean_tfidf)[-15:]
ax6.barh(range(len(top_15_idx)), mean_tfidf[top_15_idx], color='lightgreen')
ax6.set_yticks(range(len(top_15_idx)))
ax6.set_yticklabels(feature_names[top_15_idx])
ax6.set_xlabel('Score TF-IDF moyen')
ax6.set_title('Top 15 Features TF-IDF')
ax6.grid(axis='x', alpha=0.3)

# 7. Variance expliqu√©e SVD
ax7 = fig.add_subplot(gs[2, 0])
cumsum_var = np.cumsum(svd.explained_variance_ratio_)
ax7.plot(range(1, len(cumsum_var)+1), cumsum_var, 'b-', linewidth=2)
ax7.axhline(0.95, color='r', linestyle='--', label='95% variance')
ax7.axhline(0.90, color='orange', linestyle='--', label='90% variance')
ax7.set_xlabel('Nombre de composants')
ax7.set_ylabel('Variance cumul√©e')
ax7.set_title('Variance Expliqu√©e - SVD')
ax7.legend()
ax7.grid(alpha=0.3)

# 8. Ngram types distribution
ax8 = fig.add_subplot(gs[2, 1])
ngram_counts = results_df['ngram'].value_counts()
ax8.bar(range(len(ngram_counts)), ngram_counts.values, color='purple', alpha=0.7)
ax8.set_xticks(range(len(ngram_counts)))
ax8.set_xticklabels(ngram_counts.index)
ax8.set_ylabel('Nombre de configurations')
ax8.set_title('Distribution des n-grams test√©s')
ax8.grid(axis='y', alpha=0.3)

# 9. Heatmap similarit√© (petite matrice)
ax9 = fig.add_subplot(gs[2, 2])
sample_sim_small = cosine_sim_matrix[:30, :30]
im = ax9.imshow(sample_sim_small, cmap='hot', aspect='auto')
ax9.set_title('Matrice de similarit√© (30√ó30 cours)')
ax9.set_xlabel('Cours')
ax9.set_ylabel('Cours')
plt.colorbar(im, ax=ax9)

plt.savefig('nlp_model_complete_analysis.png', dpi=300, bbox_inches='tight')
print("\n‚úì Visualisations sauvegard√©es: nlp_model_complete_analysis.png")

# ============================================================================
# SECTION 9: R√âSUM√â FINAL
# ============================================================================
print("\n" + "=" * 90)
print("R√âSUM√â FINAL - D√âVELOPPEMENT MOD√àLE NLP")
print("=" * 90)

summary = {
    "üìä DONN√âES": {
        "Total des cours": len(df),
        "Longueur moyenne du texte": df['combined_text'].str.len().mean()
    },
    "üîß PARAM√àTRES OPTIMIS√âS": best_params,
    "üìà R√âSULTATS TF-IDF": {
        "Nombre de features": tfidf_matrix.shape[1],
        "Sparsit√©": f"{(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])):.2%}",
        "√âl√©ments non-z√©ro": f"{tfidf_matrix.nnz:,}"
    },
    "üìê SIMILARIT√â COSINUS": {
        "Moyenne": f"{upper_triangle.mean():.6f}",
        "√âcart-type": f"{upper_triangle.std():.6f}",
        "Min-Max": f"{upper_triangle.min():.6f} - {upper_triangle.max():.6f}"
    },
    "üéØ DIMENSIONALITY REDUCTION": {
        "Composants SVD": 100,
        "Variance expliqu√©e": f"{svd.explained_variance_ratio_.sum()*100:.2f}%"
    }
}

for section, values in summary.items():
    print(f"\n{section}")
    for key, value in values.items():
        if isinstance(value, dict):
            for k, v in value.items():
                print(f"   ‚Ä¢ {k}: {v}")
        else:
            print(f"   ‚Ä¢ {key}: {value}")

print("\n" + "=" * 90)
print("‚úÖ D√âVELOPPEMENT MOD√àLE NLP COMPLET - TERMIN√â")
print("=" * 90)

SECTION 1: PR√âPARATION DES DONN√âES

‚úì Dataset charg√©: 10995 cours, 29 colonnes
‚úì Texte pr√©par√© pour 10995 cours

Exemples de texte combin√©:
   1. Tableau Server 2019.1 Administration /course/tableauserver2019/...
   2. Fundamentals of Change Management /course/fundamentals-of-change-manag...
   3. Microsoft Advanced Excel Dashboard : Zero To Hero (Complete) /course/a...

SECTION 2: OPTIMISATION DES PARAM√àTRES TF-IDF

Testing des combinaisons de param√®tres...
Total de combinaisons √† tester: 162 = 162

‚úì 162 combinaisons test√©es en 134.77 secondes

üìä TOP 10 CONFIGURATIONS:
 max_features  max_df  min_df  ngram  sublinear_tf  n_features  sparsity    score
          500    0.85       1 (1, 2)         False         500  0.992441 0.025993
          500    0.85       1 (1, 2)          True         500  0.992441 0.025993
          500    0.90       1 (1, 2)          True         500  0.992441 0.025993
          500    0.90       1 (1, 2)         False         500  0.992441 0.

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]