### Sujet : Statistiques et tendances prédictives sur les langages utilisés

In [1]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1




In [2]:
import requests
import time
import concurrent.futures
import os
from dotenv import load_dotenv
import pandas as pd

load_dotenv()

# The token is read from the environment (or from the .env file loaded above).
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
# Only send an Authorization header when a token is configured; otherwise the
# literal string "token None" would be sent as a credential.
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

repos = []
per_page = 100
total_pages = 10

# Fetch the repositories page by page
for page in range(1, total_pages + 1):
    url = f"https://api.github.com/search/repositories?q=stars:>10000&sort=stars&order=desc&per_page={per_page}&page={page}"
    # The timeout keeps a stalled connection from blocking the cell forever.
    response = requests.get(url, headers=HEADERS, timeout=30)

    if response.status_code == 200:
        data = response.json()
        repos.extend(data["items"])
    else:
        # Stop at the first failing page and show what the API answered.
        print(f"Erreur {response.status_code}: {response.text}")
        break

    time.sleep(1)  # Short pause to avoid hitting the rate limit

# Fonction pour récupérer les langages
def get_languages(repo):
    """Fetch the language list of one repository and merge it with its metadata.

    Args:
        repo: One item of the GitHub search response. The keys read here are
            "languages_url", "name", "stargazers_count", "forks_count",
            "watchers_count", "open_issues_count", "language", "created_at"
            and "updated_at".

    Returns:
        A flat dict describing the repository (with "languages" holding the
        keys of the languages endpoint response), or None when the request
        fails or does not answer with HTTP 200.
    """
    try:
        # Timeout so a stalled connection cannot block a worker thread forever.
        response = requests.get(repo["languages_url"], headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        # A network error on one repository must not abort the whole
        # executor.map() run; report it and let the caller filter the None out.
        print(f"Erreur réseau pour {repo['name']}: {exc}")
        return None

    if response.status_code != 200:
        return None

    return {
        "name": repo["name"],
        "stars": repo["stargazers_count"],
        "forks": repo["forks_count"],
        "watchers": repo["watchers_count"],
        "open_issues": repo["open_issues_count"],
        "primary_language": repo["language"],
        "languages": list(response.json().keys()),
        "created_at": repo["created_at"],
        "updated_at": repo["updated_at"]
    }

# Query the languages endpoint for every repository on a pool of 10 threads,
# keeping only the lookups that returned a record (get_languages gives None on failure).
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    enriched_repos = [
        record
        for record in executor.map(get_languages, repos)
        if record is not None
    ]

print(f"Données enrichies récupérées : {len(enriched_repos)} dépôts")

# One row per enriched repository; the bare name on the last line displays it.
df = pd.DataFrame(enriched_repos)
df

Données enrichies récupérées : 1000 dépôts


Unnamed: 0,name,stars,forks,watchers,open_issues,primary_language,languages,created_at,updated_at
0,freeCodeCamp,411063,39150,411063,253,TypeScript,"[TypeScript, JavaScript, CSS, Dockerfile, EJS,...",2014-12-24T17:49:19Z,2025-03-09T22:38:20Z
1,free-programming-books,352089,62944,352089,87,HTML,[HTML],2013-10-11T06:50:37Z,2025-03-09T22:44:06Z
2,build-your-own-x,351322,32580,351322,355,Markdown,[Markdown],2018-05-09T12:03:18Z,2025-03-09T22:51:09Z
3,awesome,350600,28668,350600,51,,[],2014-07-11T13:42:37Z,2025-03-09T22:46:56Z
4,public-apis,329757,34956,329757,470,Python,"[Python, Shell]",2016-03-20T23:49:42Z,2025-03-09T22:50:53Z
...,...,...,...,...,...,...,...,...,...
995,headscale,25915,1402,25915,117,Go,"[Go, HTML, Nix, Shell, Makefile]",2020-06-21T09:21:05Z,2025-03-09T22:36:14Z
996,fyne,25901,1422,25901,699,Go,"[Go, C, JavaScript, Objective-C, Java, GLSL, H...",2018-02-04T22:07:16Z,2025-03-09T22:42:59Z
997,everyone-can-use-english,25898,3848,25898,60,TypeScript,"[TypeScript, Metal, Jupyter Notebook, HTML, Ja...",2019-03-15T16:33:53Z,2025-03-09T15:59:15Z
998,iced,25889,1248,25889,358,Rust,"[Rust, WGSL, RenderScript]",2019-07-15T22:34:46Z,2025-03-09T22:08:30Z


### Kafka Producer (permet de récupérer les données depuis l'API GitHub)

In [3]:
from kafka import KafkaProducer
import requests
import json
import time


# `os` is used below but is not among this cell's imports; import it here so
# the cell does not depend on an earlier cell having done so.
import os

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
# Only send an Authorization header when a token is configured; otherwise the
# literal string "token None" would be sent as a credential.
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}
KAFKA_TOPIC = "github_repos"

# Every value handed to producer.send() is serialised as UTF-8 encoded JSON.
producer = KafkaProducer(
    bootstrap_servers="172.31.185.106:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8")
)

def fetch_github_repos():
    """Page through the GitHub repository search and publish each hit to Kafka.

    Requests 10 pages of 100 repositories with more than 10000 stars, sorted
    by stars in descending order. For every repository a reduced record is
    sent to the module-level `producer` on topic `KAFKA_TOPIC`. The loop stops
    at the first page that does not answer with HTTP 200.

    Relies on the module-level names `HEADERS`, `KAFKA_TOPIC` and `producer`.
    Returns None.
    """
    per_page = 100
    total_pages = 10

    for page in range(1, total_pages + 1):
        url = f"https://api.github.com/search/repositories?q=stars:>10000&sort=stars&order=desc&per_page={per_page}&page={page}"
        # The timeout keeps a stalled connection from blocking the cell forever.
        response = requests.get(url, headers=HEADERS, timeout=30)

        if response.status_code == 200:
            data = response.json()
            for repo in data["items"]:
                # Forward only the fields the downstream consumers use.
                repo_data = {
                    "id": repo["id"],
                    "name": repo["name"],
                    "stars": repo["stargazers_count"],
                    "forks": repo["forks_count"],
                    "watchers": repo["watchers_count"],
                    "open_issues": repo["open_issues_count"],
                    "primary_language": repo["language"],
                    "languages_url": repo["languages_url"],
                    "created_at": repo["created_at"],
                    "updated_at": repo["updated_at"]
                }
                producer.send(KAFKA_TOPIC, repo_data)
                print(f"Envoyé: {repo_data['name']}")

        else:
            print(f"Erreur {response.status_code}: {response.text}")
            break

        # Short pause between pages
        time.sleep(1)

# Close the producer even when the fetch fails, so its connection is not leaked.
try:
    fetch_github_repos()
finally:
    producer.close()

Envoyé: freeCodeCamp
Envoyé: free-programming-books
Envoyé: build-your-own-x
Envoyé: awesome
Envoyé: public-apis
Envoyé: coding-interview-university
Envoyé: developer-roadmap
Envoyé: system-design-primer
Envoyé: 996.ICU
Envoyé: awesome-python
Envoyé: react
Envoyé: project-based-learning
Envoyé: awesome-selfhosted
Envoyé: vue
Envoyé: Python
Envoyé: javascript-algorithms
Envoyé: linux
Envoyé: tensorflow
Envoyé: You-Dont-Know-JS
Envoyé: CS-Notes
Envoyé: ohmyzsh
Envoyé: computer-science
Envoyé: AutoGPT
Envoyé: bootstrap
Envoyé: flutter
Envoyé: vscode
Envoyé: gitignore
Envoyé: Python-100-Days
Envoyé: the-book-of-secret-knowledge
Envoyé: the-art-of-command-line
Envoyé: stable-diffusion-webui
Envoyé: JavaGuide
Envoyé: javascript
Envoyé: transformers
Envoyé: awesome-go
Envoyé: youtube-dl
Envoyé: ollama
Envoyé: next.js
Envoyé: fucking-algorithm
Envoyé: go
Envoyé: Microsoft-Activation-Scripts
Envoyé: 30-seconds-of-code
Envoyé: tech-interview-handbook
Envoyé: awesome-chatgpt-prompts
Envoyé: react

### Kafka + sauvegarde CSV PySpark (test)

In [4]:
from kafka import KafkaConsumer
import json
import os
import csv
from datetime import datetime

# Create the output directory if it does not exist yet
output_dir = "outputs/kafka_direct"
os.makedirs(output_dir, exist_ok=True)

# The output file name carries the timestamp of the run
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
output_file = "{}/github_repos_{}.csv".format(output_dir, timestamp)

#Fonction pour écrire les données dans un fichier CSV
def write_to_csv(records, filename):
    """Write a list of dict records to a UTF-8 CSV file with a header row.

    Args:
        records: List of dicts, one per row. Nothing is written when it is empty.
        filename: Path of the CSV file to create (overwritten if it exists).

    Returns:
        None.

    The header is the union of the keys of all records, in first-seen order.
    A record lacking a column gets an empty cell there (csv.DictWriter's
    default `restval`). When every record has the same keys the output is the
    same as taking the header from the first record only.
    """
    if not records:
        return

    # Taking the header from the first record only makes DictWriter raise
    # ValueError as soon as a later record carries an extra key, so collect
    # the keys of every record (dict.fromkeys de-duplicates and keeps order).
    headers = list(dict.fromkeys(key for record in records for key in record))

    # newline='' is required by the csv module to control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(records)

    print(f"Données écrites dans {filename}")

# Configure the Kafka consumer
print("Connexion à Kafka...")
# Bind the name before the try block: checking `'consumer' in locals()` at
# notebook top level would also match a consumer left over from an earlier
# run of this or another cell.
consumer = None
try:
    consumer = KafkaConsumer(
        'github_repos',
        bootstrap_servers=['172.31.185.106:9092'],
        auto_offset_reset='earliest',
        enable_auto_commit=True,
        group_id='github_repos_group',
        value_deserializer=lambda x: json.loads(x.decode('utf-8')),
        consumer_timeout_ms=60000  # Stop iterating after 60 seconds without a message
    )

    print("Consommateur Kafka connecté. En attente de messages...")

    # Collect the messages
    records = []
    count = 0
    max_records = 100  # Cap the number of records for this test

    for message in consumer:
        data = message.value
        records.append(data)
        count += 1

        # Show which message was received
        print(f"Message reçu: {data.get('name', 'Unknown')} - Messages: {count}")

        # Leave the loop once enough messages have been collected
        if count >= max_records:
            break

    # Write the collected records to a CSV file
    if records:
        write_to_csv(records, output_file)
        print(f"Traitement terminé. {count} messages ont été traités.")
    else:
        print("Aucun message n'a été reçu.")

except Exception as e:
    print(f"Erreur lors de la connexion à Kafka: {e}")

finally:
    # Close the consumer only if this run actually created it
    if consumer is not None:
        consumer.close()
        print("Consommateur Kafka fermé.")

Connexion à Kafka...
Consommateur Kafka connecté. En attente de messages...
Message reçu: lodash - Messages: 1
Message reçu: design-resources-for-developers - Messages: 2
Message reçu: immich - Messages: 3
Message reçu: architect-awesome - Messages: 4
Message reçu: Front-end-Developer-Interview-Questions - Messages: 5
Message reçu: markdown-here - Messages: 6
Message reçu: awesome-nodejs - Messages: 7
Message reçu: jquery - Messages: 8
Message reçu: awesome-courses - Messages: 9
Message reçu: new-pac - Messages: 10
Message reçu: annotated_deep_learning_paper_implementations - Messages: 11
Message reçu: angular.js - Messages: 12
Message reçu: shadowsocks-windows - Messages: 13
Message reçu: docusaurus - Messages: 14
Message reçu: open-interpreter - Messages: 15
Message reçu: localsend - Messages: 16
Message reçu: act - Messages: 17
Message reçu: localstack - Messages: 18
Message reçu: alacritty - Messages: 19
Message reçu: llama - Messages: 20
Message reçu: fuel-core - Messages: 21
Mess

### Kafka Consumer

In [19]:
from kafka import KafkaConsumer
import json
from datetime import datetime
from pymongo import MongoClient

# Configuration MongoDB
def connect_to_mongodb():
    """Open a MongoDB connection and return handles on the project collection.

    The connection string is read from the MONGODB_URI environment variable
    instead of being embedded in the notebook.
    NOTE(review): the credentials that used to be hardcoded here are exposed
    in the notebook history and should be rotated.

    Returns:
        (client, db, collection) for database "Projet_BigData" and collection
        "Data_Kafka" once a ping has succeeded, or (None, None, None) when the
        URI is missing or anything fails while connecting.
    """
    import os

    client = None
    try:
        mongodb_uri = os.environ.get("MONGODB_URI")
        if not mongodb_uri:
            raise RuntimeError("la variable d'environnement MONGODB_URI n'est pas définie")

        client = MongoClient(mongodb_uri)
        db = client["Projet_BigData"]
        collection = db["Data_Kafka"]

        # Round trip to the server to confirm the connection works.
        client.admin.command('ping')
        print("Connexion à MongoDB réussie!")

        return client, db, collection
    except Exception as e:
        print(f"Erreur de connexion à MongoDB: {e}")
        # Do not leak a client that was created but failed the ping.
        if client is not None:
            client.close()
        return None, None, None

print("Connexion à Kafka...")
# Bind both handles before the try block: `'name' in locals()` at notebook top
# level would also match objects left over from an earlier cell run.
consumer = None
client = None
try:

    client, db, collection = connect_to_mongodb()

    if collection is None:
        raise Exception("Impossible de se connecter à MongoDB")

    # Start from an empty collection: every existing document is removed.
    delete_result = collection.delete_many({})
    print(f"Documents supprimés de MongoDB: {delete_result.deleted_count}")

    consumer = KafkaConsumer(
        'github_repos',
        bootstrap_servers=['172.31.185.106:9092'],
        auto_offset_reset='earliest',
        enable_auto_commit=True,
        group_id='github_repos_group',
        consumer_timeout_ms=60000,
        # Keep the raw bytes; decoding happens per message in the loop below
        # so that one malformed payload can be skipped.
        value_deserializer=lambda x: x
    )

    print("Consommateur Kafka connecté. En attente de messages...")

    # Collect the messages for MongoDB
    records = []
    count = 0
    max_records = 1000
    batch_size = 10

    for message in consumer:
        raw_value = message.value

        if not raw_value:  # Skip empty messages
            print("Message vide reçu, passage au suivant.")
            continue

        try:
            data = json.loads(raw_value.decode('utf-8'))
        except (UnicodeDecodeError, json.JSONDecodeError) as e:
            # Bytes that are not valid UTF-8 raise UnicodeDecodeError, not
            # JSONDecodeError; catch both so a single bad message is skipped
            # instead of ending the whole run through the outer handler.
            print(f"Erreur de parsing JSON: {e} - Message brut: {raw_value}")
            continue

        # Add an import timestamp for MongoDB
        data['imported_at'] = datetime.now().isoformat()
        records.append(data)
        count += 1

        # Insert in batches rather than one document at a time
        if len(records) >= batch_size:
            collection.insert_many(records)
            print(f"Lot de {len(records)} documents inséré dans MongoDB")
            records = []

        # Leave the loop once enough messages have been collected
        if count >= max_records:
            break

    # Insert whatever is left in the last, partial batch
    if records:
        collection.insert_many(records)
        print(f"Dernier lot de {len(records)} documents inséré dans MongoDB")

    total_docs = collection.count_documents({})
    print(f"Traitement terminé. {count} messages ont été traités.")
    print(f"Total de documents dans MongoDB: {total_docs}")

except Exception as e:
    print(f"Erreur lors du traitement: {e}")

finally:
    if consumer is not None:
        consumer.close()
        print("Consommateur Kafka fermé.")

    if client is not None:
        client.close()
        print("Connexion MongoDB fermée.")

Connexion à Kafka...
Connexion à MongoDB réussie!
Documents supprimés de MongoDB: 1000
Consommateur Kafka connecté. En attente de messages...
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 documents inséré dans MongoDB
Lot de 10 document

# Step 2 : Proposer une solution valorisant les données recueillies, grâce à l’IA, à destination d’entreprises, d’associations ou de tout autre type d’organisation, ainsi que de particuliers.

### Prédiction avec interface

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
from prophet import Prophet
import warnings
import os
import gradio as gr
import matplotlib
import json
matplotlib.use('Agg')  # Use a non-interactive backend for Gradio

# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')


# Make sure the 'models' directory exists (no error if it is already there).
os.makedirs('models', exist_ok=True)

#Connexion à MongoDB et récupération des données
def get_data_from_mongodb():
    """Load every document of the project collection from MongoDB.

    The connection string is read from the MONGODB_URI environment variable
    instead of being embedded in the notebook.
    NOTE(review): the credentials that used to be hardcoded here are exposed
    in the notebook history and should be rotated.

    Returns:
        (data, message): `data` is the list of documents of collection
        "Data_Kafka" in database "Projet_BigData", `message` a status string.
        On any failure, including a missing MONGODB_URI, `data` is an empty
        list and `message` describes the error.
    """
    client = None
    try:
        mongodb_uri = os.environ.get("MONGODB_URI")
        if not mongodb_uri:
            raise RuntimeError("la variable d'environnement MONGODB_URI n'est pas définie")

        client = MongoClient(mongodb_uri)
        db = client["Projet_BigData"]
        collection = db["Data_Kafka"]

        data = list(collection.find({}))

        return data, f"Récupération réussie de {len(data)} documents depuis MongoDB"
    except Exception as e:
        return [], f"Erreur lors de la récupération des données: {e}"
    finally:
        # Close the client on the error path as well, not only on success.
        if client is not None:
            client.close()

#Prétraitement des données
def preprocess_data(data):
    """Turn the raw MongoDB documents into a cleaned DataFrame with derived features.

    Steps, in order: drop the '_id' column; parse 'created_at', 'updated_at'
    and 'imported_at' to timezone-naive datetimes (unparsable values become
    NaT); drop rows without 'primary_language'; when 'created_at' is present,
    derive 'repo_age_days', 'year_created' and 'month_created' and drop rows
    whose age is negative or unknown; drop rows above the 99th percentile of
    'stars', 'forks' and 'watchers' (applied one column after the other); add
    'engagement_score' when all of its input columns exist.

    Args:
        data: List of dict documents. 'primary_language' is required as soon
            as the list is not empty; every other column is optional.

    Returns:
        (df, message): the cleaned DataFrame and a status string with its shape.
        An empty `data` yields an empty DataFrame.
    """
    df = pd.DataFrame(data)

    # Nothing to clean: there are no columns to read at all.
    if df.empty:
        return df, f"Jeu de données prétraité avec {df.shape[0]} lignes et {df.shape[1]} colonnes"

    if '_id' in df.columns:
        df = df.drop('_id', axis=1)

    for col in ['created_at', 'updated_at', 'imported_at']:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
            df[col] = df[col].dt.tz_localize(None)

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the original.
    df = df[df['primary_language'].notna()].copy()

    # Temporal features. The age filter lives inside the guard as well:
    # 'repo_age_days' only exists when 'created_at' does.
    if 'created_at' in df.columns:
        df['repo_age_days'] = (datetime.now() - df['created_at']).dt.days
        df['year_created'] = df['created_at'].dt.year
        df['month_created'] = df['created_at'].dt.month
        # NaT creation dates give a NaN age, which fails this comparison too.
        df = df[df['repo_age_days'] >= 0]

    # Trim the top 1% of each popularity column, one column after the other.
    for col in ['stars', 'forks', 'watchers']:
        if col in df.columns:
            q99 = df[col].quantile(0.99)
            df = df[df[col] <= q99]

    df = df.copy()
    if all(col in df.columns for col in ['stars', 'forks', 'watchers', 'repo_age_days']):
        # +1 in the denominator avoids dividing by zero for repositories created today.
        df['engagement_score'] = (df['stars'] * 2 + df['forks'] * 3 + df['watchers']) / (df['repo_age_days'] + 1)

    print(f"\n Prétraitement terminé : {df.shape[0]} lignes et {df.shape[1]} colonnes restantes après nettoyage.")
    print(f" Langages uniques après filtrage : {df['primary_language'].nunique()}")
    if 'year_created' in df.columns:
        print(f" Période couverte : {df['year_created'].min()} - {df['year_created'].max()}")

    return df, f"Jeu de données prétraité avec {df.shape[0]} lignes et {df.shape[1]} colonnes"

#Analyse exploratoire des données
def exploratory_analysis(df):
    """Compute summary tables per language and save the matching charts as PNG files.

    Args:
        df: Preprocessed DataFrame. Reads the columns 'primary_language',
            'stars', 'forks', 'watchers', 'engagement_score' and 'id', plus
            'year_created' when it is present.

    Returns:
        (results, message): `results` holds 'language_counts' (10 most frequent
        languages), 'language_stats' (per-language means and repository count
        for the 10 languages with the most repositories) and, when
        'year_created' exists, 'yearly_trend' (repositories per year and
        language). `message` is a fixed status string.

    Side effects:
        Writes 'top_languages.png', 'engagement_score.png' and, when the yearly
        trend exists, 'yearly_trend.png' to the current directory.
    """

    def _save_plot(series, path, figsize, title, xlabel, ylabel, grid=False, **plot_kwargs):
        # Draw one Series on a fresh figure, label it, save it and release the figure.
        plt.figure(figsize=figsize)
        series.plot(**plot_kwargs)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        if grid:
            plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(path)
        plt.close()

    # Ten most frequent primary languages
    top_counts = df['primary_language'].value_counts().head(10)

    # Mean engagement figures per language, for the ten largest groups
    per_language = (
        df.groupby('primary_language')
        .agg({
            'stars': 'mean',
            'forks': 'mean',
            'watchers': 'mean',
            'engagement_score': 'mean',
            'id': 'count'
        })
        .sort_values('id', ascending=False)
        .head(10)
    )
    per_language.columns = ['Étoiles (moy)', 'Forks (moy)', 'Observateurs (moy)', 'Score d\'engagement', 'Nombre de dépôts']

    results = {
        'language_counts': top_counts,
        'language_stats': per_language,
    }

    # Repositories created per year, one column per language
    if 'year_created' in df.columns:
        results['yearly_trend'] = (
            df.groupby(['year_created', 'primary_language']).size().unstack().fillna(0)
        )

    _save_plot(
        top_counts, 'top_languages.png', (10, 6),
        'Top 10 des langages de programmation', 'Langage', 'Nombre de dépôts',
        kind='bar', color='skyblue',
    )

    _save_plot(
        per_language['Score d\'engagement'].sort_values(ascending=False),
        'engagement_score.png', (10, 6),
        'Score d\'engagement moyen par langage', 'Langage', 'Score d\'engagement',
        kind='bar', color='lightgreen',
    )

    if 'yearly_trend' in results:
        _save_plot(
            results['yearly_trend'].sum(axis=1), 'yearly_trend.png', (12, 7),
            'Création de dépôts par année', 'Année', 'Nombre de dépôts',
            grid=True, kind='line', marker='o',
        )

    return results, "Analyse exploratoire terminée"

#Préparation des données pour le modèle de prédiction
def prepare_for_prediction(df, top_languages=10):
    """Build per-language yearly series for the most common languages.

    Args:
        df: DataFrame with a 'primary_language' column and, for a usable
            result, a 'year_created' column.
        top_languages: How many of the most frequent languages to consider.

    Returns:
        (time_series, normalized_time_series, languages, message).
        On success, `time_series` counts repositories per year (rows) and
        language (columns), restricted to languages with at least 4 years of
        non-zero counts; `normalized_time_series` expresses each row as a
        percentage of that row's total; `languages` lists the retained columns.
        When 'year_created' is missing or no language has enough years, the two
        frames are None and `languages` is the unfiltered top-language list.

    Side effects:
        On success, saves 'language_trends.png' showing the normalized series
        of up to five retained languages.
    """
    insufficient_msg = "Données temporelles insuffisantes pour la prédiction"

    # Most frequent languages and the rows that belong to them
    top_langs = df['primary_language'].value_counts().head(top_languages).index.tolist()
    df_top = df[df['primary_language'].isin(top_langs)].copy()

    if 'year_created' not in df.columns:
        return None, None, top_langs, insufficient_msg

    # Rows: creation year, columns: language, cells: repository count
    time_series = df_top.groupby(['year_created', 'primary_language']).size().unstack().fillna(0)

    # Keep the languages observed in enough distinct years
    min_years_required = 4
    valid_columns = [
        col for col in time_series.columns
        if (time_series[col] > 0).sum() >= min_years_required
    ]

    if not valid_columns:
        return None, None, top_langs, insufficient_msg

    time_series = time_series[valid_columns]

    # Share of each language within its year, as a percentage
    normalized_time_series = time_series.div(time_series.sum(axis=1), axis=0) * 100

    # Plot the relative popularity of up to five retained languages
    plt.figure(figsize=(12, 7))
    for lang in valid_columns[:5]:
        normalized_time_series[lang].plot(label=lang)

    plt.title('Popularité relative des principaux langages de programmation')
    plt.xlabel('Année')
    plt.ylabel('Pourcentage (%)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('language_trends.png')
    plt.close()

    return time_series, normalized_time_series, valid_columns, "Données de séries temporelles préparées pour la prédiction"

#Modèle de prédiction Prophet pour chaque langage
def build_prophet_models(time_series, normalized_time_series, top_langs, years_to_predict=5):
    """Fit one Prophet model per language and forecast the coming years.

    Args:
        time_series: DataFrame whose index values are years and whose columns
            are languages (repository counts).
        normalized_time_series: Same layout, holding percentages.
        top_langs: Languages to model; those missing from
            `time_series.columns` are ignored.
        years_to_predict: Number of yearly points to forecast.

    Returns:
        (predictions, normalized_predictions, model_results, message).
        `predictions` / `normalized_predictions` map a language to a DataFrame
        with columns 'ds', 'yhat', 'yhat_lower', 'yhat_upper'. `model_results`
        is a list of dicts with 'language', 'status' and, on success, 'mape'
        and 'figure_path'. `message` is a status string that embeds the MAPE
        summary.

    Side effects:
        Saves 'prediction_<language>.png' for each fitted language and
        'language_predictions.png' for the combined chart.
    """
    predictions = {}
    normalized_predictions = {}
    mape_scores = {}
    model_results = []

    for language in top_langs:
        if language in time_series.columns:
            lang_results = {"language": language}

            # Prophet input format (ds, y): each year is stamped on 1 January
            df_prophet = pd.DataFrame({
                'ds': pd.to_datetime(time_series.index.astype(str) + '-01-01'),
                'y': time_series[language].values
            })

            df_prophet = df_prophet[df_prophet['y'] > 0].reset_index(drop=True)

            if len(df_prophet) < 5:
                lang_results["status"] = "Données insuffisantes"
                model_results.append(lang_results)
                continue

            # Build and fit the model
            try:
                model = Prophet(
                    yearly_seasonality=True,
                    seasonality_mode='multiplicative',
                    changepoint_prior_scale=0.005,
                    # Extra parameters meant to improve robustness
                    changepoint_range=0.8,
                    interval_width=0.8
                )

                model.fit(df_prophet)

                # 'YS' (year start) keeps the forecast points on 1 January like
                # the history. 'Y' produces year-end dates, where the yearly
                # seasonal component was never observed.
                future = model.make_future_dataframe(periods=years_to_predict, freq='YS')
                forecast = model.predict(future)

                # Counts cannot be negative
                forecast['yhat'] = np.maximum(forecast['yhat'], 0)
                forecast['yhat_lower'] = np.maximum(forecast['yhat_lower'], 0)

                # Store the predictions
                predictions[language] = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

                # MAPE computed on the historical points
                historical_dates = df_prophet['ds'].values
                forecast_historical = forecast[forecast['ds'].isin(historical_dates)]

                comparison = pd.merge(
                    df_prophet[['ds', 'y']],
                    forecast_historical[['ds', 'yhat']],
                    on='ds'
                )

                comparison = comparison[(comparison['y'] > 0) & (comparison['yhat'] > 0)]

                if not comparison.empty:
                    abs_perc_errors = np.abs((comparison['y'] - comparison['yhat']) / comparison['y']) * 100
                    mape = abs_perc_errors.mean()
                    mape = min(mape, 100)  # Cap the MAPE at 100% for readability
                    mape_scores[language] = mape
                    lang_results["mape"] = f"{mape:.2f}%"

                # Chart of the prediction for this language
                fig = plt.figure(figsize=(10, 6))
                ax = fig.add_subplot(111)

                # Historical data
                ax.plot(df_prophet['ds'], df_prophet['y'], 'ko', markersize=6, label='Réel')

                # Forecast and its interval
                ax.plot(forecast['ds'], forecast['yhat'], 'steelblue', linewidth=2, label='Prévision')
                ax.fill_between(forecast['ds'], forecast['yhat_lower'], forecast['yhat_upper'], color='steelblue', alpha=0.2)

                plt.title(f'Prédiction pour {language}')
                plt.xlabel('Année')
                plt.ylabel('Nombre de dépôts')
                plt.legend()
                plt.grid(True, linestyle='--', alpha=0.7)
                plt.tight_layout()

                # Save one figure per language
                fig_path = f'prediction_{language}.png'
                plt.savefig(fig_path)
                plt.close()

                lang_results["status"] = "Succès"
                lang_results["figure_path"] = fig_path

                # Second model on the percentage series
                if language in normalized_time_series.columns:
                    df_prophet_norm = pd.DataFrame({
                        'ds': pd.to_datetime(normalized_time_series.index.astype(str) + '-01-01'),
                        'y': normalized_time_series[language].values
                    })

                    df_prophet_norm = df_prophet_norm[df_prophet_norm['y'] > 0].reset_index(drop=True)

                    if len(df_prophet_norm) >= 5:
                        model_norm = Prophet(
                            yearly_seasonality=True,
                            seasonality_mode='multiplicative',
                            changepoint_prior_scale=0.05,
                            changepoint_range=0.8,
                            interval_width=0.95
                        )
                        model_norm.fit(df_prophet_norm)

                        # Same year-start alignment as for the count model
                        future_norm = model_norm.make_future_dataframe(periods=years_to_predict, freq='YS')
                        forecast_norm = model_norm.predict(future_norm)

                        # Percentages are kept within [0, 100]
                        forecast_norm['yhat'] = np.clip(forecast_norm['yhat'], 0, 100)
                        forecast_norm['yhat_lower'] = np.clip(forecast_norm['yhat_lower'], 0, 100)
                        forecast_norm['yhat_upper'] = np.clip(forecast_norm['yhat_upper'], 0, 100)

                        normalized_predictions[language] = forecast_norm[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

            except Exception as e:
                # A failure for one language is recorded and the loop goes on.
                lang_results["status"] = f"Erreur: {str(e)}"

            model_results.append(lang_results)

    # Text summary of the MAPE values
    mape_summary = "\n=== Erreur MAPE pour chaque langage ===\n"
    for lang, mape in mape_scores.items():
        mape_summary += f"{lang}: {mape:.2f}%\n"

    # Combined chart for the first five languages
    plt.figure(figsize=(14, 8))

    for lang in top_langs[:5]:
        if lang in normalized_predictions:
            pred = normalized_predictions[lang]
            plt.plot(pred['ds'], pred['yhat'], label=lang)
            plt.fill_between(pred['ds'], pred['yhat_lower'], pred['yhat_upper'], alpha=0.2)

    plt.title('Part de marché relative prédite des langages de programmation')
    plt.xlabel('Année')
    plt.ylabel('Pourcentage relatif (%)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.savefig('language_predictions.png')
    plt.close()

    return predictions, normalized_predictions, model_results, f"Modèles Prophet construits avec succès{mape_summary}"

#Générer des insights et des recommandations
def generate_insights(time_series, normalized_predictions, top_langs, feature_importance):
    """Assemble a plain-text report of growth trends and recommendations.

    Args:
        time_series: Not read by this function.
        normalized_predictions: Dict mapping a language to a forecast
            DataFrame with 'ds' and 'yhat' columns; may be empty.
        top_langs: Languages to look up in `normalized_predictions`.
        feature_importance: DataFrame with 'Feature' and 'Importance' columns,
            or None.

    Returns:
        The report as a single string with lines joined by a newline.
    """
    lines = []

    if normalized_predictions:
        # Relative change between the first and last future forecast point
        growth_rates = {}
        for lang in top_langs:
            if lang not in normalized_predictions:
                continue

            forecast = normalized_predictions[lang]
            upcoming = forecast[forecast['ds'] > datetime.now()]
            if upcoming.empty or len(upcoming) < 2:
                continue

            first_value = upcoming['yhat'].iloc[0]
            last_value = upcoming['yhat'].iloc[-1]
            if first_value > 0:
                change = ((last_value - first_value) / first_value) * 100
                # Bound the rate to [-100, 500]
                growth_rates[lang] = np.clip(change, -100, 500)

        # Highest growth first, then split on the sign of the rate
        ranked = sorted(growth_rates.items(), key=lambda pair: pair[1], reverse=True)
        growing_langs = [pair for pair in ranked if pair[1] > 0]
        declining_langs = [pair for pair in ranked if not pair[1] > 0]

        lines.append("Langages avec la plus forte croissance prévue:")
        lines.extend(
            f"- {lang}: {rate:.2f}% de croissance relative attendue"
            for lang, rate in growing_langs[:3]
        )
        if not growing_langs:
            lines.append("- Aucun langage avec une croissance positive identifiée")

        lines.append("\nLangages en déclin:")
        lines.extend(
            f"- {lang}: {rate:.2f}% de croissance relative attendue"
            for lang, rate in declining_langs[:3]
        )
        if not declining_langs:
            lines.append("- Aucun langage en déclin identifié")

    if feature_importance is not None:
        # Features named 'lang_<language>' carry the per-language importance
        lang_features = [name for name in feature_importance['Feature'] if name.startswith('lang_')]

        if lang_features:
            lang_rows = feature_importance[feature_importance['Feature'].isin(lang_features)]

            lines.append("\nFacteurs de succès par langage:")
            for _, row in lang_rows.head(5).iterrows():
                lang_name = row['Feature'].replace('lang_', '')
                lines.append(f"- {lang_name}: Score d'importance {row['Importance']:.4f}")

    # General recommendations, then a note on how reliable the forecasts are
    lines += [
        "\nRecommandations pour les universités et les décideurs:",
        "1. Se concentrer sur l'enseignement des langages à forte croissance",
        "2. Intégrer des projets pratiques utilisant ces langages",
        "3. Développer des partenariats avec des entreprises utilisant ces technologies",
        "4. Surveiller l'évolution des écosystèmes autour de ces langages",
        "\nNote sur la fiabilité des prédictions:",
        "Les prédictions à long terme doivent être interprétées avec prudence.",
        "Les valeurs MAPE indiquent la précision du modèle (plus le pourcentage est bas, plus le modèle est précis).",
        "Les langages avec moins de points de données historiques peuvent avoir des prédictions moins fiables.",
    ]

    return "\n".join(lines)

# Fonctions d'interface Gradio
def fetch_data():
    """Load the documents from MongoDB, preprocess them and package the result for the UI.

    Returns:
        A 4-tuple (status message, HTML preview of the first 10 rows,
        record-count text, DataFrame serialised as JSON with ISO dates).
        When no data could be loaded the last three items are None and the
        message is the one returned by get_data_from_mongodb().
    """
    data, message = get_data_from_mongodb()
    if data:
        df, preprocess_msg = preprocess_data(data)

        # The DataFrame travels through the Gradio state as a JSON string
        serialized = df.to_json(date_format='iso')
        status = f"{message}\n{preprocess_msg}"
        preview_html = df.head(10).to_html()
        total_text = f"Total des enregistrements: {len(df)}"
        return status, preview_html, total_text, serialized

    return message, None, None, None

def run_analysis(df_json, top_n, prediction_years):
    """Gradio callback running the EDA, the Prophet forecasts and the regression model.

    Args:
        df_json: JSON-serialised DataFrame produced by ``fetch_data`` (or a
            falsy value when no data has been fetched yet).
        top_n: Number of most frequent languages to analyse (cast to ``int``).
        prediction_years: Forecast horizon in years (cast to ``int``).

    Returns:
        Always a 6-tuple, one slot per output wired to this callback:
        ``(status text, distribution image path, trend image path,
        prediction image path, insights HTML, MAPE table HTML)``.
        Slots that cannot be produced are ``None``.
    """
    from html import escape
    from io import StringIO

    # Every exit path must return six values: the click handler declares six outputs.
    if not df_json:
        return ("Aucune donnée disponible. Veuillez d'abord récupérer les données.",
                None, None, None, None, None)

    # Wrap the payload in a file-like object: passing a literal JSON string
    # to read_json is deprecated in recent pandas releases.
    df = pd.read_json(StringIO(df_json))

    top_n = int(top_n)
    prediction_years = int(prediction_years)

    _, eda_msg = exploratory_analysis(df)

    # Build the per-year time series used by the forecasting step.
    time_series, normalized_time_series, top_langs, prep_msg = prepare_for_prediction(df, top_languages=top_n)

    if time_series is None:
        return f"{eda_msg}\n{prep_msg}", "top_languages.png", "engagement_score.png", None, None, None

    _, normalized_predictions, model_results, prophet_msg = build_prophet_models(
        time_series, normalized_time_series, top_langs, years_to_predict=prediction_years
    )

    _, _, feature_importance, model_metrics = build_regression_model(df, top_langs)
    insights = generate_insights(time_series, normalized_predictions, top_langs, feature_importance)

    # The status column may carry raw exception text, so every cell is HTML-escaped.
    table_rows = []
    for result in model_results:
        language = escape(str(result.get("language", "")))
        mape = escape(str(result.get("mape", "N/A")))
        status = escape(str(result.get("status", "")))
        table_rows.append(f"<tr><td>{language}</td><td>{mape}</td><td>{status}</td></tr>")

    mape_table = (
        "<h3>Erreur MAPE par langage</h3><table>"
        "<tr><th>Langage</th><th>MAPE</th><th>Statut</th></tr>"
        + "".join(table_rows)
        + "</table>"
    )

    status_message = f"{eda_msg}\n{prep_msg}\n{prophet_msg}"
    if isinstance(model_metrics, dict):
        status_message += f"\nMétriques du modèle de régression: {model_metrics}"
    else:
        # On failure build_regression_model returns a plain message instead of a dict.
        status_message += f"\n{model_metrics}"

    insights_html = insights.replace("\n", "<br>")
    insights_html = f"<p>{insights_html}</p>"

    return status_message, "top_languages.png", "language_trends.png", "language_predictions.png", insights_html, mape_table

# Fonction build_regression_model
def build_regression_model(df, top_langs):
    """Train a random-forest regressor predicting ``engagement_score``.

    Rows are restricted to the languages in ``top_langs`` and the primary
    language is one-hot encoded into ``lang_*`` columns. The model and the
    fitted scaler are written under ``models/`` and a feature-importance bar
    chart is saved to ``feature_importance.png``.

    Returns:
        ``(model, scaler, feature_importance DataFrame, metrics dict)``, or
        ``(None, None, None, message)`` when the needed columns are missing.
    """
    target = 'engagement_score'

    # Restrict to the requested languages, then one-hot encode the language.
    encoded = pd.get_dummies(
        df[df['primary_language'].isin(top_langs)].copy(),
        columns=['primary_language'],
        prefix='lang',
    )

    language_columns = [name for name in encoded.columns if name.startswith('lang_')]
    candidates = ['repo_age_days', 'year_created'] + language_columns
    # Only keep the candidate features that are actually present.
    features = [name for name in candidates if name in encoded.columns]

    if target not in encoded.columns or not features:
        return None, None, None, "Données insuffisantes pour le modèle de régression"

    # 80/20 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        encoded[features], encoded[target], test_size=0.2, random_state=42
    )

    # Standardise using statistics from the training part only.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    regressor.fit(train_scaled, y_train)

    # Hold-out evaluation.
    predicted = regressor.predict(test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, predicted))
    mae = mean_absolute_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)

    importance_table = pd.DataFrame(
        {'Feature': features, 'Importance': regressor.feature_importances_}
    )
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    # Chart of the ten most important features.
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance.head(10))
    plt.title("Importance des caractéristiques pour la prédiction du score d'engagement")
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

    # Persist the fitted model and its scaler.
    joblib.dump(regressor, 'models/language_popularity_model.pkl')
    joblib.dump(scaler, 'models/language_popularity_scaler.pkl')

    model_metrics = {
        label: f"{value:.4f}"
        for label, value in (('RMSE', rmse), ('MAE', mae), ('R²', r2))
    }

    return regressor, scaler, feature_importance, model_metrics

# Créer l'interface Gradio
def create_interface():
    """Assemble the two-tab Gradio application and return it (not launched).

    The first tab fetches the data through ``fetch_data``; the second runs
    ``run_analysis`` on the fetched data with the two slider values.
    """
    with gr.Blocks(title="Analyse des tendances des langages GitHub") as demo:
        gr.Markdown("# Outil d'analyse des tendances des langages GitHub")
        gr.Markdown("Analysez et prédisez les tendances des langages de programmation basées sur les données des dépôts GitHub")

        # Shared state carrying the JSON-serialised DataFrame between callbacks.
        dataset_state = gr.State(value=None)

        with gr.Tab("Collecte de données"):
            load_button = gr.Button("Récupérer les données depuis MongoDB")
            load_status = gr.Textbox(label="Statut")
            preview_html = gr.HTML(label="Aperçu des données")
            dataset_info = gr.Textbox(label="Infos du jeu de données")

            load_outputs = [load_status, preview_html, dataset_info, dataset_state]
            load_button.click(fn=fetch_data, outputs=load_outputs)

        with gr.Tab("Analyse & Prédiction"):
            with gr.Row():
                language_slider = gr.Slider(minimum=5, maximum=20, value=10, step=1, label="Nombre de langages principaux")
                horizon_slider = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Années à prédire")

            analyse_button = gr.Button("Lancer l'analyse")

            analysis_status = gr.Textbox(label="Statut de l'analyse")

            with gr.Row():
                distribution_image = gr.Image(label="Distribution des langages")
                trend_image = gr.Image(label="Tendances des langages au fil du temps")

            forecast_image = gr.Image(label="Prédictions futures")

            insights_panel = gr.HTML(label="Insights & Recommandations")
            mape_panel = gr.HTML(label="Erreur MAPE par langage")

            # Six outputs: run_analysis has to return one value per component.
            analysis_inputs = [dataset_state, language_slider, horizon_slider]
            analysis_outputs = [
                analysis_status,
                distribution_image,
                trend_image,
                forecast_image,
                insights_panel,
                mape_panel,
            ]
            analyse_button.click(fn=run_analysis, inputs=analysis_inputs, outputs=analysis_outputs)

    return demo
    
def main():
    """Build the Gradio interface and launch it with a public share link."""
    create_interface().launch(share=True)


if __name__ == "__main__":
    main()

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://b392e97002053d2582.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)



 Prétraitement terminé : 869 lignes et 15 colonnes restantes après nettoyage.
 Langages uniques après filtrage : 46
 Période couverte : 2008 - 2024


23:03:53 - cmdstanpy - INFO - Chain [1] start processing
23:03:53 - cmdstanpy - INFO - Chain [1] done processing
23:03:54 - cmdstanpy - INFO - Chain [1] start processing
23:03:55 - cmdstanpy - INFO - Chain [1] done processing
23:03:55 - cmdstanpy - INFO - Chain [1] start processing
23:03:55 - cmdstanpy - INFO - Chain [1] done processing
23:03:56 - cmdstanpy - INFO - Chain [1] start processing
23:03:56 - cmdstanpy - INFO - Chain [1] done processing
23:03:56 - cmdstanpy - INFO - Chain [1] start processing
23:03:56 - cmdstanpy - INFO - Chain [1] done processing
23:03:58 - cmdstanpy - INFO - Chain [1] start processing
23:03:58 - cmdstanpy - INFO - Chain [1] done processing
23:03:58 - cmdstanpy - INFO - Chain [1] start processing
23:03:58 - cmdstanpy - INFO - Chain [1] done processing
23:03:59 - cmdstanpy - INFO - Chain [1] start processing
23:03:59 - cmdstanpy - INFO - Chain [1] done processing
23:04:00 - cmdstanpy - INFO - Chain [1] start processing
23:04:00 - cmdstanpy - INFO - Chain [1]


 Prétraitement terminé : 869 lignes et 15 colonnes restantes après nettoyage.
 Langages uniques après filtrage : 46
 Période couverte : 2008 - 2024


23:05:54 - cmdstanpy - INFO - Chain [1] start processing
23:05:54 - cmdstanpy - INFO - Chain [1] done processing
23:05:56 - cmdstanpy - INFO - Chain [1] start processing
23:05:56 - cmdstanpy - INFO - Chain [1] done processing
23:05:56 - cmdstanpy - INFO - Chain [1] start processing
23:05:56 - cmdstanpy - INFO - Chain [1] done processing
23:05:57 - cmdstanpy - INFO - Chain [1] start processing
23:05:57 - cmdstanpy - INFO - Chain [1] done processing
23:05:58 - cmdstanpy - INFO - Chain [1] start processing
23:05:58 - cmdstanpy - INFO - Chain [1] done processing
23:05:59 - cmdstanpy - INFO - Chain [1] start processing
23:05:59 - cmdstanpy - INFO - Chain [1] done processing
23:05:59 - cmdstanpy - INFO - Chain [1] start processing
23:05:59 - cmdstanpy - INFO - Chain [1] done processing
23:06:00 - cmdstanpy - INFO - Chain [1] start processing
23:06:00 - cmdstanpy - INFO - Chain [1] done processing
23:06:01 - cmdstanpy - INFO - Chain [1] start processing
23:06:01 - cmdstanpy - INFO - Chain [1]

# Step 3 : Proposer une solution d’évaluation et de monitoring de vos modèles ML 

### Je n'ai pas eu le temps de finir cette partie 

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
from prophet import Prophet
import warnings
import os
import gradio as gr
import matplotlib
import json
import time

# Select the non-interactive 'Agg' matplotlib backend (figures are only saved to files for Gradio)
matplotlib.use('Agg')

# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# Create the output directories used for saved models and metric logs
os.makedirs('models', exist_ok=True)
os.makedirs('metrics', exist_ok=True)


#Intégration de MLflow pour le suivi des métriques
import mlflow
import mlflow.sklearn

def setup_mlflow_tracking():
    """Point MLflow at the local SQLite store and select the project experiment."""
    tracking_uri = "sqlite:///mlflow.db"
    experiment_name = "github_language_trends"
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

def log_model_metrics(metrics, model, params=None, artifacts=None):
    """Record one MLflow run for a trained scikit-learn model.

    Args:
        metrics: Mapping of metric name to a value convertible with ``float``.
        model: Fitted estimator, logged under the artifact path ``"model"``.
        params: Optional mapping of parameter name to value.
        artifacts: Optional mapping of artifact directory name to local file path.
    """
    with mlflow.start_run():
        for param_name, param_value in (params or {}).items():
            mlflow.log_param(param_name, param_value)

        # Metric values may arrive as formatted strings, hence the float() cast.
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, float(metric_value))

        mlflow.sklearn.log_model(model, "model")

        for artifact_dir, local_path in (artifacts or {}).items():
            mlflow.log_artifact(local_path, artifact_dir)


# Connexion à MongoDB et récupération des données

def get_data_from_mongodb():
    """Fetch every document of ``Projet_BigData.Data_Kafka`` from MongoDB.

    The connection string is read from the ``MONGODB_URI`` environment
    variable instead of being embedded in the source.
    NOTE(review): the credentials that used to be hard-coded here have been
    exposed in the notebook and should be rotated.

    Returns:
        ``(documents, message)``: the list of documents and a status message.
        On any failure (missing variable, connection or query error) the list
        is empty and the message describes the error.
    """
    mongo_uri = os.getenv("MONGODB_URI")
    if not mongo_uri:
        return [], "Erreur lors de la récupération des données: variable d'environnement MONGODB_URI non définie"

    client = None
    try:
        client = MongoClient(mongo_uri)
        collection = client["Projet_BigData"]["Data_Kafka"]
        data = list(collection.find({}))
        return data, f"Récupération réussie de {len(data)} documents depuis MongoDB"
    except Exception as e:
        return [], f"Erreur lors de la récupération des données: {e}"
    finally:
        # Close the connection even when the query raised.
        if client is not None:
            client.close()


# Prétraitement des données

def preprocess_data(data):
    """Turn the raw MongoDB documents into an analysis-ready DataFrame.

    Steps: drop ``_id``, parse the timestamp columns that are present and
    strip their timezone, derive ``repo_age_days`` / ``year_created`` /
    ``month_created`` from ``created_at``, drop rows without a primary
    language and compute ``engagement_score``.

    Returns:
        ``(DataFrame, message)`` where the message reports the final shape.
    """
    frame = pd.DataFrame(data)
    if '_id' in frame.columns:
        frame = frame.drop('_id', axis=1)

    # First parse every timestamp column that exists, then make them all tz-naive.
    date_columns = [
        name for name in ('created_at', 'updated_at', 'imported_at')
        if name in frame.columns
    ]
    for name in date_columns:
        frame[name] = pd.to_datetime(frame[name])
    for name in date_columns:
        frame[name] = frame[name].dt.tz_localize(None)

    if 'created_at' in frame.columns:
        created = frame['created_at']
        frame['repo_age_days'] = (datetime.now() - created).dt.days
        frame['year_created'] = created.dt.year
        frame['month_created'] = created.dt.month

    frame = frame[frame['primary_language'].notna()]

    # Weighted popularity (stars x2, forks x3, watchers x1) per day of existence;
    # the +1 keeps the denominator non-zero for repositories created today.
    weighted_popularity = frame['stars'] * 2 + frame['forks'] * 3 + frame['watchers']
    frame['engagement_score'] = weighted_popularity / (frame['repo_age_days'] + 1)

    n_rows, n_cols = frame.shape
    return frame, f"Jeu de données prétraité avec {n_rows} lignes et {n_cols} colonnes"


# Analyse exploratoire des données

def exploratory_analysis(df):
    """Compute descriptive statistics per language and save the EDA charts.

    Writes ``top_languages.png`` and ``engagement_score.png``, plus
    ``yearly_trend.png`` when ``year_created`` is available.

    Returns:
        ``(results, message)`` where ``results`` holds ``language_counts``,
        ``language_stats`` and, when available, ``yearly_trend``.
    """
    def _save_and_close(path):
        # Shared tail of every chart: tidy the layout, write the file, free the figure.
        plt.tight_layout()
        plt.savefig(path)
        plt.close()

    # Ten most frequent primary languages.
    top_counts = df['primary_language'].value_counts().head(10)

    # Per-language averages, restricted to the ten languages with most repositories.
    aggregations = {
        'stars': 'mean',
        'forks': 'mean',
        'watchers': 'mean',
        'engagement_score': 'mean',
        'id': 'count',
    }
    per_language = df.groupby('primary_language').agg(aggregations)
    per_language = per_language.sort_values('id', ascending=False).head(10)
    per_language.columns = ['Étoiles (moy)', 'Forks (moy)', 'Observateurs (moy)', "Score d'engagement", 'Nombre de dépôts']

    results = {'language_counts': top_counts, 'language_stats': per_language}

    has_years = 'year_created' in df.columns
    if has_years:
        # Repositories created per (year, language); missing combinations count as 0.
        results['yearly_trend'] = (
            df.groupby(['year_created', 'primary_language']).size().unstack().fillna(0)
        )

    plt.figure(figsize=(10, 6))
    top_counts.plot(kind='bar', color='skyblue')
    plt.title('Top 10 des langages de programmation')
    plt.xlabel('Langage')
    plt.ylabel('Nombre de dépôts')
    _save_and_close('top_languages.png')

    plt.figure(figsize=(10, 6))
    engagement = per_language["Score d'engagement"].sort_values(ascending=False)
    engagement.plot(kind='bar', color='lightgreen')
    plt.title("Score d'engagement moyen par langage")
    plt.xlabel('Langage')
    plt.ylabel("Score d'engagement")
    _save_and_close('engagement_score.png')

    if has_years:
        plt.figure(figsize=(12, 7))
        results['yearly_trend'].sum(axis=1).plot(kind='line', marker='o')
        plt.title('Création de dépôts par année')
        plt.xlabel('Année')
        plt.ylabel('Nombre de dépôts')
        plt.grid(True, linestyle='--', alpha=0.7)
        _save_and_close('yearly_trend.png')

    return results, "Analyse exploratoire terminée"


# Préparation des données pour la prédiction

def prepare_for_prediction(df, top_languages=10):
    """Build the yearly repository-count series used by the forecasting step.

    Only the ``top_languages`` most frequent languages are considered, and a
    language is kept only if it has repositories in at least 4 distinct years.
    A chart of the relative shares is saved to ``language_trends.png``.

    Returns:
        ``(time_series, normalized_time_series, languages, message)``. The
        first two are ``None`` when ``year_created`` is missing or when no
        language has enough history; ``languages`` is then the unfiltered
        top-language list.
    """
    insufficient_msg = "Données temporelles insuffisantes pour la prédiction"
    top_langs = df['primary_language'].value_counts().head(top_languages).index.tolist()

    if 'year_created' not in df.columns:
        return None, None, top_langs, insufficient_msg

    subset = df[df['primary_language'].isin(top_langs)].copy()
    counts = subset.groupby(['year_created', 'primary_language']).size().unstack().fillna(0)

    # Keep the languages observed (count > 0) in enough distinct years.
    min_years_required = 4
    valid_columns = [
        column for column in counts.columns
        if (counts[column] > 0).sum() >= min_years_required
    ]
    if not valid_columns:
        return None, None, top_langs, insufficient_msg

    time_series = counts[valid_columns]

    # Share (in %) of each language among the retained languages, per year.
    normalized_time_series = time_series.div(time_series.sum(axis=1), axis=0) * 100

    plt.figure(figsize=(12, 7))
    for language in valid_columns[:5]:
        normalized_time_series[language].plot(label=language)
    plt.title('Popularité relative des principaux langages de programmation')
    plt.xlabel('Année')
    plt.ylabel('Pourcentage (%)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('language_trends.png')
    plt.close()

    return time_series, normalized_time_series, valid_columns, "Données de séries temporelles préparées pour la prédiction"


# Modèle de prédiction Prophet pour chaque langage

def build_prophet_models(time_series, normalized_time_series, top_langs, years_to_predict=5):
    """Fit one Prophet model per language on the yearly counts and on the yearly shares.

    For every language of ``top_langs`` present in ``time_series`` this:
      * fits a model on the non-zero yearly counts and forecasts
        ``years_to_predict`` additional periods,
      * computes an in-sample MAPE (capped at 100) against the historical points,
      * saves a chart to ``prediction_<language>.png``,
      * fits a second model on the normalized (percentage) series, with the
        forecast clipped to the 0-100 range.
    A combined chart of the first five languages is saved to
    ``language_predictions.png``.

    Returns:
        ``(predictions, normalized_predictions, model_results, message)`` where
        the first two map a language to a forecast DataFrame
        (``ds``, ``yhat``, ``yhat_lower``, ``yhat_upper``), ``model_results`` is
        a list of per-language dicts (``language``, ``status`` and, when
        available, ``mape`` and ``figure_path``) and ``message`` embeds the
        MAPE summary.
    """
    predictions = {}
    normalized_predictions = {}
    mape_scores = {}
    model_results = []
    # Prophet hyper-parameters: one set for the raw counts, one for the normalized shares
    prophet_params = {"changepoint_prior_scale": 0.02, "changepoint_range": 0.8, "interval_width": 0.95}
    prophet_norm_params = {"changepoint_prior_scale": 0.05, "changepoint_range": 0.8, "interval_width": 0.95}
    for language in top_langs:
        if language in time_series.columns:
            lang_results = {"language": language}
            # Prophet expects a 'ds' date column and a 'y' value column; each year maps to January 1st
            df_prophet = pd.DataFrame({
                'ds': pd.to_datetime(time_series.index.astype(str) + '-01-01'),
                'y': time_series[language].values
            })
            # Years without any repository are dropped rather than modelled as zeros
            df_prophet = df_prophet[df_prophet['y'] > 0].reset_index(drop=True)
            if len(df_prophet) < 5:
                lang_results["status"] = "Données insuffisantes"
                model_results.append(lang_results)
                continue
            try:
                model = Prophet(**prophet_params)
                model.fit(df_prophet)
                # NOTE(review): freq='Y' is presumably the year-END alias, while the history is
                # stamped on January 1st - confirm that the future dates are the intended ones
                future = model.make_future_dataframe(periods=years_to_predict, freq='Y')
                forecast = model.predict(future)
                # Counts cannot be negative: floor the forecast and its lower bound at 0
                forecast['yhat'] = np.maximum(forecast['yhat'], 0)
                forecast['yhat_lower'] = np.maximum(forecast['yhat_lower'], 0)
                predictions[language] = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
                # In-sample error: compare the fitted values with the observed history
                historical_dates = df_prophet['ds'].values
                forecast_hist = forecast[forecast['ds'].isin(historical_dates)]
                comparison = pd.merge(df_prophet[['ds', 'y']], forecast_hist[['ds', 'yhat']], on='ds')
                comparison = comparison[(comparison['y'] > 0) & (comparison['yhat'] > 0)]
                if not comparison.empty:
                    abs_perc_errors = np.abs((comparison['y'] - comparison['yhat']) / comparison['y']) * 100
                    # The reported MAPE is capped at 100
                    mape = min(abs_perc_errors.mean(), 100)
                    mape_scores[language] = mape
                    lang_results["mape"] = f"{mape:.2f}%"
                plt.figure(figsize=(10, 6))
                plt.plot(df_prophet['ds'], df_prophet['y'], 'ko', markersize=6, label='Réel')
                plt.plot(forecast['ds'], forecast['yhat'], 'steelblue', linewidth=2, label='Prévision')
                plt.fill_between(forecast['ds'], forecast['yhat_lower'], forecast['yhat_upper'], color='steelblue', alpha=0.2)
                plt.title(f'Prédiction pour {language}')
                plt.xlabel('Année')
                plt.ylabel('Nombre de dépôts')
                plt.legend()
                plt.grid(True, linestyle='--', alpha=0.7)
                plt.tight_layout()
                fig_path = f'prediction_{language}.png'
                plt.savefig(fig_path)
                plt.close()
                lang_results["status"] = "Succès"
                lang_results["figure_path"] = fig_path
                # Second model on the relative share (%) of the language
                if language in normalized_time_series.columns:
                    df_prophet_norm = pd.DataFrame({
                        'ds': pd.to_datetime(normalized_time_series.index.astype(str) + '-01-01'),
                        'y': normalized_time_series[language].values
                    })
                    df_prophet_norm = df_prophet_norm[df_prophet_norm['y'] > 0].reset_index(drop=True)
                    if len(df_prophet_norm) >= 5:
                        model_norm = Prophet(**prophet_norm_params)
                        model_norm.fit(df_prophet_norm)
                        future_norm = model_norm.make_future_dataframe(periods=years_to_predict, freq='Y')
                        forecast_norm = model_norm.predict(future_norm)
                        # A percentage must stay within [0, 100]
                        forecast_norm['yhat'] = np.clip(forecast_norm['yhat'], 0, 100)
                        forecast_norm['yhat_lower'] = np.clip(forecast_norm['yhat_lower'], 0, 100)
                        forecast_norm['yhat_upper'] = np.clip(forecast_norm['yhat_upper'], 0, 100)
                        normalized_predictions[language] = forecast_norm[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
            except Exception as e:
                # Any failure above replaces the status (even an earlier "Succès") with the error text
                lang_results["status"] = f"Erreur: {str(e)}"
            model_results.append(lang_results)
    mape_summary = "\n=== Erreur MAPE pour chaque langage ===\n"
    for lang, mape in mape_scores.items():
        mape_summary += f"{lang}: {mape:.2f}%\n"
    # Combined chart of the predicted shares for the first five languages
    plt.figure(figsize=(14, 8))
    for i, lang in enumerate(top_langs[:5]):
        if lang in normalized_predictions:
            pred = normalized_predictions[lang]
            plt.plot(pred['ds'], pred['yhat'], label=lang)
            plt.fill_between(pred['ds'], pred['yhat_lower'], pred['yhat_upper'], alpha=0.2)
    plt.title('Part de marché relative prédite des langages de programmation')
    plt.xlabel('Année')
    plt.ylabel('Pourcentage relatif (%)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.savefig('language_predictions.png')
    plt.close()
    return predictions, normalized_predictions, model_results, f"Modèles Prophet construits avec succès{mape_summary}"

# Générer des insights et recommandations

def generate_insights(time_series, normalized_predictions, top_langs, feature_importance):
    """Build the textual insights and recommendations shown to the user.

    Args:
        time_series: Not used by this function.
        normalized_predictions: Mapping language -> forecast DataFrame with
            ``ds`` and ``yhat`` columns (relative share, in %).
        top_langs: Languages to report on.
        feature_importance: DataFrame with ``Feature`` / ``Importance``
            columns, or ``None``.

    Returns:
        A single newline-joined string.
    """
    lines = []

    if normalized_predictions:
        # Relative growth between the first and the last forecast point in the future.
        growth_by_lang = {}
        for lang in top_langs:
            if lang not in normalized_predictions:
                continue
            forecast = normalized_predictions[lang]
            upcoming = forecast[forecast['ds'] > datetime.now()]
            if len(upcoming) < 2:
                continue
            first_share = upcoming['yhat'].iloc[0]
            last_share = upcoming['yhat'].iloc[-1]
            if first_share > 0:
                raw_growth = ((last_share - first_share) / first_share) * 100
                # Bound the rate to [-100, 500] so tiny starting shares do not explode it.
                growth_by_lang[lang] = np.clip(raw_growth, -100, 500)

        ranked = sorted(growth_by_lang.items(), key=lambda pair: pair[1], reverse=True)
        rising = [pair for pair in ranked if pair[1] > 0]
        falling = [pair for pair in ranked if pair[1] <= 0]

        lines.append("Langages avec la plus forte croissance prévue:")
        lines.extend(
            f"- {lang}: {rate:.2f}% de croissance relative attendue" for lang, rate in rising[:3]
        )
        if not rising:
            lines.append("- Aucun langage avec une croissance positive identifiée")

        lines.append("\nLangages en déclin:")
        lines.extend(
            f"- {lang}: {rate:.2f}% de croissance relative attendue" for lang, rate in falling[:3]
        )
        if not falling:
            lines.append("- Aucun langage en déclin identifié")

    if feature_importance is not None:
        language_rows = feature_importance[
            [name.startswith('lang_') for name in feature_importance['Feature']]
        ]
        if len(language_rows) > 0:
            lines.append("\nFacteurs de succès par langage:")
            for _, row in language_rows.head(5).iterrows():
                lang_name = row['Feature'].replace('lang_', '')
                lines.append(f"- {lang_name}: Score d'importance {row['Importance']:.4f}")

    # Generic recommendations followed by the reliability note.
    lines += [
        "\nRecommandations pour les universités et décideurs:",
        "1. Se concentrer sur l'enseignement des langages à forte croissance",
        "2. Intégrer des projets pratiques utilisant ces langages",
        "3. Développer des partenariats avec des entreprises technologiques",
        "4. Surveiller l'évolution des écosystèmes autour de ces langages",
        "\nNote sur la fiabilité des prédictions:",
        "Les prédictions à long terme doivent être interprétées avec prudence.",
        "Les valeurs MAPE indiquent la précision du modèle (plus le pourcentage est bas, plus le modèle est précis).",
        "Les langages avec moins de points de données historiques peuvent avoir des prédictions moins fiables.",
    ]
    return "\n".join(lines)


# Modèle de régression (avec GridSearchCV)

def build_regression_model(df, top_langs):
    """Grid-search a random-forest regressor predicting ``engagement_score``.

    Rows are restricted to ``top_langs`` and the primary language is one-hot
    encoded into ``lang_*`` columns. The best estimator and the fitted scaler
    are written under ``models/``, a feature-importance chart is saved to
    ``feature_importance.png`` and the run is logged through
    ``log_model_metrics``.

    Returns:
        ``(best_model, scaler, feature_importance DataFrame, metrics dict)``,
        or ``(None, None, None, message)`` when the needed columns are missing.
    """
    target = 'engagement_score'

    selected = df[df['primary_language'].isin(top_langs)].copy()
    encoded = pd.get_dummies(selected, columns=['primary_language'], prefix='lang')

    dummy_columns = [name for name in encoded.columns if name.startswith('lang_')]
    features = [
        name for name in ['repo_age_days', 'year_created'] + dummy_columns
        if name in encoded.columns
    ]
    if target not in encoded.columns or not features:
        return None, None, None, "Données insuffisantes pour le modèle de régression"

    X_train, X_test, y_train, y_test = train_test_split(
        encoded[features], encoded[target], test_size=0.2, random_state=42
    )

    # The scaler is fitted on the training part only.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    # 3-fold grid search over forest size and depth, scored on (negative) MSE.
    search = GridSearchCV(
        RandomForestRegressor(random_state=42, n_jobs=-1),
        {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
        cv=3,
        scoring='neg_mean_squared_error',
    )
    search.fit(train_scaled, y_train)
    best_model = search.best_estimator_

    # Hold-out evaluation of the selected estimator.
    predicted = best_model.predict(test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, predicted))
    mae = mean_absolute_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)

    importance_table = pd.DataFrame(
        {'Feature': features, 'Importance': best_model.feature_importances_}
    )
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance.head(10))
    plt.title("Importance des caractéristiques pour la prédiction du score d'engagement")
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

    joblib.dump(best_model, 'models/language_popularity_model.pkl')
    joblib.dump(scaler, 'models/language_popularity_scaler.pkl')

    model_metrics = {
        label: f"{value:.4f}"
        for label, value in (('RMSE', rmse), ('MAE', mae), ('R²', r2))
    }

    log_model_metrics(
        model_metrics,
        best_model,
        params={"top_langs": top_langs},
        artifacts={"feature_importance": "feature_importance.png"},
    )
    return best_model, scaler, feature_importance, model_metrics


# Fonctions de monitoring et réentraînement

def setup_model_monitoring(model_name):
    """Make sure the performance log of ``model_name`` exists and return its path.

    The CSV file is created with its header row only when it does not exist
    yet; an existing log is left untouched.
    """
    os.makedirs('metrics', exist_ok=True)
    log_file = f'metrics/{model_name}_performance.csv'
    if os.path.exists(log_file):
        return log_file

    header = 'timestamp,rmse,mae,r2,data_drift_score,concept_drift_detected\n'
    with open(log_file, 'w') as handle:
        handle.write(header)
    return log_file

def detect_data_drift(X_reference, X_current, threshold=0.05):
    """Detect feature drift with a two-sample Kolmogorov-Smirnov test per column.

    Every non-boolean numeric column of ``X_reference`` that also exists in
    ``X_current`` is compared; drift is flagged as soon as one p-value falls
    below ``threshold``.

    Args:
        X_reference: DataFrame holding the reference distribution.
        X_current: DataFrame holding the data to compare.
        threshold: Significance level applied to each KS p-value.

    Returns:
        ``(drift_detected, avg_drift_score, drift_scores)`` where
        ``drift_scores`` maps each compared column to its KS statistic and
        ``avg_drift_score`` is their mean (``0.0`` when nothing was compared).
    """
    from scipy import stats

    drift_scores = {}
    drift_detected = False
    for feature in X_reference.columns:
        # A column missing from the current data cannot be compared.
        if feature not in X_current.columns:
            continue
        reference_values = X_reference[feature]
        # Accept every numeric dtype (int32, float32, ...), not only the 64-bit ones;
        # boolean columns stay excluded as before.
        if not pd.api.types.is_numeric_dtype(reference_values):
            continue
        if pd.api.types.is_bool_dtype(reference_values):
            continue
        ks_stat, p_value = stats.ks_2samp(reference_values, X_current[feature])
        drift_scores[feature] = ks_stat
        if p_value < threshold:
            drift_detected = True

    # np.mean([]) would yield NaN (with a warning); report 0.0 when nothing was compared.
    avg_drift_score = float(np.mean(list(drift_scores.values()))) if drift_scores else 0.0
    return drift_detected, avg_drift_score, drift_scores

def evaluate_and_retrain(model, X_test, y_test, scaler, features, threshold=0.1, log_file=None):
    """Evaluate ``model`` on the given test set and retrain it when it degrades.

    Retraining is triggered when R² falls below ``threshold`` or when
    ``detect_data_drift`` flags drift against the stored reference data
    (``models/reference_data.pkl``; ``X_test`` itself when no reference exists).

    Args:
        model: Fitted estimator to evaluate.
        X_test, y_test: Evaluation features and targets.
        scaler: Fitted scaler applied to ``X_test`` before predicting.
        features: Feature column names; the ``lang_*`` entries define the
            languages used when retraining.
        threshold: Minimum acceptable R².
        log_file: Optional CSV path to which one metrics row is appended.

    Returns:
        ``(model, scaler, retrained, metrics)``: the new model and scaler when
        retraining succeeded, otherwise the ones passed in. ``metrics`` always
        describes the evaluation of the model that was passed in.
    """
    X_test_scaled = scaler.transform(X_test)
    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    current_metrics = {'RMSE': rmse, 'MAE': mae, 'R²': r2}

    reference_path = 'models/reference_data.pkl'
    X_reference = joblib.load(reference_path) if os.path.exists(reference_path) else X_test
    drift_detected, drift_score, _ = detect_data_drift(X_reference, X_test)

    if log_file:
        with open(log_file, 'a') as f:
            f.write(f"{datetime.now().isoformat()},{rmse},{mae},{r2},{drift_score},{drift_detected}\n")

    should_retrain = (r2 < threshold) or drift_detected
    if not should_retrain:
        return model, scaler, False, current_metrics

    data, _ = get_data_from_mongodb()
    if not data:
        # Fetch failed: keep the current model rather than crash or overwrite it.
        return model, scaler, False, current_metrics
    df, _ = preprocess_data(data)

    # build_regression_model expects language names, not feature column names:
    # recover them from the one-hot columns (lang_<name>).
    prefix = 'lang_'
    top_langs = [name[len(prefix):] for name in features if name.startswith(prefix)]
    if not top_langs:
        # No language column in the feature list: fall back to the ten most frequent languages.
        top_langs = df['primary_language'].value_counts().head(10).index.tolist()

    new_model, new_scaler, _, _ = build_regression_model(df, top_langs)
    if new_model is None or new_scaler is None:
        # Retraining was not possible; never overwrite the saved model with None.
        return model, scaler, False, current_metrics

    joblib.dump(new_model, 'models/language_popularity_model.pkl')
    joblib.dump(new_scaler, 'models/language_popularity_scaler.pkl')
    # The data just evaluated becomes the reference for the next drift check.
    joblib.dump(X_test, reference_path)
    return new_model, new_scaler, True, current_metrics

def run_evaluation(model_name, threshold, df_json):
    """Evaluate the saved regression model on the fetched data and retrain if needed.

    The design matrix is rebuilt the way the model was trained: the primary
    language is one-hot encoded, and when the saved scaler exposes
    ``feature_names_in_`` the columns are aligned on it and the rows are
    restricted to the languages the model knows. The saved scaler is reused;
    a new one is fitted only when none can be loaded.

    Args:
        model_name: Name used for the performance log file.
        threshold: Minimum acceptable R² before retraining.
        df_json: JSON-serialised DataFrame produced by ``fetch_data``.

    Returns:
        ``(status message, "top_languages.png", "yearly_trend.png")``.
    """
    from io import StringIO

    plots = ("top_languages.png", "yearly_trend.png")
    if not df_json:
        return ("Aucune donnée disponible pour l'évaluation.", *plots)

    # Passing a literal JSON string to read_json is deprecated: wrap it.
    df = pd.read_json(StringIO(df_json))

    try:
        model = joblib.load('models/language_popularity_model.pkl')
    except Exception as e:
        return (f"Erreur lors du chargement du modèle: {e}", *plots)

    scaler_path = 'models/language_popularity_scaler.pkl'
    scaler = None
    if os.path.exists(scaler_path):
        try:
            scaler = joblib.load(scaler_path)
        except Exception as e:
            return (f"Erreur lors du chargement du modèle: {e}", *plots)

    target = 'engagement_score'
    if 'primary_language' in df.columns:
        df = pd.get_dummies(df, columns=['primary_language'], prefix='lang')

    trained_columns = getattr(scaler, 'feature_names_in_', None)
    if trained_columns is not None:
        features = list(trained_columns)
        language_columns = [name for name in features if name.startswith('lang_')]
        # A language unseen in the current data is an all-False one-hot column.
        for name in language_columns:
            if name not in df.columns:
                df[name] = False
        if language_columns:
            # Keep only rows whose language the model was trained on.
            df = df[df[language_columns].astype(bool).any(axis=1)]
    else:
        features = ['repo_age_days', 'year_created'] + [col for col in df.columns if col.startswith('lang_')]
        features = [f for f in features if f in df.columns]

    missing = [name for name in features + [target] if name not in df.columns]
    if missing or not features:
        return (f"Colonnes manquantes pour l'évaluation: {missing}", *plots)
    if len(df) < 2:
        return ("Données insuffisantes pour l'évaluation.", *plots)

    X = df[features]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    if scaler is None:
        scaler = StandardScaler().fit(X_train)

    log_file = setup_model_monitoring(model_name)
    new_model, new_scaler, retrained, metrics_dict = evaluate_and_retrain(model, X_test, y_test, scaler, features, threshold=threshold, log_file=log_file)
    status = f"Évaluation terminée. Réentraînement {'effectué' if retrained else 'non nécessaire'}. Metrics: {metrics_dict}"
    return (status, *plots)

# Fonctions pour l'apprentissage en streaming

def setup_streaming_model(features):
    """Create the online-learning pipeline and its rolling error metric.

    Args:
        features: Currently unused; kept for interface compatibility.

    Returns:
        ``(model, metric)``: a river pipeline (standard scaling followed by an
        aggregated Mondrian forest regressor with 10 trees) and an RMSE
        computed over a rolling window of the last 100 updates.
    """
    from river import forest, compose, preprocessing, utils, metrics as river_metrics

    model = compose.Pipeline(
        preprocessing.StandardScaler(),
        # AMFRegressor names its ensemble-size argument n_estimators (not n_models).
        forest.AMFRegressor(n_estimators=10, seed=42)
    )
    # river has no RollingRMSE class: windowed metrics are built with utils.Rolling.
    metric = utils.Rolling(river_metrics.RMSE(), window_size=100)
    return model, metric

def update_streaming_model(stream_model, metric, features, target):
    """Predict for one observation, then learn from it when its target is known.

    Args:
        stream_model: Online model exposing ``predict_one`` and ``learn_one``.
        metric: Object exposing ``update(y_true, y_pred)``.
        features: pandas Series indexed by feature name.
        target: True value, or ``None`` to predict without learning.

    Returns:
        ``(stream_model, metric, prediction)``.
    """
    # The model is fed a plain {feature name: value} dict.
    sample = {}
    for name in features.index:
        sample[name] = features[name]

    # Predict before learning so the metric reflects performance on unseen data.
    prediction = stream_model.predict_one(sample)
    if target is None:
        return stream_model, metric, prediction

    metric.update(target, prediction)
    stream_model.learn_one(sample, target)
    return stream_model, metric, prediction

def start_streaming():
    """Demonstrate the streaming model by feeding it 10 randomly simulated samples.

    Returns:
        ``(status message, "language_trends.png")``.
    """
    stream_model, metric = setup_streaming_model([])

    n_updates = 10
    for _ in range(n_updates):
        # Simulated observation: random repository age and creation year.
        simulated_age = np.random.randint(1, 1000)
        simulated_year = np.random.randint(2000, 2023)
        sample = pd.Series({'repo_age_days': simulated_age, 'year_created': simulated_year})
        simulated_target = np.random.random() * 100

        stream_model, metric, _prediction = update_streaming_model(
            stream_model, metric, sample, simulated_target
        )
        time.sleep(0.5)

    return "Le modèle de streaming a été mis à jour avec des données simulées.", "language_trends.png"


# Interface FastAPI pour la prédiction en temps réel

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn

app = FastAPI()

class DataPoint(BaseModel):
    """Request body of the ``/predict`` endpoint: one observation to score."""
    # Mapping of feature name -> value; the endpoint turns it into a one-row DataFrame.
    features: dict

@app.post("/predict")
async def predict(data: DataPoint):
    """Score one observation with the saved regression model.

    When the saved scaler exposes ``feature_names_in_``, the incoming features
    are aligned on it: columns are reordered, unknown keys are dropped and
    omitted one-hot ``lang_*`` columns default to 0.

    Raises:
        HTTPException 400: a required non-language feature is missing.
        HTTPException 500: the model could not be loaded or the prediction failed.
    """
    try:
        model = joblib.load('models/language_popularity_model.pkl')
        scaler = joblib.load('models/language_popularity_scaler.pkl')
        features = pd.DataFrame([data.features])

        expected = getattr(scaler, 'feature_names_in_', None)
        if expected is not None:
            expected = list(expected)
            missing = [
                name for name in expected
                if name not in features.columns and not name.startswith('lang_')
            ]
            if missing:
                raise HTTPException(status_code=400, detail=f"Caractéristiques manquantes: {missing}")
            # The scaler needs the training columns, in the training order.
            features = features.reindex(columns=expected, fill_value=0)

        features_scaled = scaler.transform(features)
        prediction = model.predict(features_scaled)[0]
        # Return a plain Python float rather than a numpy scalar.
        return {"prediction": float(prediction)}
    except HTTPException:
        # Keep deliberate client errors as they are instead of turning them into 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

def start_prediction_service():
    """Run the FastAPI prediction app with uvicorn on 0.0.0.0, port 8000."""
    bind_host, bind_port = "0.0.0.0", 8000
    uvicorn.run(app, host=bind_host, port=bind_port)


# Fonctions d'interface Gradio

def fetch_data():
    """Load the records from MongoDB, preprocess them and build the UI payload.

    Returns:
        tuple: ``(status, preview_html, info, df_json)``. When no data comes
        back from MongoDB only the loader's message is returned and the other
        three items are ``None``. Otherwise ``status`` joins the loading and
        preprocessing messages, ``preview_html`` is the first 10 rows as HTML,
        ``info`` reports the row count and ``df_json`` is the full frame
        serialised to JSON with ISO-formatted dates.
    """
    records, load_msg = get_data_from_mongodb()

    if records:
        frame, prep_msg = preprocess_data(records)
        serialized = frame.to_json(date_format='iso')
        status = f"{load_msg}\n{prep_msg}"
        preview_html = frame.head(10).to_html()
        info = f"Total des enregistrements: {len(frame)}"
        return status, preview_html, info, serialized

    # Nothing to show: surface the loader's message only.
    return load_msg, None, None, None

def _format_mape_table(model_results):
    """Render the per-language MAPE results as an aligned plain-text table."""
    header = ("Langage", "MAPE", "Statut")
    rows = [
        (
            str(result.get("language", "")),
            str(result.get("mape", "N/A")),
            str(result.get("status", "")),
        )
        for result in (model_results or [])
    ]
    # Column width = widest cell of that column, header included.
    widths = [max(len(cell) for cell in column) for column in zip(header, *rows)]

    def render(cells):
        return " | ".join(cell.ljust(width) for cell, width in zip(cells, widths)).rstrip()

    lines = [
        "Erreur MAPE par langage",
        render(header),
        "-+-".join("-" * width for width in widths),
    ]
    lines.extend(render(row) for row in rows)
    return "\n".join(lines)


def run_analysis(df_json, top_n, prediction_years):
    """Run the full analysis pipeline on the dataset held in the UI state.

    Args:
        df_json: JSON string produced from the preprocessed DataFrame, or a
            falsy value when no data has been fetched yet.
        top_n: Number of top languages to keep (cast to ``int``).
        prediction_years: Forecast horizon in years (cast to ``int``).

    Returns:
        tuple: ``(status, top_languages_img, trends_img, predictions_img,
        insights)``. Image entries are PNG file names (or ``None``);
        ``insights`` is plain text ending with a per-language MAPE table.
        When no data is available only the status message is set. When the
        time series cannot be prepared, the status plus
        ``"top_languages.png"`` and ``"engagement_score.png"`` are returned.
    """
    from io import StringIO

    if not df_json:
        return "Aucune donnée disponible. Veuillez d'abord récupérer les données.", None, None, None, None

    # Passing a literal JSON string to read_json is deprecated; wrap it in a
    # file-like object, which every pandas version accepts.
    df = pd.read_json(StringIO(df_json))
    top_n = int(top_n)
    prediction_years = int(prediction_years)

    _, eda_msg = exploratory_analysis(df)
    time_series, normalized_time_series, top_langs, prep_msg = prepare_for_prediction(df, top_languages=top_n)
    if time_series is None:
        return f"{eda_msg}\n{prep_msg}", "top_languages.png", "engagement_score.png", None, None

    _, normalized_predictions, model_results, prophet_msg = build_prophet_models(
        time_series, normalized_time_series, top_langs, years_to_predict=prediction_years
    )
    _, _, feature_importance, model_metrics = build_regression_model(df, top_langs)
    insights = generate_insights(time_series, normalized_predictions, top_langs, feature_importance)

    status_message = f"{eda_msg}\n{prep_msg}\n{prophet_msg}"
    if isinstance(model_metrics, dict):
        status_message += f"\nMétriques du modèle de régression: {model_metrics}"
    else:
        status_message += f"\n{model_metrics}"

    # The insights are displayed in a plain Textbox, which would show HTML
    # markup verbatim, so the MAPE table is rendered as text.
    insights += "\n\n" + _format_mape_table(model_results)
    return status_message, "top_languages.png", "language_trends.png", "language_predictions.png", insights

def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching it.

    Layout: a data-collection tab, an analysis/prediction tab and a
    monitoring tab; the real-time learning tab is declared inside the
    monitoring tab's context. A ``gr.State`` carries the dataset JSON between
    callbacks.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for ``launch()``.
    """
    available_models = ["language_popularity_model", "prophet_model"]

    with gr.Blocks(title="Analyse des tendances des langages GitHub") as demo:
        gr.Markdown("# Outil d'analyse des tendances des langages GitHub")
        gr.Markdown("Analysez et prédisez les tendances des langages de programmation basées sur les données des dépôts GitHub")
        # Shared state: JSON of the fetched dataset, filled by the first tab.
        dataset_state = gr.State(value=None)

        # --- Tab 1: data collection -------------------------------------
        with gr.Tab("Collecte de données"):
            load_button = gr.Button("Récupérer les données depuis MongoDB")
            load_status = gr.Textbox(label="Statut")
            data_preview = gr.HTML(label="Aperçu des données")
            data_summary = gr.Textbox(label="Infos du jeu de données")
            load_button.click(
                fn=fetch_data,
                outputs=[load_status, data_preview, data_summary, dataset_state],
            )

        # --- Tab 2: analysis and forecasting ----------------------------
        with gr.Tab("Analyse & Prédiction"):
            with gr.Row():
                top_n_slider = gr.Slider(
                    minimum=5, maximum=20, value=10, step=1,
                    label="Nombre de langages principaux",
                )
                horizon_slider = gr.Slider(
                    minimum=1, maximum=10, value=5, step=1,
                    label="Années à prédire",
                )
            analyse_button = gr.Button("Lancer l'analyse")
            analyse_status = gr.Textbox(label="Statut de l'analyse")
            with gr.Row():
                distribution_img = gr.Image(label="Distribution des langages")
                trends_img = gr.Image(label="Tendances des langages au fil du temps")
            forecast_img = gr.Image(label="Prédictions futures")
            insights_box = gr.Textbox(label="Insights & Recommandations", lines=15)
            analyse_button.click(
                fn=run_analysis,
                inputs=[dataset_state, top_n_slider, horizon_slider],
                outputs=[analyse_status, distribution_img, trends_img, forecast_img, insights_box],
            )

        # --- Tab 3: monitoring and evaluation ---------------------------
        with gr.Tab("Monitoring & Évaluation"):
            with gr.Row():
                model_dropdown = gr.Dropdown(
                    choices=available_models,
                    label="Sélectionner un modèle",
                )
                retrain_threshold = gr.Slider(
                    minimum=0.1, maximum=0.9, value=0.5, step=0.1,
                    label="Seuil pour le réentraînement (R²)",
                )
            evaluate_button = gr.Button("Évaluer et réentraîner si nécessaire")
            evaluate_status = gr.Textbox(label="Statut de l'évaluation")
            metrics_figure = gr.Plot(label="Évolution des métriques")
            drift_figure = gr.Plot(label="Détection de drift")
            evaluate_button.click(
                fn=run_evaluation,
                inputs=[model_dropdown, retrain_threshold, dataset_state],
                outputs=[evaluate_status, metrics_figure, drift_figure],
            )

            # Declared within the monitoring tab's context, hence nested in it.
            with gr.Tab("Apprentissage en temps réel"):
                stream_button = gr.Button("Démarrer l'apprentissage en streaming")
                stream_status_box = gr.Textbox(label="Statut du streaming")
                stream_figure = gr.Plot(label="Métriques en temps réel")
                stream_button.click(
                    fn=start_streaming,
                    outputs=[stream_status_box, stream_figure],
                )

    return demo

# Fonction principale

def main():
    """Start every service of the application.

    Sets up MLflow tracking, starts the prediction API in a background daemon
    thread, then builds the Gradio interface and launches it with a public
    share link (``share=True``).
    """
    import threading

    setup_mlflow_tracking()

    # Daemon thread: the API server must not keep the process alive once the
    # main thread ends.
    api_worker = threading.Thread(target=start_prediction_service, daemon=True)
    api_worker.start()

    create_interface().launch(share=True)

# Entry point: start the application only when this code runs as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()

INFO:     Started server process [14384]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
ERROR:    [Errno 10048] error while attempting to bind on address ('0.0.0.0', 8000): une seule utilisation de chaque adresse de socket (protocole/adresse réseau/port) est habituellement autorisée
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.


Running on local URL:  http://127.0.0.1:7864
Running on public URL: https://d4ec722d8d55af83d4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


23:13:18 - cmdstanpy - INFO - Chain [1] start processing
23:13:18 - cmdstanpy - INFO - Chain [1] done processing
23:13:19 - cmdstanpy - INFO - Chain [1] start processing
23:13:19 - cmdstanpy - INFO - Chain [1] done processing
23:13:19 - cmdstanpy - INFO - Chain [1] start processing
23:13:19 - cmdstanpy - INFO - Chain [1] done processing
23:13:20 - cmdstanpy - INFO - Chain [1] start processing
23:13:21 - cmdstanpy - INFO - Chain [1] done processing
23:13:21 - cmdstanpy - INFO - Chain [1] start processing
23:13:21 - cmdstanpy - INFO - Chain [1] done processing
23:13:22 - cmdstanpy - INFO - Chain [1] start processing
23:13:22 - cmdstanpy - INFO - Chain [1] done processing
23:13:23 - cmdstanpy - INFO - Chain [1] start processing
23:13:23 - cmdstanpy - INFO - Chain [1] done processing
23:13:24 - cmdstanpy - INFO - Chain [1] start processing
23:13:24 - cmdstanpy - INFO - Chain [1] done processing
23:13:24 - cmdstanpy - INFO - Chain [1] start processing
23:13:24 - cmdstanpy - INFO - Chain [1]