 # Récupération Donnée API Github

Chargement de la clé github 

In [2]:
%pip install python-dotenv

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

CLE_API_GITHUB = os.getenv('CLE_API_GITHUB')

In [2]:
import requests
import random
import base64


HEADERS = {"Authorization": f"token {CLE_API_GITHUB}"}

def get_repos(nb_repos=20):
    # Recherche générale avec le mot "a" (ou autre lettre)
    url = "https://api.github.com/search/repositories"
    params = {
        "q": "angular", # recherche avec la lettre "a"
        "order": "desc",
        "per_page": nb_repos, 
        "page": random.randint(1, 100), # page random entre 1 et 100
        }  
    response = requests.get(url, headers=HEADERS, params=params)
    data_repos = []
    
    if response.status_code == 200:
        data = response.json()
        repos = data.get("items", [])
        if repos:
            for repo in repos:
                if(fetch_and_check_readme(repo)):
                    data_repos.append({
                        "name": repo["name"],
                        "full_name": repo["full_name"],
                        "html_url": repo["html_url"],
                        "description": repo["description"],
                        "readme" : repo["readme_content"]
                    })
            return data_repos
        else:
            print("Aucun dépôt trouvé.")
            return []
    else:
        print( f"Erreur: {response.status_code}, {response.text}")
        return []


def fetch_and_check_readme(repo):
    readme_url = repo["url"] + "/readme"
    response = requests.get(readme_url, headers=HEADERS)
    readme_is_present = False
    if response.status_code == 200:
        data = response.json()
        if "content" in data:
            # Décoder le contenu encodé en base64
            content = base64.b64decode(data["content"]).decode("utf-8").strip()
            if content:  # Vérifier que le README n'est pas vide
                repo["readme_content"] = content
                readme_is_present = True
    return readme_is_present



In [4]:
import json

repo = get_repos()
print(f"\n liste des repos : {json.dumps(repo, indent=20)}")


 liste des repos : [
                    {
                                        "name": "ngx-toastr",
                                        "full_name": "scttcper/ngx-toastr",
                                        "html_url": "https://github.com/scttcper/ngx-toastr",
                                        "description": "\ud83c\udf5e Angular Toastr",
                    },
                    {
                                        "name": "AngularFundamentalsFiles",
                                        "full_name": "joeeames/AngularFundamentalsFiles",
                                        "html_url": "https://github.com/joeeames/AngularFundamentalsFiles",
                                        "description": "A few files from my angular fundamentals course",
                    },
                    {
                                        "name": "ng-select",
                                        "full_name": "ng-select/ng-select",
                               

In [6]:
import streamlit as st
from sentence_transformers import SentenceTransformer
import faiss
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch 
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# # Charger le modèle et le tokenizer
# model_name = "t5-small"  # Vous pouvez utiliser un autre modèle si nécessaire
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

# from transformers import AutoTokenizer, AutoModelForCausalLM

# token = "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"
# model_name = "meta-llama/Llama-3.1-8B-Instruct"

# tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     use_auth_token=token,
#     resume_download=True  # Active la reprise
# )

In [9]:
def get_user_query(subject = "Angular project of Game Uno"):
    """
    Demande un sujet à l'utilisateur.
    """
    if subject:
        return subject
    return input("Veuillez entrer un sujet pour générer un README : ")

## Groc

In [10]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def get_top_k_readmes(user_query, readmes, k=5):
    """
    Trouve les k README les plus similaires à la requête utilisateur.
    
    Args:
        user_query (str): La requête de l'utilisateur.
        readmes (list): Liste des contenus README récupérés.
        k (int): Nombre de README les plus proches à sélectionner.
    
    Returns:
        list: Les k README les plus similaires.
    """
    # Extraire les contenus de README
    readme_contents = [repo["readme"] for repo in readmes if isinstance(repo["readme"], str)]
    
    # Générer des embeddings pour la requête et les README
    all_texts = [user_query] + readme_contents
    embeddings = embedding_model.encode(all_texts)
    
    # Calculer la similarité cosinus entre la requête et les README
    query_embedding = embeddings[0].reshape(1, -1)
    readme_embeddings = embeddings[1:]
    similarities = cosine_similarity(query_embedding, readme_embeddings)[0]
    
    # Obtenir les indices des k README les plus similaires
    top_k_indices = np.argsort(similarities)[::-1][:k]
    
    # Retourner les k README les plus proches
    top_k_readmes = [readmes[i] for i in top_k_indices]
    return top_k_readmes



In [11]:
# Exemple structuré de README en Markdown
EXAMPLE_README = """
    # Example Project

    ## Description
    This is a sample project demonstrating best practices for structuring a README file. It includes sections that provide all necessary details to understand and use the project effectively.

    ## Features
    - List of features here
    - Add any additional functionality

    ## Technologies Used
    - Technology 1
    - Technology 2

    ## Installation
    1. Clone the repository:
    ```bash
    git clone https://github.com/username/example-project.git
"""


In [22]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

def prepare_input_for_t5(user_query, readmes,k=5):

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Générer les embeddings des documents
    document_embeddings = embedding_model.encode(readmes)

    # Créer un index FAISS
    index = faiss.IndexFlatL2(document_embeddings.shape[1])
    index.add(np.array(document_embeddings).astype(np.float32))

    # Exemple de requête
    query = get_user_query()
    query_embedding = embedding_model.encode([query])

    # Rechercher les documents les plus similaires
    k = 5  # Nombre de documents à récupérer
    distances, indices = index.search(np.array(query_embedding).astype(np.float32), k)

    # Afficher les résultats
    print("Query:", query)
    print(f"\nTop {k} most similar documents:")
    top_readmes = [readmes[i] for i in indices[0]]
    for i in range(k):

        print(f"{readmes[indices[0][i]]} (distance: {distances[0][i]:.4f})")


    # Préparer l'entrée pour le modèle
    input_text = (
        f"Generate a README based on the following topic: {user_query}.\n\n"
        f"Here is an example of a well-structured README on which you can base the structure of your response:\n\n{EXAMPLE_README}\n\n"
        f"Here are some related README files:\n\n" +
        "\n\n".join([f"Repository: {repo['name']}\nREADME:\n{repo['readme']}" for repo in top_readmes]) + "\n\n"
        f"Please generate a complete and formatted README in English for my project based on the example for the structure and followed by the related README files, ready to be copied into a README.md file.\n"
        f"Do not include this prompt in the generated README."
        f"Replace by \\n the line breaks in the generated README."
    )

    return input_text

In [13]:
def generate_readme_with_t5(input_text):
    """
    Génère un README à l'aide de T5-small.
    
    Args:
        input_text (str): Texte d'entrée pour le modèle.
    
    Returns:
        str: Le README généré.
    """
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=500)
    print(f"\n\nINPUT\n\n{input_text}\n\n")
    outputs = model.generate(inputs["input_ids"], max_length=1024, num_beams=5, early_stopping=True)
    generated_readme = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_readme


In [26]:
def format_markdown(content):
    lines = content.splitlines()
    formatted_lines = []
    for line in lines:
        # Corriger les listes
        if line.strip().startswith("- "):
            formatted_lines.append(line.strip())
        # Corriger les titres
        elif line.strip().startswith("#"):
            formatted_lines.append(line.strip())
        # Corriger les images Markdown
        elif "![Image]" in line or "![Alt Text]" in line:
            formatted_lines.append(line.strip())
        # Gérer les autres lignes
        else:
            formatted_lines.append(line)
    return "\n".join(formatted_lines)

In [14]:
readmes = []

In [27]:
import mlflow
import mlflow.pyfunc
import time
import json

mlflow.set_tracking_uri("http://127.0.0.1:5000")
print("Tracking URI:", mlflow.get_tracking_uri())

with mlflow.start_run(run_name="test_run"):
    print("Run info:", mlflow.active_run().info)

    start_time = time.time()

    # Étape 1 : Récupérer les README
    if not readmes:
        readmes = get_repos(10)
        mlflow.log_param("readmes_retrieved", "No")
        print("Aucun README récupéré.")
    else:
        mlflow.log_param("readmes_retrieved", "Yes")
        mlflow.log_param("num_readmes", len(readmes))

    # Étape 2 : Demander un sujet à l'utilisateur
    user_query = get_user_query("angular-starter")
    mlflow.log_param("user_query", user_query)

    # Étape 3 : Trouver les README les plus proches
    top_readmes = get_top_k_readmes(user_query, readmes, k=4)
    with open("top_readmes.json", "w", encoding="utf-8") as file:
        json.dump([repo['readme'] for repo in top_readmes], file)
    mlflow.log_artifact("top_readmes.json")

    # Étape 4 : Préparer l'entrée pour le modèle T5
    input_text = prepare_input_for_t5(user_query, top_readmes)
    mlflow.log_text(input_text, "input_text.txt")

    print(f"\n\nINPUT TEXT\n\n{input_text}\n\n")

    # Étape 5 : Générer un README
    generated_readme = generate_readme_with_t5(input_text)

    formatted_readme = format_markdown(generated_readme)

    mlflow.log_text(formatted_readme, "generated_readme.md")

    # Sauvegarder le README dans un fichier
    with open("test.md", "w", encoding="utf-8") as file:
        file.write(formatted_readme)

    print("\nLe README a été sauvegardé dans test.md.", formatted_readme)

    # Log de l'artefact (README généré) dans MLflow
    mlflow.log_artifact("test.md")

    # Suivi de la durée d'exécution
    execution_duration = time.time() - start_time
    mlflow.log_metric("execution_duration", execution_duration)

    print("L'expérience a été enregistrée dans MLflow.")


Tracking URI: http://127.0.0.1:5000
Run info: <RunInfo: artifact_uri='mlflow-artifacts:/0/ed7f8a8c9e714288ab727f1bd55ddde6/artifacts', end_time=None, experiment_id='0', lifecycle_stage='active', run_id='ed7f8a8c9e714288ab727f1bd55ddde6', run_name='test_run', run_uuid='ed7f8a8c9e714288ab727f1bd55ddde6', start_time=1737030806377, status='RUNNING', user_id='artbo'>
Query: Angular project of Game Uno

Top 5 most similar documents:
{'name': 'angular-starter', 'full_name': 'wlucha/angular-starter', 'html_url': 'https://github.com/wlucha/angular-starter', 'description': ':rocket: Angular 18 Starter with Storybook, Transloco, Jest, Cypress, Docker, ESLint, Material & Prettier :rocket:', 'readme': '![81993396-d5142b00-9645-11ea-995f-98342b7d5c8f](https://user-images.githubusercontent.com/7531596/128626297-df86020b-1cdc-43b5-a692-6c4c45534ec1.png)\n\n> The Angular starter / boilerplate project to start a new enterprise project.\n\n✅ [Angular 18](https://angular.dev/)  \n✅ [Angular Material](http