In [128]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings

warnings.filterwarnings('ignore')

import re
import datetime

from typing import List
import pandas as pd
from IPython.display import display
from datasets import (
    DatasetDict,
    Dataset
)
from peft import (
    LoraConfig,
    get_peft_model,
)
from transformers import (
    T5ForConditionalGeneration,
    Trainer,
    DataCollatorForSeq2Seq,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
)
import joblib

# Environment check: report whether TensorFlow is still importable. The message
# printed on ImportError shows that its absence is the expected state.
try:
    import tensorflow

    print("⚠️ TensorFlow encore présent")
except ImportError:
    print("✅ TensorFlow désinstallé")

# Report the installed Transformers and PyTorch versions. This is also the only
# place in the visible notebook where `torch` is bound, so the later cells that
# call `torch.*` rely on this import having succeeded.
try:
    import transformers
    import torch

    print(f"✅ Transformers {transformers.__version__}")
    print(f"✅ PyTorch {torch.__version__}")
except ImportError:
    print("❌ Transformers ou PyTorch manquant")


✅ TensorFlow désinstallé
✅ Transformers 4.57.3
✅ PyTorch 2.9.1


In [129]:
# Load the raw issue export (semicolon-separated, UTF-8 CSV).
csv_path = "../backend/app/data/raw/export_us_01.csv"

# Fail fast: the following cells all need `df`, so a missing file must stop the
# run here instead of being printed and resurfacing later as a NameError.
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"Fichier non trouvé: {csv_path}")

df = pd.read_csv(csv_path, low_memory=False, sep=";", encoding="utf-8")
print(f"CSV file chargé avec succès. Nombre d'échantillons chargés: {df.shape[0]}")
display(df.head())


CSV file chargé avec succès. Nombre d'échantillons chargés: 1118


Unnamed: 0,Issue Type,Key,Priority,Status,Summary,Created,Description,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,Story,CODEE01-35,,Cancelled,[eShop] [OUT OF SCOPE] Products recommendation...,27-03-2023,As a business \n\nI want to system supports pr...,,,,,
1,Story,CODEE01-91,,Cancelled,[Go-live] Activation of a B2B portal itself,27-03-2023,As a Tech \n\nI want to story \n\nSo that I ca...,,,,,
2,Story,CODEE01-90,,Cancelled,[Go-live] Activation of the integration with o...,27-03-2023,As a Tech \n\nI want to story \n\nSo that I ca...,,,,,
3,Story,CODEE01-55,,Cancelled,[Integration with CMS][OUT of the Scope]Integr...,27-03-2023,As a business \n\nI want to show to clients th...,,,,,
4,Story,CODEE01-65,,Cancelled,[General] Configuration of the Version Control...,27-03-2023,As a Tech \n\nI want to story \n\nSo that I ca...,,,,,


## Préprocessing


In [130]:
# Clean data
# Drop the identifier/date columns. `errors="ignore"` keeps the cell re-runnable:
# a second execution no longer raises KeyError because the columns are gone.
columns = ['Key', 'Created']
df.drop(columns=columns, inplace=True, errors="ignore")

# Keep only the text features. `fillna` returns a new frame, so its result must
# be assigned; previously it was only displayed and `contentX` kept its NaNs.
features = ['Issue Type', 'Summary', 'Description']
contentX = df[features].fillna("")
contentX

Unnamed: 0,Issue Type,Summary,Description
0,Story,[eShop] [OUT OF SCOPE] Products recommendation...,As a business \n\nI want to system supports pr...
1,Story,[Go-live] Activation of a B2B portal itself,As a Tech \n\nI want to story \n\nSo that I ca...
2,Story,[Go-live] Activation of the integration with o...,As a Tech \n\nI want to story \n\nSo that I ca...
3,Story,[Integration with CMS][OUT of the Scope]Integr...,As a business \n\nI want to show to clients th...
4,Story,[General] Configuration of the Version Control...,As a Tech \n\nI want to story \n\nSo that I ca...
...,...,...,...
1113,,,
1114,,,
1115,,,
1116,,,


In [133]:

def nettoyer_texte_description(text: str) -> str:
    """Normalize the whitespace of a raw description.

    The text is stripped, runs of spaces are collapsed into one space and any
    run of blank lines is reduced to a single newline.

    Args:
        text: Raw description text.

    Returns:
        The cleaned text; single newlines are kept as they are.
    """
    text = text.strip()
    text = re.sub(r' +', ' ', text)  # collapse repeated spaces
    # Collapse blank lines. The former `text.replace(r'\n\s*\n+', '')` call was
    # removed: `str.replace` does not interpret regexes, so it could only delete
    # that literal backslash sequence from the text.
    text = re.sub(r'\n\s*\n+', '\n', text)
    return text


# Clean and normalize the text
def nettoyer_texte(texte):
    """Pad basic punctuation with spaces, then blank out dashes, bullets and apostrophes.

    The substitutions run in order and the final result is stripped of leading
    and trailing whitespace.
    """
    substitutions = (
        (r"([.,!?'])", r" \1 "),  # surround punctuation with spaces
        (r"([-●'])", r" "),  # hyphen, bullet and apostrophe become a space
    )
    for pattern, replacement in substitutions:
        texte = re.sub(pattern, replacement, texte)
    return texte.strip()


def extract_acceptance_criteria(text: str) -> List[str]:
    """Return the bullet items found after an "Acceptance Criteria" heading.

    The heading is matched case-insensitively and everything after its first
    occurrence is treated as the criteria section.

    Args:
        text: Full description text.

    Returns:
        The stripped, non-empty items in their original order, or an empty
        list when the heading is absent.
    """
    text = text.replace("●", "-")  # normalize non-standard bullets to "-"

    match = re.search(r'Acceptance Criteria(.*)', text, re.DOTALL | re.IGNORECASE)
    if not match:
        return []

    # A colon directly after the heading would otherwise become a bogus item.
    ac_section = match.group(1).strip().lstrip(":").strip()

    # Split only on hyphens that start the section or follow whitespace, so
    # hyphenated words such as "up-to-date" stay inside their item.
    items = re.split(r'(?<!\S)-\s*', ac_section)
    return [item.strip() for item in items if item.strip()]


def safe_text(v):
    """Return a cell value as stripped text, mapping missing values to an empty string.

    Args:
        v: Any cell value.

    Returns:
        An empty string for None and for every float (which covers NaN as well
        as plain numbers); otherwise `str(v)` without surrounding whitespace.
    """
    # None would otherwise be rendered as the literal string "None".
    if v is None or isinstance(v, float):
        return ""
    return str(v).strip()


def preprocess_issueType(raw_text: str) -> str:
    """Return the issue type unchanged (placeholder hook, no processing yet)."""
    issue_type = raw_text
    return issue_type


def preprocess_summary(raw_text: str) -> str:
    """Return the summary unchanged (placeholder hook, no processing yet)."""
    summary_text = raw_text
    return summary_text


def preprocess_description(raw_text: str) -> dict:
    """Split a raw description into summary, body and acceptance criteria.

    Args:
        raw_text: Raw description text.

    Returns:
        A dict with three string entries:
        - 'content_summary': the first "As a ..." / "As an ..." fragment, up to
          the first period, newline or end of text ("" when there is none);
        - 'description': the cleaned text that precedes the "Acceptance
          Criteria" heading;
        - 'acceptance_criteria': the extracted criteria joined with "\\n - ".
    """
    text = nettoyer_texte_description(raw_text)

    # `\b` stops the pattern from matching inside words ("has a ..."), `an?`
    # accepts "As an ...", and `$` covers a sentence that ends the text without
    # a trailing period or newline.
    summary_match = re.search(r"\bAs an? .*?(?:[.\n]|$)", text, re.IGNORECASE)
    summary = summary_match.group(0).strip() if summary_match else ""

    # Description: everything before the acceptance criteria heading.
    description = re.split(r'Acceptance Criteria', text, flags=re.IGNORECASE)[0]
    description = nettoyer_texte_description(description)

    # Acceptance criteria, flattened into one string.
    acceptance_criteria = extract_acceptance_criteria(text)
    return {
        'content_summary': summary,
        'description': description,
        'acceptance_criteria': '\n - '.join(acceptance_criteria),
    }


# Build one cleaned record per row that has a non-empty description.
resp = []

for _, phrase in contentX.iterrows():
    rawIssueType = safe_text(phrase['Issue Type']) or "Story"  # default issue type
    # Fix: an empty summary used to overwrite the issue type with "Empty"
    # instead of setting the summary's own default value.
    rawSummary = safe_text(phrase['Summary']) or "Empty"
    rawDescription = safe_text(phrase['Description'])

    # Rows without a description carry no usable information: skip them.
    if rawDescription == "":
        continue

    issueType = preprocess_issueType(rawIssueType)
    summary = preprocess_summary(rawSummary)
    description = preprocess_description(rawDescription)
    resp.append(
        {
            "issue_type": nettoyer_texte(issueType),
            # Use the preprocessed summary (it was computed but never used).
            "summary": nettoyer_texte(summary),
            "content_summary": nettoyer_texte(description['content_summary']),
            "description": nettoyer_texte(description['description']),
            "acceptance_criteria": nettoyer_texte(description['acceptance_criteria']),
        })

print(f"Size : {len(resp)}")


Size : 877


In [134]:

# Show the first five cleaned records for a quick visual check.
for position, record in zip(range(5), resp):
    print(f"{position}.\n{record}\n")


0.
{'issue_type': 'Story', 'summary': '[eShop] [OUT OF SCOPE] Products recommendation (AI)', 'content_summary': 'As a business', 'description': 'As a business \nI want to system supports products recommendation based on AI \nSo that I can provide my clients better experience and the most relevant suggestions', 'acceptance_criteria': 'Universal custom LWC component with the list of recommended products that can be placed to any Community page . \n   The Custom component will use the Einstein API for getting insights from the AI . \n   The Einstein need to be enabled in the B2B portal once system will be launched and enough data(orders ,  cartItems) will be created there . \n   Einstein will be turned on without custom LWC component'}

1.
{'issue_type': 'Story', 'summary': '[Go live] Activation of a B2B portal itself', 'content_summary': 'As a Tech', 'description': 'As a Tech \nI want to story \nSo that I can N/A', 'acceptance_criteria': ''}

2.
{'issue_type': 'Story', 'summary': '[Go li

In [135]:
# Define device
# Preference order: CUDA GPU, then Apple Silicon (MPS), then CPU.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"

device = torch.device(device_name)
use_mps = device_name == "mps"
use_fp16 = device_name == "cuda"  # half precision is only enabled on CUDA

if device_name == "cuda":
    print(f"GPU NVIDIA détecté: {torch.cuda.get_device_name(0)}")
elif device_name == "mps":
    print("GPU Apple Silicon (MPS) détecté")
else:
    print("CPU détecté")

print(f"Model defined {device}")

GPU Apple Silicon (MPS) détecté
Model defined mps


In [136]:
# Checkpoint used to generate the tag strings. `model_name` is rebound to a
# different checkpoint further down, before the fine-tuned model is loaded.
model_name = "google/flan-t5-base"

# Load the tokenizer and the seq2seq model, then move the model to `device`.
autoTokenizerGen = AutoTokenizer.from_pretrained(model_name)
modelGen = AutoModelForSeq2SeqLM.from_pretrained(model_name)
modelGen.to(device)

def normalize_tags(tag_str):
    """Turn a comma-separated tag string into a de-duplicated list of slugs.

    Each tag is stripped, lowercased and has its spaces replaced by hyphens;
    empty entries are dropped and the first occurrence of every tag is kept.
    """
    stripped = (piece.strip() for piece in tag_str.split(","))
    slugs = (piece.lower().replace(" ", "-") for piece in stripped if piece)
    # dict.fromkeys keeps insertion order, so first occurrences win.
    return list(dict.fromkeys(slugs))


def generate_client_sentence(target, max_length=128, num_tags=8):
    """Generate a comma-separated tag string for a description.

    Uses the module-level `autoTokenizerGen` / `modelGen` pair on `device`.
    Generation samples (`do_sample=True`), so results vary between runs unless
    a seed is set by the caller.

    Args:
        target: Description text inserted into the prompt.
        max_length: `max_length` forwarded to `generate`.
        num_tags: Number of tags requested in the prompt.

    Returns:
        A dict with "target" (the input text) and "targs" (the generated tag
        string). The key spelling "targs" is kept because the calling cell
        reads it. A fallback tag string is used when the model output is empty.
    """
    prompt = (
        f"""Generate {num_tags} relevant tags for this description.
        Tags should be lowercase, comma-separated, and include technologies, frameworks, and project type.

        Project description: {target}

        Tags:"""
    )

    inputsGen = autoTokenizerGen(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    with torch.no_grad():
        outputsGen = modelGen.generate(
            **inputsGen,
            max_length=max_length,
            num_beams=5,
            no_repeat_ngram_size=3,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            early_stopping=True,
        )

    tags = autoTokenizerGen.decode(outputsGen[0], skip_special_tokens=True).strip()

    # Fix: the fallback used to be returned as a bare string while the normal
    # path returns a dict, which broke callers indexing the result by key.
    if not tags:
        tags = "web-app, software, development"

    return {
        "target": target,
        "targs": tags,
    }


'(ProtocolError('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')), '(Request ID: c78c29a3-fb56-48f1-beff-9dfefaff7db1)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


In [137]:
# Build the training pairs: generated tag string as input, description as output.
content_data = []
for record in resp:
    generated = generate_client_sentence(record['description'])
    pair = {
        "input": generated['targs'],
        "output": generated['target'],
    }
    content_data.append(pair)


In [119]:
# Preview the first five (input, output) training pairs.
# Fix: "\I" was an invalid escape sequence (a typo for "\n"), and nesting double
# quotes inside a double-quoted f-string only parses on Python 3.12+.
for i, cd in enumerate(content_data[:5]):
    print(f"{i}.\nInput: \n{cd['input']}\nOutput: \n{cd['output']}\n")

0.\Input: 
technologies, frameworks, project type
Output: 
As a business 
I want to system supports products recommendation based on AI 
So that I can provide my clients better experience and the most relevant suggestions

1.\Input: 
project type
Output: 
As a Tech 
I want to story 
So that I can N/A

2.\Input: 
project type
Output: 
As a Tech 
I want to story 
So that I can N/A

3.\Input: 
technologies, frameworks, project type
Output: 
As a business 
I want to show to clients the same products related content at eShop and Web site 
So that I can have consistency and up to date products information on all platforms

4.\Input: 
project type
Output: 
As a Tech 
I want to story 
So that I can N/A



In [120]:
# Hold out 10% of the generated pairs for validation (fixed seed for a stable split).
split = Dataset.from_list(content_data).train_test_split(seed=42, test_size=0.1)
train_part, validation_part = split["train"], split["test"]

dataset_dict = DatasetDict({"train": train_part, "validation": validation_part})

# Report the size of each split.
for part_name in ("train", "validation"):
    print(len(dataset_dict[part_name]))

464
52


In [121]:
# Load the Tokenizer
# NOTE: this rebinds `model_name`, which an earlier cell set to
# "google/flan-t5-base"; the model loaded for fine-tuning below uses this value.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        batch: Mapping with "input" and "output" lists of strings, as passed by
            `map(..., batched=True)`.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        with a "labels" entry built from the targets (padded/truncated to 128).
    """
    model_inputs = tokenizer(batch["input"], max_length=512, truncation=True, padding="max_length")
    target_ids = tokenizer(batch["output"], max_length=128, truncation=True, padding="max_length").input_ids

    # The labels are padded to a fixed length here, so padding positions must be
    # replaced by -100 (the index ignored by the loss). Otherwise the model is
    # also trained to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in target_ids
    ]
    return model_inputs

# Tokenize both splits, drop the raw text columns and return PyTorch tensors
# for the three model columns.
raw_columns = dataset_dict["train"].column_names
tokenized = dataset_dict.map(preprocess, remove_columns=raw_columns, batched=True)
tokenized.set_format(columns=['input_ids', 'attention_mask', 'labels'], type='torch')

Map: 100%|██████████| 464/464 [00:00<00:00, 2445.11 examples/s]
Map: 100%|██████████| 52/52 [00:00<00:00, 4077.39 examples/s]


In [138]:
# Load the base model for `model_name` and wrap it with LoRA adapters.
model = T5ForConditionalGeneration.from_pretrained(model_name)
lora_config = LoraConfig(
    r=16,  # rank of the update matrices
    lora_alpha=32,  # scaling factor applied to the updates
    target_modules=["q", "v"],  # attention projection modules; adjust to the model architecture
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"  # sequence-to-sequence task
)

model = get_peft_model(model, lora_config)
# Hyperparameters
EPOCHS = 25  # number of training epochs
LEARNING_RATE = 5e-5

# Batch size is 8 when the MPS backend was detected, 4 otherwise.
train_batch_size = 8 if use_mps else 4
eval_batch_size = 8 if use_mps else 4

training_args = TrainingArguments(
    output_dir="../target/t5_tag_generator",

    # Training parameters
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,

    # Device-dependent parameters
    fp16=use_fp16,  # True only when a CUDA GPU was detected
    use_mps_device=use_mps,  # True when MPS was detected. NOTE(review): reportedly deprecated in recent Transformers releases; confirm before upgrading

    # Optimization
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,

    # Log every 10 steps and evaluate every 500 steps.
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=500,
)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Trainer wired to the tokenized train/validation splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    processing_class=tokenizer,
    data_collator=data_collator
)

In [139]:
# Run the training loop with the `trainer` configured in the previous cell.
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [125]:
def generate_tags(query, model, tokenizer, max_length=512, num_beams=5):
    """Run beam-search generation for `query` and return the decoded text.

    Args:
        query: Input text to encode.
        model: Model exposing `eval()`, `device` and `generate(...)`.
        tokenizer: Tokenizer used for both encoding and decoding.
        max_length: `max_length` forwarded to `generate`.
        num_beams: Number of beams forwarded to `generate`.

    Returns:
        The first generated sequence, decoded without special tokens.
    """
    model.eval()
    encoded = tokenizer(
        query,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    encoded = encoded.to(model.device)

    generation_kwargs = dict(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        decoder_start_token_id=tokenizer.pad_token_id,  # required for T5
    )
    with torch.no_grad():
        generated = model.generate(**generation_kwargs)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

R: 
R: 
R: As a business I want to show to clients the same products related content at eShop and Web site so that I can have consistency and up to date products information on all platforms. I want to show to clients the same products related content at eShop and Web site so that I can have consistency and up to date products information on all platforms. I want to show to clients the same products related content at eShop and Web site.
R: 


In [106]:
# Save the model
# The file name is fixed so the loading cell finds the same file on every run.
# The unused timestamp and the placeholder-free f-string were removed.
modelName = "model_0.pkl"
model_dir = "../backend/models/"
os.makedirs(model_dir, exist_ok=True)  # joblib.dump does not create missing directories

print(f"• enregistrement du modèle {modelName}")
# NOTE(review): joblib pickles the whole model object; the file can only be
# reloaded with compatible library versions and must never come from an
# untrusted source.
joblib.dump(model, model_dir + modelName)
print("• Fin de l'enregistrement' du modèle")

~~~ Enregistrement du modèle ~~~
• enregistrement du modèle model_0_2025-12-14.16:31:17.pkl
• Fin de l'enregistrement' du modèle


In [140]:

# Reload the saved model and run one sample prediction.
modelPath = "../backend/models/" + modelName
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by
# this notebook.
loadedModel = joblib.load(modelPath)
loadedModel.to(device)
# Fix: the third argument must be the tokenizer; the file name string
# `modelName` was passed before, which is not callable.
lastPrediction = generate_tags(
    "technologies, frameworks, project type",
    loadedModel,
    tokenizer)

print("Prédiction: " + lastPrediction)


FileNotFoundError: [Errno 2] No such file or directory: '../models/model_0_2025-12-14.16:31:17.pkl'