In [1]:
from sentence_transformers import SentenceTransformer
import chromadb
import pandas as pd

# Read the cleaned ticket dataset from the processed-data folder.
df = pd.read_csv(filepath_or_buffer="../../data/processed/tickets_clean.csv")

# Show the first rows as the cell output to sanity-check the columns.
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,subject,body,answer,type,queue,priority,language,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,text,text_length,clean_text,clean_length
0,Unvorhergesehener Absturz der Datenanalyse-Pla...,Die Datenanalyse-Plattform brach unerwartet ab...,Ich werde Ihnen bei der Lösung des Problems he...,Incident,General Inquiry,low,de,Crash,Technical,Bug,Hardware,Resolution,Outage,Documentation,,Unvorhergesehener Absturz der Datenanalyse-Pla...,305,unvorhergesehener absturz datenanalyseplattfor...,205
1,Customer Support Inquiry,Seeking information on digital strategies that...,We offer a variety of digital strategies and s...,Request,Customer Service,medium,en,Feedback,Sales,IT,Tech Support,,,,,Customer Support Inquiry Seeking information o...,250,customer support inquiry seeking information d...,183
2,Data Analytics for Investment,I am contacting you to request information on ...,I am here to assist you with data analytics to...,Request,Customer Service,medium,en,Technical,Product,Guidance,Documentation,Performance,Feature,,,Data Analytics for Investment I am contacting ...,726,data analytics investment contacting request i...,545
3,Krankenhaus-Dienstleistung-Problem,Ein Medien-Daten-Sperrverhalten trat aufgrund ...,Zurück zur E-Mail-Beschwerde über den Sperrver...,Incident,Customer Service,high,de,Security,Breach,Login,Maintenance,Incident,Resolution,Feedback,,Krankenhaus-Dienstleistung-Problem Ein Medien-...,256,krankenhausdienstleistungproblem mediendatensp...,204
4,Security,"Dear Customer Support, I am reaching out to in...","Dear [name], we take the security of medical d...",Request,Customer Service,medium,en,Security,Customer,Compliance,Breach,Documentation,Guidance,,,"Security Dear Customer Support, I am reaching ...",684,security dear customer support reaching inquir...,469


In [2]:

from sklearn.preprocessing import normalize
# 1. Load the multilingual sentence-embedding model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

text = df["clean_text"].tolist()

# 2. Encode every ticket, then rescale each row with sklearn's normalize()
#    (default settings: per-sample L2 norm).
embeddings = model.encode(text)
embeddings_norm = normalize(embeddings)

# 3. Index into ChromaDB (persistent client backed by ../../data/chroma_db)
client = chromadb.PersistentClient(path="../../data/chroma_db")
collection = client.get_or_create_collection(name="it_support_tickets")

# Write documents and vectors in slices of at most `max_batch` rows.
# upsert() is used instead of add(): the collection is fetched with
# get_or_create_collection on a persistent path, so re-running this cell hits
# a store that already holds these ids, and add() does not overwrite existing
# ids. upsert() inserts on the first run and refreshes the stored vectors and
# documents on later runs, which keeps the cell idempotent.
max_batch = 5000
for i in range(0, len(text), max_batch):
    batch_texts = text[i:i+max_batch]
    batch_embeddings = embeddings_norm[i:i+max_batch]
    # Ids are the DataFrame index labels of the rows in this slice, as strings.
    batch_ids = [str(j) for j in df.index[i:i+max_batch]]

    collection.upsert(
        embeddings=batch_embeddings.tolist(),
        documents=batch_texts,
        ids=batch_ids
    )

    print(f"Batch {i} à {i+len(batch_texts)} indexé ✅")

print("Indexation complète terminée ✅")

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 838.01it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Batch 0 à 5000 indexé ✅
Batch 5000 à 10000 indexé ✅
Batch 10000 à 15000 indexé ✅
Batch 15000 à 20000 indexé ✅
Indexation complète terminée ✅


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Reload the cleaned tickets; only the "type" column is used below.
df = pd.read_csv(filepath_or_buffer="../../data/processed/tickets_clean.csv")

# Map the ticket "type" strings to integer class ids.
le = LabelEncoder()
le.fit(df["type"])
y = le.transform(df["type"])

# Stratified 80/20 split with a fixed seed.
# NOTE(review): `embeddings_norm` is not defined in this cell; it must already
# exist in the kernel and be row-aligned with this CSV — confirm before running.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings_norm, y, test_size=0.2, random_state=42, stratify=y
)

# Train a 100-tree random forest on the embeddings (fit() returns the estimator).
clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out split, labelling rows with the original class names.
y_pred = clf.predict(X_test)
print(
    classification_report(
        y_test,
        y_pred,
        target_names=le.classes_,
    )
)

              precision    recall  f1-score   support

      Change       0.96      0.55      0.70       415
    Incident       0.69      0.96      0.80      1596
     Problem       0.77      0.20      0.32       837
     Request       0.86      0.99      0.92      1152

    accuracy                           0.77      4000
   macro avg       0.82      0.68      0.69      4000
weighted avg       0.78      0.77      0.73      4000



In [10]:
# Persist the trained classifier and its label encoder to the current working
# directory. The second call stays last so its return value (the list of
# written file names) is what the cell displays.
joblib.dump(value=clf, filename="ticket_classifier_rf.pkl")
joblib.dump(value=le, filename="label_encoder.pkl")


['label_encoder.pkl']