In [41]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import spacy
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
import nltk
from transformers import pipeline

In [42]:
df = pd.read_csv('Candidature.csv')
df.head()

Unnamed: 0,Id,Contenu,Réponse
0,1,"""Nous avons bien reçu votre offre de collabora...",Refus
1,2,"""Nous avons bien reçu votre offre de collabora...",Refus
2,3,"""\r\nBonjour Kevin,\r\n\r\nNous vous remercion...",Refus
3,4,"""Merci de l'intérêt porté à notre société.\r\n...",Refus
4,5,"""Nous avons bien reçu votre candidature pour l...",Refus


In [None]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenization
    return text

for text in df['Contenu']:
    preprocessed_text = preprocess_text(text)
    df['Contenu'] = df['Contenu'].replace(text, preprocessed_text)



100%|██████████| 83/83 [00:00<00:00, 1739.92it/s]


Unnamed: 0,Id,Contenu,Réponse
0,1,nous avons bien reçu votre offre de collaborat...,Refus
1,2,nous avons bien reçu votre offre de collaborat...,Refus
2,3,bonjour kevin nous vous remercions pour votre ...,Refus
3,4,merci de lintérêt porté à notre société nous a...,Refus
4,5,nous avons bien reçu votre candidature pour le...,Refus
...,...,...,...
78,79,votre profil ne répond pas aux besoins actuels...,Refus
79,80,le poste étant pourvu nous clôturons le proces...,Refus
80,81,vous ne faites pas partie des candidats retenu...,Refus
81,82,votre profil na pas été retenu mais nous vous ...,Refus


In [30]:
classifier = pipeline("text-classification", model="camembert-base", tokenizer="camembert-base")

text = "Nous avons le plaisir de vous inviter à un entretien."
result = classifier(text)

print(result)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


[{'label': 'LABEL_0', 'score': 0.5040055513381958}]


In [None]:
nlp = spacy.load("fr_core_news_sm")

def spacy_tokenizer(text):
    doc = nlp(text.lower())
    return "".join([token.text for token in doc if not token.is_stop and not token.is_punct])

df["Contenu_Lemmatise"] = df["Contenu"].apply(spacy_tokenizer)

# Exemple de dataset
X = df["Contenu_Lemmatise"]
y = df["Réponse"]

X_train, X_test, y_train, y_test =X[8:], X[:8],y[8:], y[:8]

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))

print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.875
Classification Report:
               precision    recall  f1-score   support

    En cours       0.00      0.00      0.00         1
       Refus       0.88      1.00      0.93         7

    accuracy                           0.88         8
   macro avg       0.44      0.50      0.47         8
weighted avg       0.77      0.88      0.82         8



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
y_pred

array(['Refus', 'Refus', 'Refus', 'Refus', 'Refus', 'Refus', 'Refus',
       'Refus'], dtype=object)

In [39]:
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('french'))

# Initialiser le stemmer français
stemmer = FrenchStemmer()

# Fonction de nettoyage + racination
def stem_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Enlever la ponctuation
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Appliquer la racination
df["Contenu_Stem"] = df["Contenu"].apply(stem_text)

# Split et TF-IDF
X_train, X_test, y_train, y_test = train_test_split(df["Contenu_Stem"], df["Réponse"], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Entraînement
model = LogisticRegression()
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)

# Évaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6470588235294118
Classification Report:
               precision    recall  f1-score   support

     Accepte       1.00      0.33      0.50         6
    En cours       1.00      0.60      0.75         5
       Refus       0.50      1.00      0.67         6

    accuracy                           0.65        17
   macro avg       0.83      0.64      0.64        17
weighted avg       0.82      0.65      0.63        17

