In [30]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv("/content/job_fit.csv")
df.head()
print("\nColonnes :", df.columns.tolist())
# Normalisation
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
nb_dup = df.duplicated().sum()
if nb_dup > 0:
    df = df.drop_duplicates()
    print(f" {nb_dup} lignes dupliquées supprimées.")
else:
    print(" Aucune ligne dupliquée trouvée.")

# Les valeurs manquantes
print("\n Valeurs manquantes avant nettoyage :")
print(df.isnull().sum())
df = df.fillna('')
# Fonction pour nettoyer le texte
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
for col in ['required_skills', 'candidate_skills', 'degree']:
    if col in df.columns:
        df[col] = df[col].apply(clean_text)

# Conversion des années d’expérience en numérique
if 'years_experience' in df.columns:
    df['years_experience'] = pd.to_numeric(df['years_experience'], errors='coerce')
    df['years_experience'] = df['years_experience'].fillna(0)
    # suppression EXP > 60 ans d’expérience)
    df.loc[df['years_experience'] > 60, 'years_experience'] = 60

if 'fit' not in df.columns:
    raise ValueError(" La colonne 'fit' est absente du dataset.")

df['fit'] = df['fit'].astype(str).str.lower().str.strip()
df['fit'] = df['fit'].replace({
    'yes': 1, 'no': 0,
    'good fit': 1, 'not fit': 0,
    'true': 1, 'false': 0,
    '1': 1, '0': 0
})
df['fit'] = pd.to_numeric(df['fit'], errors='coerce').fillna(0)
df['fit'] = (df['fit'] >= 0.5).astype(int)

# Suppression des lignes vides essentielles
df = df[(df['required_skills'] != '') & (df['candidate_skills'] != '')]

#  Rapport post-nettoyage
print("\n Rapport après nettoyage :")
print("Nombre de lignes :", len(df))
print("Valeurs manquantes :", df.isnull().sum().sum())
print("Exemple après nettoyage :")
print(df.head(3))

# Feature Engineering
df['text_features'] = df['required_skills'] + " " + df['candidate_skills']

# Transformation TF-IDF

vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(df['text_features'])
if 'years_experience' in df.columns:
    X = np.hstack((X_tfidf.toarray(), df[['years_experience']].values))
else:
    X = X_tfidf.toarray()

y = df['fit']

#  Séparation Train / Test

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Entraînement du modèle

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

#  Évaluation du modèle
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
print("\n Accuracy :", accuracy_score(y_test, y_pred))
print("\n Rapport de classification :\n", classification_report(y_test, y_pred))
print("\n Matrice de confusion :\n", confusion_matrix(y_test, y_pred))

#  Fonction de prédiction
def predict_job_fit(required_skills, candidate_skills, years_experience):
    combined = clean_text(required_skills + " " + candidate_skills)
    X_text = vectorizer.transform([combined])
    X_final = np.hstack((X_text.toarray(), np.array([[years_experience]])))
    prob = model.predict_proba(X_final)[0, 1]
    decision = 1 if prob >= 0.5 else 0
    return prob, " Good Fit" if decision == 1 else " Not Fit"


# Exemple : DevOps position
required = "Linux, Docker, Kubernetes, AWS, CI/CD"
candidate = "Python, Docker, Jenkins, AWS"
experience = 4

prob, decision = predict_job_fit(required, candidate, experience)
print(f"\n Exemple : DevOps Position")
print(f" Probability: {prob:.2f} | Decision: {decision}")



Colonnes : ['id', 'required_skills', 'candidate_skills', 'degree', 'years_experience', 'overlap', 'fit', 'probability', 'missing_skills']
 Aucune ligne dupliquée trouvée.

 Valeurs manquantes avant nettoyage :
id                  0
required_skills     0
candidate_skills    0
degree              0
years_experience    5
overlap             3
fit                 3
probability         4
missing_skills      0
dtype: int64

 Rapport après nettoyage :
Nombre de lignes : 200
Valeurs manquantes : 0
Exemple après nettoyage :
        id                                    required_skills  \
0  FIT0000                    airflow hadoop java linux react   
1  FIT0001  etl gcp git graphql kubernetes node js spark t...   
2  FIT0002   excel git java javascript linux react typescript   

                                    candidate_skills   degree  \
0                django etl gcp hadoop pytorch spark  masters   
1  aws docker etl excel git numpy pytorch python ...  masters   
2  ci cd git graphql j