In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# 📌 Step 1: Load and prepare data
df = pd.read_csv("/kaggle/input/dataset-initiale/job_data_cleaned_final.csv")

# Parse and clean skills
df["skill_list"] = df["Skills"].apply(lambda x: [s.strip().lower() for s in str(x).split(',')] if pd.notnull(x) else [])
df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Job Title,Description,Location,Date,Company,Salary,URL,Skills,skill_list
0,Data Analyst,,Serbia,25-09-2023,Cryptology,,,"excel, power bi, python, sql, tableau","[excel, power bi, python, sql, tableau]"
1,Data Analyst,,United States,02-03-2023,Point32Health,,http://tuftshealthplan.com/,"excel, sas, sql","[excel, sas, sql]"
2,Data Analyst,,United States,26-04-2023,Apex Systems,,,"azure, databricks, jira, oracle, power bi, pyt...","[azure, databricks, jira, oracle, power bi, py..."
3,Data Engineer,,Canada,25-05-2023,ODAIA Intelligence Inc.,,http://www.odaia.ai/,"aws, flow, notion, python, sql, word","[aws, flow, notion, python, sql, word]"
4,Data Engineer,,Germany,26-05-2023,DAHMEN Personalservice GmbH,,,"java, python, sql","[java, python, sql]"


In [2]:
# Count non-null skill lists
non_null_count = df['Skills'].notna().sum()

print(f"Number of rows with non-null skills: {non_null_count}")
print(f"Total rows in dataset: {len(df)}")
print(f"Percentage with skills: {non_null_count/len(df)*100:.2f}%")

Number of rows with non-null skills: 670364
Total rows in dataset: 787686
Percentage with skills: 85.11%


In [3]:
from sklearn.preprocessing import MultiLabelBinarizer
#  Multi-hot encoding
mlb = MultiLabelBinarizer()
X  = mlb.fit_transform(df["skill_list"])
skills = mlb.classes_
len(skills)

252

In [4]:
import numpy as np

def generate_input_output(X, hide_fraction=0.3):
    X_input = X.copy()
    for i in range(X.shape[0]):
        idx = np.where(X[i] == 1)[0]
        hide_n = int(len(idx) * hide_fraction)
        hide_idx = np.random.choice(idx, hide_n, replace=False)
        X_input[i, hide_idx] = 0  # hide some known skills
    return X_input, X  # input = partial, output = full

X_input, y = generate_input_output(X)


In [5]:
from sklearn.model_selection import train_test_split
X_input_train, X_input_test, y_train, y_test = train_test_split(X_input, y, test_size=0.2, random_state=42)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

lr = OneVsRestClassifier(LogisticRegression(solver='liblinear', max_iter=1000))
lr.fit(X_input_train, y_train)

y_pred_lr = lr.predict(X_input_test)

print("📊 Logistic Regression Results:")
print("Precision:", precision_score(y_test, y_pred_lr, average='micro'))
print("Recall:", recall_score(y_test, y_pred_lr, average='micro'))
print("F1-score:", f1_score(y_test, y_pred_lr, average='micro'))




📊 Logistic Regression Results:
Precision: 0.971590486747829
Recall: 0.8350708043754826
F1-score: 0.8981726162822831


In [7]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_score, recall_score, f1_score

knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X_input_train)

_, indices = knn.kneighbors(X_input_test)

y_pred_knn = np.array([y_train[i].mean(axis=0) > 0.5 for i in indices]).astype(int)

print("📍 k-NN (cosine) Results:")
print("Precision:", precision_score(y_test, y_pred_knn, average='micro'))
print("Recall:", recall_score(y_test, y_pred_knn, average='micro'))
print("F1-score:", f1_score(y_test, y_pred_knn, average='micro'))


📍 k-NN (cosine) Results:
Precision: 0.8556648821804593
Recall: 0.7931631695027305
F1-score: 0.8232294074444062


In [8]:
import joblib

# Sauvegarde des modèles
joblib.dump(lr, "logistic_model.pkl")
joblib.dump(knn, "knn_model.pkl")
joblib.dump(y_train, "y_train_data.pkl")

# Sauvegarde du binariseur
joblib.dump(mlb, "skill_label_binarizer.pkl")

print("✅ Modèles sauvegardés avec succès")


✅ Modèles sauvegardés avec succès


In [9]:
def recommend_skills_logistic(input_skills, top_k=5):
    valid_skills = [s for s in input_skills if s in mlb.classes_]
    if not valid_skills:
        return []

    input_vec = mlb.transform([valid_skills])
    preds = lr.predict_proba(input_vec)[0]  # (252,) probas pour chaque compétence

    # Masquer les compétences déjà connues
    preds[input_vec[0] == 1] = 0

    top_indices = preds.argsort()[-top_k:][::-1]
    return [mlb.classes_[i] for i in top_indices]
user_input = ["java"]
print("\n📊 Logistic Regression Prediction:")
print(recommend_skills_logistic(user_input))


📊 Logistic Regression Prediction:
['python', 'sql', 'scala', 'spring', 'aws']


In [10]:
def recommend_skills_knn(input_skills, top_k=5):
    valid_skills = [s for s in input_skills if s in mlb.classes_]
    if not valid_skills:
        return []

    input_vec = mlb.transform([valid_skills])
    distances, indices = knn.kneighbors(input_vec)

    neighbor_skills = y_train[indices[0]].mean(axis=0)  # Moyenne "score"
    neighbor_skills[input_vec[0] == 1] = 0  # Masquer les connues

    top_indices = neighbor_skills.argsort()[-top_k:][::-1]
    return [mlb.classes_[i] for i in top_indices]
user_input = ["java"]
print("\n📍 KNN Prediction:")
print(recommend_skills_knn(user_input))


📍 KNN Prediction:
['zoom', 'ggplot2', 'homebrew', 'heroku', 'haskell']
