In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset-initiale/job_data_cleaned_final.csv


In [31]:
import pandas as pd
import xgboost as xgb
import joblib
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [32]:
# === 1. Charger la dataset ===
df = pd.read_csv('/kaggle/input/dataset-initiale/job_data_cleaned_final.csv')
# === 2. Supprimer les colonnes inutiles ===
df = df.drop(columns=['Description', 'Location', 'Date', 'Company', 'URL'])

# === 3. Supprimer les lignes sans salaire ===
df = df.dropna(subset=['Salary'])

# === 4. Supprimer les lignes avec Job Title manquant ===
df = df[df['Job Title'].notnull()]

In [33]:
df.head()

Unnamed: 0,Job Title,Salary,Skills
90,Data Engineer,195000.0,
101,Data Engineer,125000.0,
146,Data Analyst,111175.0,"python, sql"
155,Data Scientist,112500.0,python
220,Senior Data Engineer,325000.0,"aws, bash, docker, hadoop, java, jira, kafka, ..."


In [6]:
# === 5. One-hot encoding pour Job Title ===
job_title_dummies = pd.get_dummies(df['Job Title'], prefix='jobtitle')

In [8]:
df.head()

Unnamed: 0,Job Title,Salary,Skills
90,Data Engineer,195000.0,
101,Data Engineer,125000.0,
146,Data Analyst,111175.0,"python, sql"
155,Data Scientist,112500.0,python
220,Senior Data Engineer,325000.0,"aws, bash, docker, hadoop, java, jira, kafka, ..."


In [34]:
# === 5. Préparer la liste complète des skills ===
all_skills = {
    '', 'airflow', 'airtable', 'alteryx', 'angular', 'angular.js', 'ansible', 'apl', 'arch',
    'asana', 'asp.net', 'asp.net core', 'asp.netcore', 'assembly', 'atlassian', 'aurora', 'aws',
    'azure', 'bash', 'bigquery', 'bitbucket', 'blazor', 'c', 'c#', 'c++', 'capacitor', 'cassandra',
    'centos', 'chainer', 'chef', 'clickup', 'clojure', 'cobol', 'codecommit', 'cognos', 'colocation',
    'confluence', 'cordova', 'couchbase', 'couchdb', 'crystal', 'css', 'dart', 'databricks', 'datarobot',
    'dax', 'db2', 'debian', 'delphi', 'deno', 'digitalocean', 'dingtalk', 'django', 'dlib', 'docker',
    'dplyr', 'drupal', 'dynamodb', 'elasticsearch', 'electron', 'elixir', 'ember.js', 'erlang',
    'esquisse', 'excel', 'express', 'f#', 'fastapi', 'fastify', 'fedora', 'firebase', 'firestore',
    'flask', 'flow', 'flutter', 'fortran', 'gatsby', 'gcp', 'gdpr', 'ggplot2', 'git', 'github',
    'gitlab', 'go', 'golang', 'google chat', 'graphql', 'groovy', 'gtx', 'hadoop', 'haskell',
    'heroku', 'homebrew', 'html', 'hugging face', 'huggingface', 'ibm cloud', 'ionic', 'java',
    'javascript', 'jenkins', 'jira', 'jquery', 'julia', 'jupyter', 'kafka', 'kali', 'keras',
    'kotlin', 'kubernetes', 'laravel', 'linode', 'linux', 'lisp', 'looker', 'lua', 'macos',
    'mariadb', 'matlab', 'matplotlib', 'mattermost', 'microsoft lists', 'microsoft teams',
    'microstrategy', 'mlpack', 'mlr', 'monday.com', 'mongo', 'mongodb', 'ms access', 'msaccess',
    'mxnet', 'mysql', 'neo4j', 'next.js', 'nltk', 'no-sql', 'node', 'node.js', 'nosql', 'notion',
    'npm', 'nuix', 'numpy', 'nuxt.js', 'objective-c', 'ocaml', 'opencv', 'openstack', 'oracle',
    'outlook', 'ovh', 'pandas', 'pascal', 'perl', 'phoenix', 'php', 'planner', 'play framework',
    'plotly', 'postgresql', 'power bi', 'powerbi', 'powerpoint', 'powershell', 'pulumi', 'puppet',
    'pyspark', 'python', 'pytorch', 'qlik', 'qt', 'r', 'react', 'react.js', 'redhat', 'redis',
    'redshift', 'ringcentral', 'rocketchat', 'rshiny', 'ruby', 'ruby on rails', 'rubyon rails',
    'rust', 'sap', 'sas', 'sass', 'scala', 'scikit-learn', 'seaborn', 'selenium', 'sharepoint',
    'sheets', 'shell', 'shogun', 'slack', 'smartsheet', 'snowflake', 'solidity', 'spark', 'splunk',
    'spreadsheet', 'spring'
}
all_skills = {s.strip().lower() for s in all_skills if s.strip()}


In [35]:
# === 6. One-hot encoding pour Skills (version optimisée) ===
df['Skills'] = df['Skills'].fillna('')

skill_dict = {}
for skill in all_skills:
    skill_dict[f'skill_{skill}'] = df['Skills'].str.contains(fr'\b{skill}\b', case=False, regex=True).astype(int)
skill_features = pd.DataFrame(skill_dict, index=df.index)


In [36]:
# === 7. Fusionner les features ===
df_final = pd.concat([df, job_title_dummies, skill_features], axis=1)
df_final = df_final.drop(columns=['Job Title', 'Skills'])

# === Afficher la première ligne après one-hot encoding ===
print("Première ligne du dataset après one-hot encoding:")
print(df_final.iloc[0])

Première ligne du dataset après one-hot encoding:
Salary                       195000.0
jobtitle_Business Analyst       False
jobtitle_Cloud Engineer         False
jobtitle_Data Analyst           False
jobtitle_Data Engineer           True
                               ...   
skill_seaborn                       0
skill_matplotlib                    0
skill_mysql                         0
skill_airflow                       0
skill_fedora                        0
Name: 90, Length: 214, dtype: object


In [37]:
# Convertir toutes les colonnes booléennes en 0/1
bool_cols = df_final.select_dtypes(include=['bool']).columns
df_final[bool_cols] = df_final[bool_cols].astype(int)


In [38]:
print("Première ligne du dataset après one-hot encoding:")
print(df_final.iloc[0])

Première ligne du dataset après one-hot encoding:
Salary                       195000.0
jobtitle_Business Analyst         0.0
jobtitle_Cloud Engineer           0.0
jobtitle_Data Analyst             0.0
jobtitle_Data Engineer            1.0
                               ...   
skill_seaborn                     0.0
skill_matplotlib                  0.0
skill_mysql                       0.0
skill_airflow                     0.0
skill_fedora                      0.0
Name: 90, Length: 214, dtype: float64


In [39]:

# === 8. Normaliser le salaire (Min-Max Scaling) ===
salary_min = df_final['Salary'].min()
salary_max = df_final['Salary'].max()
df_final['Salary_Norm'] = (df_final['Salary'] - salary_min) / (salary_max - salary_min)


In [40]:
# === 9. Préparer X et y ===
X = df_final.drop(columns=['Salary', 'Salary_Norm'])
y = df_final['Salary_Norm']

In [42]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb
import joblib

# Paramètres optimaux trouvés
best_params = {
    'learning_rate': 0.1,
    'max_depth': 9,
    'n_estimators': 100,
    'subsample': 0.7,
    'random_state': 42
}

# Train/test split identique au Deep Learning
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Instanciation et entraînement
xgb_model = xgb.XGBRegressor(**best_params)
xgb_model.fit(X_train, y_train)

# Prédictions sur le jeu de validation
y_pred = xgb_model.predict(X_val)

# Évaluation
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred))
r2_xgb = r2_score(y_val, y_pred)

print(f"\n✅ XGBoost - Validation RMSE : {rmse_xgb:.2f}")
print(f"✅ XGBoost - Validation R²   : {r2_xgb:.2f}")

# Sauvegarde du modèle
joblib.dump(xgb_model, 'xgb_salary_model.pkl')





✅ XGBoost - Validation RMSE : 0.04
✅ XGBoost - Validation R²   : 0.29


['xgb_salary_model.pkl']

In [61]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# === 0. Vérification GPU ===
print("TensorFlow version:", tf.__version__)
print("GPU available:", tf.config.list_physical_devices('GPU'))

# === 1. Préparation des données ===
X = df_final.drop(columns=['Salary'])
y = df_final['Salary'].values  # Plus de normalisation ici

# Split identique à XGBoost
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Standardisation des features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# === 2. Modèle Deep Learning ===
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # Prédiction brute du salaire
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='mean_squared_error',
    metrics=[keras.metrics.RootMeanSquaredError()]
)

# === 3. Callbacks ===
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'best_model.keras',
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

early_stop = keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
    verbose=1
)

# === 4. Entraînement sur GPU ===
with tf.device('/GPU:0'):
    history = model.fit(
        X_train_scaled, y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=100,
        batch_size=32,
        callbacks=[early_stop, checkpoint_cb],
        verbose=1
    )

# === 5. Évaluation ===
y_pred = model.predict(X_val_scaled).flatten()

rmse_dl = np.sqrt(mean_squared_error(y_val, y_pred))
r2_dl = r2_score(y_val, y_pred)

print(f"\n✅ Deep Learning - Validation RMSE : {rmse_dl:.2f}")
print(f"✅ Deep Learning - Validation R²   : {r2_dl:.2f}")

# === 6. Sauvegarde du modèle et scaler ===
model.save('final_deep_learning_model.h5')
joblib.dump(scaler, 'feature_scaler.pkl')


# === 7. Exemples de prédictions ===
print("\nExemples de salaires DL prédits vs réels :")
for pred, real in zip(y_pred[:5], y_val[:5]):
    print(f"Prévu : {pred:.2f} | Réel : {real:.2f}")


TensorFlow version: 2.18.0
GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 10983374848.0000 - root_mean_squared_error: 102732.3359
Epoch 1: val_loss improved from inf to 465300320.00000, saving model to best_model.keras
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 10974416896.0000 - root_mean_squared_error: 102685.4688 - val_loss: 465300320.0000 - val_root_mean_squared_error: 21570.8203
Epoch 2/100
[1m612/620[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 542497984.0000 - root_mean_squared_error: 23276.6074
Epoch 2: val_loss improved from 465300320.00000 to 272091040.00000, saving model to best_model.keras
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 541791360.0000 - root_mean_squared_error: 23261.2988 - val_loss: 272091040.0000 - val_root_mean_squared_error: 16495.1816
Epoch 3/100
[1m610/620[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 399977952.0000 

In [7]:
import joblib
import tensorflow as tf
import pandas as pd
import numpy as np

# === Chargement scaler avec joblib ===
scaler = joblib.load('/kaggle/input/test_estimation/tensorflow2/default/1/feature_scaler (1).pkl')

# Chargement modèle Keras
model = tf.keras.models.load_model('/kaggle/input/test_estimation/tensorflow2/default/1/final_deep_learning_model.h5')

# === 2. Liste complète des skills ===
all_skills = {
    '', 'airflow', 'airtable', 'alteryx', 'angular', 'angular.js', 'ansible', 'apl', 'arch',
    'asana', 'asp.net', 'asp.net core', 'asp.netcore', 'assembly', 'atlassian', 'aurora', 'aws',
    'azure', 'bash', 'bigquery', 'bitbucket', 'blazor', 'c', 'c#', 'c++', 'capacitor', 'cassandra',
    'centos', 'chainer', 'chef', 'clickup', 'clojure', 'cobol', 'codecommit', 'cognos', 'colocation',
    'confluence', 'cordova', 'couchbase', 'couchdb', 'crystal', 'css', 'dart', 'databricks', 'datarobot',
    'dax', 'db2', 'debian', 'delphi', 'deno', 'digitalocean', 'dingtalk', 'django', 'dlib', 'docker',
    'dplyr', 'drupal', 'dynamodb', 'elasticsearch', 'electron', 'elixir', 'ember.js', 'erlang',
    'esquisse', 'excel', 'express', 'f#', 'fastapi', 'fastify', 'fedora', 'firebase', 'firestore',
    'flask', 'flow', 'flutter', 'fortran', 'gatsby', 'gcp', 'gdpr', 'ggplot2', 'git', 'github',
    'gitlab', 'go', 'golang', 'google chat', 'graphql', 'groovy', 'gtx', 'hadoop', 'haskell',
    'heroku', 'homebrew', 'html', 'hugging face', 'huggingface', 'ibm cloud', 'ionic', 'java',
    'javascript', 'jenkins', 'jira', 'jquery', 'julia', 'jupyter', 'kafka', 'kali', 'keras',
    'kotlin', 'kubernetes', 'laravel', 'linode', 'linux', 'lisp', 'looker', 'lua', 'macos',
    'mariadb', 'matlab', 'matplotlib', 'mattermost', 'microsoft lists', 'microsoft teams',
    'microstrategy', 'mlpack', 'mlr', 'monday.com', 'mongo', 'mongodb', 'ms access', 'msaccess',
    'mxnet', 'mysql', 'neo4j', 'next.js', 'nltk', 'no-sql', 'node', 'node.js', 'nosql', 'notion',
    'npm', 'nuix', 'numpy', 'nuxt.js', 'objective-c', 'ocaml', 'opencv', 'openstack', 'oracle',
    'outlook', 'ovh', 'pandas', 'pascal', 'perl', 'phoenix', 'php', 'planner', 'play framework',
    'plotly', 'postgresql', 'power bi', 'powerbi', 'powerpoint', 'powershell', 'pulumi', 'puppet',
    'pyspark', 'python', 'pytorch', 'qlik', 'qt', 'r', 'react', 'react.js', 'redhat', 'redis',
    'redshift', 'ringcentral', 'rocketchat', 'rshiny', 'ruby', 'ruby on rails', 'rubyon rails',
    'rust', 'sap', 'sas', 'sass', 'scala', 'scikit-learn', 'seaborn', 'selenium', 'sharepoint',
    'sheets', 'shell', 'shogun', 'slack', 'smartsheet', 'snowflake', 'solidity', 'spark', 'splunk',
    'spreadsheet', 'spring'
}
all_skills = {s.strip().lower() for s in all_skills if s.strip()}

# === 3. Définir les entrées utilisateur ===
job_title_input = "Data Scientist"
skills_input = "python, sql, machine learning"

# === 4. Get the actual feature names the scaler was trained on ===
# These should match exactly what was used during training
scaler_features = scaler.feature_names_in_

# === 5. Create a DataFrame with all expected features initialized to 0 ===
X_input = pd.DataFrame(0, index=[0], columns=scaler_features)

# === 6. Set the job title feature ===
job_title_col = f"jobtitle_{job_title_input}"
if job_title_col in scaler_features:
    X_input[job_title_col] = 1

# === 7. Set the skill features ===
skills_list = [s.strip().lower() for s in skills_input.split(',')]
for skill in skills_list:
    skill_col = f"skill_{skill}"
    if skill_col in scaler_features:
        X_input[skill_col] = 1

# === 8. Standardisation + Prédiction ===
X_input_scaled = scaler.transform(X_input)
predicted_salary = model.predict(X_input_scaled)[0][0]

# === 9. Affichage résultat ===
print("\n=== RÉSULTAT DE LA PRÉDICTION ===")
print(f"💼 Job Title        : {job_title_input}")
print(f"🛠️  Compétences      : {', '.join(skills_list)}")
print(f"💰 Salaire estimé   : {predicted_salary:,.2f} €")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step

=== RÉSULTAT DE LA PRÉDICTION ===
💼 Job Title        : Data Scientist
🛠️  Compétences      : python, sql, machine learning
💰 Salaire estimé   : 33,908.08 €
