In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Chargement du dataset
df = pd.read_csv('job_titles_classification_extended.csv')

#---Cleaning data set------------------------------------

# --- CONFIGURATION ---
# Taux de change (à ajuster au besoin)
TAUX_CHANGE_USD_MAD = 10.2 
# Seuil de fréquence pour regrouper les catégories rares (ex: 1%)
RARE_THRESHOLD = 0.01 


def convert_salary_to_mad(salary_str):
    """Calcule le salaire moyen et le convertit en MAD."""
    try:
        if pd.isna(salary_str): 
            return None
            
        clean_str = salary_str.replace('$', '').replace('k', '')
        parts = clean_str.split('-')
        
        # Calculer la moyenne en USD (milliers)
        avg_usd_k = (float(parts[0]) + float(parts[1])) / 2
        
        # Conversion finale (x1000 pour les 'k' et xTaux pour les MAD)
        salary_mad = avg_usd_k * 1000 * TAUX_CHANGE_USD_MAD
        
        return salary_mad
        
    except Exception:
        return None

df['salary_mad'] = df['salary_range'].apply(convert_salary_to_mad)

# Optionnel : Remplacer les valeurs manquantes par la médiane après conversion
df['salary_mad'].fillna(df['salary_mad'].median(), inplace=True) 

# Ajout du format DH
df['salary_dh_formatted'] = df['salary_mad'].apply(
    lambda x: f"{x:,.0f} DH" if pd.notna(x) else "N/A"
)



# A. Standardisation de base
categorical_cols = ['category', 'location', 'experience_level', 'remote', 'skills_required']
for col in categorical_cols:
    df[col] = df[col].str.strip().str.lower()
    
# B. Encodage Ordinal pour l'expérience
experience_order = {
    'entry': 0,
    'mid': 1,
    'senior': 2,
    'lead': 3
}
df['experience_level_encoded'] = df['experience_level'].map(experience_order)


# Encodage pour category



# C. Encodage Binaire pour 'remote'
df['remote_encoded'] = df['remote'].map({'yes': 1, 'no': 0})



# D. Regroupement des localisations rares En regroupant les villes qui apparaissent rarement sous l'étiquette 'other'
location_counts = df['location'].value_counts(normalize=True)
rare_locations = location_counts[location_counts < RARE_THRESHOLD].index
df['location_grouped'] = df['location'].apply(
    lambda x: 'other' if x in rare_locations else x
)


# 1. Uniformisation des chaînes de compétences
df['skills_cleaned'] = df['skills_required'].str.lower() 
df['skills_cleaned'] = df['skills_cleaned'].str.replace(', ', ',', regex=False)
df['skills_cleaned'] = df['skills_cleaned'].str.strip() 

# 2. Mappage des Acronymes
acronym_map = {
    'nlp': 'natural language processing',
    'ml': 'machine learning',
    'aws': 'amazon web services',
    'sql': 'structured query language',
    'pr': 'public relations',
    'ads': 'advertising',
    'ai' : 'Artificial intelligence'
}

for acro, full_form in acronym_map.items():
    # Le r'\b' assure que l'on remplace l'acronyme seul
    df['skills_cleaned'] = df['skills_cleaned'].str.replace(r'\b' + acro + r'\b', full_form, regex=True)

# Gérer les NaN (remplacer par une chaîne vide pour TF-IDF)
df['skills_cleaned'].fillna('', inplace=True)



# Créer le DataFrame final avec les colonnes utiles
df_ready = df[['job_title', 'category', 'location_grouped', 
               'salary_mad', 'experience_level_encoded', 'remote_encoded', 'skills_cleaned']]

# Afficher les premières lignes 
print("\n--- Aperçu du DataFrame Prêt pour la Modélisation ---")
print(df_ready.head(20))



--- Aperçu du DataFrame Prêt pour la Modélisation ---
                    job_title     category location_grouped  salary_mad  \
0           Backend Developer           it           berlin    510000.0   
1   Digital Marketing Analyst    marketing            cairo    816000.0   
2           Backend Developer           it            dubai    816000.0   
3                   Therapist   healthcare          toronto    510000.0   
4          Frontend Developer           it         new york    510000.0   
5                   Professor    education         new york    969000.0   
6                Risk Manager      finance            cairo    969000.0   
7            School Principal    education          toronto    510000.0   
8             DevOps Engineer           it           london    510000.0   
9                 Radiologist   healthcare            cairo    663000.0   
10     Medical Lab Technician   healthcare            tokyo    969000.0   
11           School Principal    education   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['salary_mad'].fillna(df['salary_mad'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['skills_cleaned'].fillna('', inplace=True)
