In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib # For saving the scaler

df = pd.read_csv('salaries.csv')

print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 1 ---")

# --- 1. Data Loading and Initial Cleaning ---
# Assuming 'df' DataFrame is already loaded from your CSV.
# If not, uncomment and update the line below with your data path:
# df = pd.read_csv('your_data.csv') # Make sure this points to your cleaned dataset

print("\n1. Initial Data Loading and Cleaning...")
# Remove duplicate rows
initial_rows = df.shape[0]
df.drop_duplicates(inplace=True)
print(f"Removed {initial_rows - df.shape[0]} duplicate rows. Remaining rows: {df.shape[0]}")

# Ensure 'remote_work_type' column exists or is created if needed
if 'remote_ratio' in df.columns and 'remote_work_type' not in df.columns:
    df['remote_work_type'] = df['remote_ratio'].apply(lambda x: 'Remote' if x == 1 else ('Hybrid' if x == 0.5 else 'On-site'))
    print("Created 'remote_work_type' based on 'remote_ratio'.")
elif 'remote_work_type' not in df.columns:
    print("Warning: 'remote_work_type' column not found. Please ensure it's created or exists in your dataset.")
    # Add explicit handling or pre-processing steps here if this warning appears unexpectedly.

print("\nStage 1: Imports and Initial Data Preparation Complete.")

--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 1 ---

1. Initial Data Loading and Cleaning...
Removed 6401 duplicate rows. Remaining rows: 10093
Created 'remote_work_type' based on 'remote_ratio'.

Stage 1: Imports and Initial Data Preparation Complete.


In [2]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# --- 2. Feature Engineering and Data Splitting ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 2 ---")

# Assuming 'df' DataFrame is ready from Stage 1
# For this example, let's re-create a similar dataframe as if Stage 1 was run
df = pd.read_csv('salaries.csv')
df.drop_duplicates(inplace=True)
if 'remote_ratio' in df.columns and 'remote_work_type' not in df.columns:
    df['remote_work_type'] = df['remote_ratio'].apply(lambda x: 'Remote' if x == 1 else ('Hybrid' if x == 0.5 else 'On-site'))

print("\n2. Preprocessing and Splitting Data...")

# Define features and target.
# Note: 'remote_ratio' is now replaced by the new 'remote_work_type'
# We drop 'salary' and 'salary_currency' as 'salary_in_usd' is our target.
features_to_drop = ['salary', 'salary_currency', 'remote_ratio']
df.drop(columns=features_to_drop, errors='ignore', inplace=True)

# Separate features (X) from the target variable (y)
X = df.drop('salary_in_usd', axis=1)
y = df['salary_in_usd']

# Log-transform the target variable to handle its right-skewed distribution.
# We'll need to reverse this transformation later for evaluation.
y_log = np.log1p(y)

# Identify categorical features for one-hot encoding.
# The `_get_numeric_data()` method is helpful for this.
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Apply one-hot encoding to the categorical features.
# 'drop_first=True' prevents multicollinearity and reduces the number of features.
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

print("Categorical features one-hot encoded.")
print(f"Original feature count: {len(X.columns)}")
print(f"Encoded feature count: {len(X_encoded.columns)}")
print("Data is now ready for model training.")

# Split the data into training and testing sets.
# We use a 80/20 split, which is a common practice.
X_train, X_test, y_train_log, y_test_log = train_test_split(X_encoded, y_log, test_size=0.2, random_state=42)

print("\nData has been successfully split into training and testing sets.")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

print("\nStage 2: Feature Engineering and Data Splitting Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 2 ---

2. Preprocessing and Splitting Data...
Categorical features one-hot encoded.
Original feature count: 8
Encoded feature count: 326
Data is now ready for model training.

Data has been successfully split into training and testing sets.
Training set size: 8074 samples
Testing set size: 2019 samples

Stage 2: Feature Engineering and Data Splitting Complete.


In [3]:
# Import necessary libraries for this stage
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score
# Assuming 'X_train', 'X_test', 'y_train_log', 'y_test_log' are available from Stage 2.

# --- 3. Model Training and Evaluation ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 3 ---")
print("\n3. Training the XGBoost Model...")

# Initialize the XGBoost Regressor model.
# These hyperparameters are a good starting point. For a real project,
# you would tune these to optimize performance.
xgbr = xgb.XGBRegressor(
    objective='reg:squarederror',  # Objective function for regression
    n_estimators=100,             # Number of boosting rounds (trees)
    learning_rate=0.1,            # Step size shrinkage
    max_depth=5,                  # Maximum depth of a tree
    random_state=42,
    n_jobs=-1                     # Use all available CPU cores
)

# Train the model on the training data.
# The model learns to predict the log-transformed salary.
xgbr.fit(X_train, y_train_log)
print("XGBoost model training complete.")

print("\n4. Making predictions and evaluating the model...")
# Make predictions on the unseen test data.
y_pred_log = xgbr.predict(X_test)

# Inverse-transform the predictions and the true values to the original USD scale.
# This makes the error metrics more interpretable.
y_pred = np.expm1(y_pred_log)
y_test = np.expm1(y_test_log)

# Calculate key regression metrics.
# MAE is calculated on the original scale, while R2 is calculated on the log scale
# to maintain consistency with the model's training objective.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test_log, y_pred_log)

print("\n--- Model Performance ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R-squared (R²): {r2:.4f}")

# Get and display the feature importance scores.
# This provides valuable insight into which features are most influential
# in the model's predictions.
print("\n5. Analyzing Feature Importance...")
feature_importances = pd.Series(xgbr.feature_importances_, index=X_test.columns)
print("Top 10 Feature Importances:")
print(feature_importances.nlargest(10))

print("\nStage 3: Model Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 3 ---

3. Training the XGBoost Model...
XGBoost model training complete.

4. Making predictions and evaluating the model...

--- Model Performance ---
Mean Absolute Error (MAE): $43,318.56
R-squared (R²): 0.4587

5. Analyzing Feature Importance...
Top 10 Feature Importances:
employee_residence_US        0.214299
employee_residence_CA        0.075715
job_title_Data Analyst       0.060546
experience_level_SE          0.029433
company_location_IN          0.028841
company_location_US          0.022738
experience_level_EX          0.020179
company_location_DE          0.018491
employee_residence_MX        0.013683
job_title_Data Specialist    0.012417
dtype: float32

Stage 3: Model Training and Evaluation Complete.


In [6]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import uniform, randint

# Re-create the data preparation steps from Stage 2 and 3 for a self-contained example
# In a full notebook, these would be run in previous cells.
def load_and_preprocess_data():
    df = pd.read_csv('salaries.csv')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                'company_location', 'company_size', 'remote_ratio', 'work_year']
    X = df[features]
    y_original = df['salary_in_usd']
    y_log = np.log1p(y_original)

    X_encoded = pd.get_dummies(X, columns=X.select_dtypes(include='object').columns, drop_first=True)

    X_train, X_test, y_train_log, y_test_log = train_test_split(
        X_encoded, y_log, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train_log, y_test_log

# --- 5. Hyperparameter Tuning with Randomized Search ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 5 ---")
print("\n1. Starting Hyperparameter Tuning with Randomized Search...")

# Get the prepared data
X_train, X_test, y_train_log, y_test_log = load_and_preprocess_data()

# Define the XGBoost Regressor
xgbr = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Define the parameter distributions to sample from.
# We're trying a range of values for key hyperparameters.
param_dist = {
    'n_estimators': randint(100, 1000),      # Number of trees
    'learning_rate': uniform(0.01, 0.2),      # Step size shrinkage
    'max_depth': randint(3, 10),              # Maximum depth of a tree
    'min_child_weight': randint(1, 10),       # Minimum sum of instance weight needed in a child
    'gamma': uniform(0, 0.5),                 # Minimum loss reduction required to make a further partition
    'subsample': uniform(0.6, 0.4),           # Subsample ratio of the training instance
    'colsample_bytree': uniform(0.6, 0.4)     # Subsample ratio of columns when constructing each tree
}

# Set up RandomizedSearchCV
# n_iter=100 means we will try 100 different combinations of hyperparameters
random_search = RandomizedSearchCV(
    estimator=xgbr,
    param_distributions=param_dist,
    n_iter=100, # Number of parameter settings that are sampled
    scoring='neg_mean_absolute_error', # Use MAE as the scoring metric to minimize
    cv=5,       # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1   # Use all available CPU cores
)

# Run the search
random_search.fit(X_train, y_train_log)

print("\nHyperparameter tuning complete.")
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation score (MAE): {-random_search.best_score_:.2f}")

# 2. Training the final model with the best parameters
print("\n2. Training final model with best parameters...")
best_xgbr = random_search.best_estimator_
best_xgbr.fit(X_train, y_train_log)

# 3. Evaluating the tuned model
print("\n3. Making predictions and evaluating the tuned model...")
y_pred_log_tuned = best_xgbr.predict(X_test)
y_pred_tuned = np.expm1(y_pred_log_tuned)
y_test_original = np.expm1(y_test_log)

mae_tuned = mean_absolute_error(y_test_original, y_pred_tuned)
r2_tuned = r2_score(y_test_log, y_pred_log_tuned)

print("\n--- Tuned Model Performance ---")
print(f"Mean Absolute Error (MAE): ${mae_tuned:,.2f}")
print(f"R-squared (R²): {r2_tuned:.4f}")

print("\nStage 5: Hyperparameter Tuning Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 5 ---

1. Starting Hyperparameter Tuning with Randomized Search...
Fitting 5 folds for each of 100 candidates, totalling 500 fits

Hyperparameter tuning complete.
Best parameters found: {'colsample_bytree': np.float64(0.7704365900187764), 'gamma': np.float64(0.11128820878551526), 'learning_rate': np.float64(0.08933032039085037), 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 616, 'subsample': np.float64(0.8053304674769571)}
Best cross-validation score (MAE): 0.29

2. Training final model with best parameters...

3. Making predictions and evaluating the tuned model...

--- Tuned Model Performance ---
Mean Absolute Error (MAE): $43,077.81
R-squared (R²): 0.4636

Stage 5: Hyperparameter Tuning Complete.


In [7]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

def load_and_preprocess_data():
    """
    Loads and preprocesses the data, including a new feature engineering step.
    """
    print("Step 1: Loading and initial cleaning of the dataset...")
    df = pd.read_csv('salaries.csv')
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    print("Dataset loaded and cleaned.")

    # --- New Feature Engineering Step ---
    print("\nStep 2: Grouping high-cardinality features like job_title...")

    # We will create a new, simplified job title column.
    # Grouping logic: if a job title appears less than 20 times,
    # we'll categorize it as 'Other'.
    job_title_counts = df['job_title'].value_counts()
    rare_job_titles = job_title_counts[job_title_counts < 20].index

    df['job_title_grouped'] = df['job_title'].apply(
        lambda x: 'Other' if x in rare_job_titles else x
    )

    # We'll also do the same for company location.
    company_location_counts = df['company_location'].value_counts()
    rare_locations = company_location_counts[company_location_counts < 10].index
    df['company_location_grouped'] = df['company_location'].apply(
        lambda x: 'Other' if x in rare_locations else x
    )

    print(f"Original unique job titles: {len(df['job_title'].unique())}")
    print(f"Grouped unique job titles: {len(df['job_title_grouped'].unique())}")
    print(f"Original unique company locations: {len(df['company_location'].unique())}")
    print(f"Grouped unique company locations: {len(df['company_location_grouped'].unique())}")

    # Define features for the model using the new grouped columns
    features = ['experience_level', 'employment_type', 'job_title_grouped',
                'employee_residence', 'company_location_grouped',
                'company_size', 'remote_ratio', 'work_year']

    X = df[features]
    y_original = df['salary_in_usd']
    y_log = np.log1p(y_original)

    # One-hot encode the categorical features
    X_encoded = pd.get_dummies(X, columns=X.select_dtypes(include='object').columns, drop_first=True)

    # Splitting the data
    X_train, X_test, y_train_log, y_test_log = train_test_split(
        X_encoded, y_log, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train_log, y_test_log, y_original, df

# --- 6. Feature Engineering, Retraining and Evaluation ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 6 ---")
print("\n1. Running new feature engineering and preparing data...")

X_train, X_test, y_train_log, y_test_log, y_original, df = load_and_preprocess_data()

print("\n2. Training the XGBoost Model with engineered features...")
xgbr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=616, # Using a good value from our tuning in Stage 5
    learning_rate=0.089,
    max_depth=5,
    random_state=42,
    n_jobs=-1
)

xgbr.fit(X_train, y_train_log)
print("Training complete.")

print("\n3. Making predictions and evaluating the new model...")
y_pred_log = xgbr.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test_log)

mae = mean_absolute_error(y_test_original, y_pred)
r2 = r2_score(y_test_log, y_pred_log)

print("\n--- New Model Performance (with engineered features) ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R-squared (R²): {r2:.4f}")

print("\nStage 6: Feature Engineering, Retraining and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 6 ---

1. Running new feature engineering and preparing data...
Step 1: Loading and initial cleaning of the dataset...
Dataset loaded and cleaned.

Step 2: Grouping high-cardinality features like job_title...
Original unique job titles: 155
Grouped unique job titles: 47
Original unique company locations: 77
Grouped unique company locations: 25

2. Training the XGBoost Model with engineered features...
Training complete.

3. Making predictions and evaluating the new model...

--- New Model Performance (with engineered features) ---
Mean Absolute Error (MAE): $43,208.77
R-squared (R²): 0.4639

Stage 6: Feature Engineering, Retraining and Evaluation Complete.


In [13]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, r2_score

def target_encode(X, y, categorical_cols):
    """
    Performs K-fold target encoding on specified categorical columns.

    Args:
        X (pd.DataFrame): The feature DataFrame.
        y (pd.Series): The target Series.
        categorical_cols (list): List of column names to target encode.

    Returns:
        pd.DataFrame: The DataFrame with target-encoded columns.
    """
    X_encoded = X.copy()
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for col in categorical_cols:
        # Check if the column exists before trying to encode it
        if col not in X_encoded.columns:
            print(f"Warning: Column '{col}' not found in DataFrame. Skipping target encoding for this column.")
            continue

        X_encoded[f'{col}_encoded'] = np.nan

        for train_index, val_index in kf.split(X):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train = y.iloc[train_index]

            # Calculate the mean target for each category in the training fold
            encoding_map = y_train.groupby(X_train[col]).mean()

            # Map these means to the validation fold
            X_encoded.loc[val_index, f'{col}_encoded'] = X_val[col].map(encoding_map)

        # Handle new categories in the test set by filling NaNs with the global mean
        global_mean = y.mean()
        X_encoded[f'{col}_encoded'].fillna(global_mean, inplace=True)

    return X_encoded.drop(columns=categorical_cols)


# --- 7. Target Encoding, Retraining, and Evaluation ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 7 ---")
print("\n1. Running new encoding strategy: Target Encoding...")

# Re-create the data preparation steps for a self-contained example
# In a full notebook, these would be run in previous cells.
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Explicitly convert 'remote_ratio' to a numeric type to ensure it's not treated as a categorical column
df['remote_ratio'] = pd.to_numeric(df['remote_ratio'], errors='coerce')
df.dropna(subset=['remote_ratio'], inplace=True)

# Define all features and target
features = ['work_year', 'experience_level', 'employment_type', 'job_title',
            'employee_residence', 'company_location', 'company_size', 'remote_ratio']
X = df[features]
y_original = df['salary_in_usd']
y_log = np.log1p(y_original)

# Identify high-cardinality categorical features for target encoding
high_cardinality_features = ['job_title', 'employee_residence', 'company_location']

# Target encode the high-cardinality features first
X_target_encoded = target_encode(X, y_log, high_cardinality_features)

# One-hot encode the remaining categorical features
# We get the list of remaining object type columns
remaining_categorical_features = X_target_encoded.select_dtypes(include='object').columns.tolist()
X_final = pd.get_dummies(X_target_encoded, columns=remaining_categorical_features, drop_first=True)

print("Features processed with Target and One-Hot Encoding.")
print(f"Shape of the final feature matrix: {X_final.shape}")

# Splitting the data
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X_final, y_log, test_size=0.2, random_state=42
)

print("\n2. Training the XGBoost Model with encoded features...")
xgbr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=616,
    learning_rate=0.089,
    max_depth=5,
    random_state=42,
    n_jobs=-1
)

xgbr.fit(X_train, y_train_log)
print("Training complete.")

print("\n3. Making predictions and evaluating the new model...")
y_pred_log = xgbr.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test_log)

mae = mean_absolute_error(y_test_original, y_pred)
r2 = r2_score(y_test_log, y_pred_log)

print("\n--- New Model Performance (with Target Encoding) ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R-squared (R²): {r2:.4f}")

print("\nStage 7: Advanced Encoding, Retraining and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 7 ---

1. Running new encoding strategy: Target Encoding...


KeyError: '[263, 267, 286, 290, 360, 416, 510, 511, 518, 533, 542, 586, 594, 647, 648, 653, 668, 683, 700, 735, 782, 872, 921, 952, 965, 994, 1010, 1084, 1087, 1111, 1144, 1149, 1195, 1224, 1247, 1254, 1261, 1277, 1315, 1328, 1339, 1346, 1347, 1393, 1406, 1472, 1479, 1554, 1562, 1563, 1626, 1631, 1650, 1658, 1691, 1713, 1730, 1765, 1793, 1803, 1805, 1807, 1825, 1851, 1880, 1891, 1897, 1901, 1921, 1963, 1964, 1965, 2020, 2025, 2045, 2088, 2122, 2147, 2184, 2189, 2210, 2213, 2254, 2260, 2275, 2286, 2287, 2291, 2301, 2316, 2335, 2337, 2344, 2348, 2360, 2362, 2391, 2406, 2407, 2423, 2457, 2465, 2474, 2483, 2487, 2564, 2602, 2620, 2650, 2664, 2673, 2680, 2684, 2737, 2748, 2753, 2754, 2758, 2794, 2807, 2833, 2835, 2840, 2848, 2882, 2885, 2886, 2927, 2995, 2996, 3000, 3006, 3014, 3016, 3023, 3038, 3039, 3043, 3045, 3050, 3053, 3057, 3060, 3061, 3070, 3080, 3095, 3111, 3126, 3139, 3143, 3187, 3204, 3208, 3235, 3238, 3244, 3274, 3288, 3297, 3299, 3309, 3328, 3333, 3337, 3352, 3353, 3355, 3381, 3382, 3383, 3393, 3396, 3412, 3422, 3433, 3458, 3459, 3460, 3463, 3464, 3465, 3501, 3519, 3522, 3528, 3533, 3540, 3543, 3544, 3602, 3614, 3643, 3649, 3660, 3668, 3684, 3685, 3721, 3724, 3768, 3772, 3776, 3777, 3780, 3790, 3834, 3837, 3844, 3846, 3852, 3853, 3855, 3857, 3880, 3909, 3914, 3934, 3937, 3949, 4002, 4012, 4024, 4038, 4052, 4067, 4074, 4076, 4080, 4081, 4087, 4091, 4111, 4119, 4140, 4162, 4165, 4174, 4187, 4192, 4201, 4226, 4250, 4262, 4264, 4271, 4284, 4301, 4304, 4326, 4328, 4332, 4335, 4337, 4338, 4344, 4367, 4379, 4382, 4401, 4417, 4423, 4466, 4474, 4480, 4506, 4528, 4537, 4549, 4575, 4598, 4613, 4618, 4619, 4635, 4638, 4639, 4640, 4641, 4656, 4657, 4661, 4681, 4689, 4692, 4695, 4708, 4725, 4731, 4734, 4740, 4747, 4791, 4793, 4794, 4800, 4813, 4824, 4825, 4827, 4829, 4833, 4838, 4844, 4845, 4877, 4879, 4884, 4885, 4894, 4899, 4901, 4906, 4908, 4913, 4921, 4929, 4932, 4972, 4984, 5006, 5007, 5026, 5033, 5034, 5082, 5092, 5094, 5100, 5121, 5133, 5138, 5143, 5151, 5153, 5180, 5181, 5186, 5196, 5202, 5205, 5211, 5221, 5244, 5245, 5259, 5270, 5271, 5281, 5282, 5306, 5321, 5323, 5332, 5340, 5344, 5356, 5357, 5359, 5362, 5381, 5426, 5452, 5464, 5550, 5561, 5564, 5608, 5613, 5614, 5638, 5657, 5663, 5665, 5676, 5689, 5695, 5697, 5705, 5714, 5719, 5737, 5748, 5768, 5775, 5804, 5817, 5822, 5840, 5843, 5861, 5868, 5876, 5882, 5896, 5908, 5920, 5931, 5938, 5956, 5957, 5967, 5980, 6018, 6025, 6055, 6195, 6240, 6263, 6351, 6369, 6425, 6464, 6523, 6543, 6558, 6611, 6634, 6663, 6726, 6853, 6858, 6871, 6915, 6919, 6923, 6931, 6945, 6983, 7012, 7073, 7089, 7103, 7113, 7180, 7200, 7201, 7226, 7236, 7237, 7238, 7244, 7263, 7299, 7315, 7338, 7344, 7352, 7362, 7387, 7389, 7391, 7410, 7423, 7432, 7459, 7470, 7497, 7516, 7523, 7535, 7556, 7566, 7571, 7584, 7585, 7593, 7600, 7627, 7649, 7691, 7699, 7701, 7703, 7708, 7732, 7733, 7736, 7752, 7765, 7766, 7774, 7779, 7831, 7835, 7841, 7844, 7854, 7864, 7888, 7894, 7895, 7896, 7905, 7910, 7911, 7913, 7983, 8000, 8035, 8069, 8086, 8099, 8112, 8117, 8119, 8126, 8161, 8197, 8215, 8222, 8243, 8278, 8292, 8294, 8295, 8325, 8341, 8345, 8359, 8362, 8364, 8397, 8400, 8401, 8402, 8408, 8439, 8443, 8444, 8446, 8447, 8452, 8513, 8518, 8521, 8531, 8543, 8553, 8582, 8584, 8602, 8610, 8621, 8622, 8641, 8660, 8683, 8692, 8725, 8735, 8737, 8742, 8744, 8745, 8748, 8751, 8756, 8765, 8794, 8797, 8807, 8814, 8821, 8828, 8841, 8872, 8882, 8896, 8922, 8926, 8940, 8941, 8944, 8984, 8991, 9008, 9015, 9019, 9045, 9052, 9060, 9078, 9079, 9101, 9106, 9118, 9130, 9141, 9144, 9172, 9190, 9217, 9232, 9249, 9253, 9290, 9293, 9302, 9303, 9305, 9306, 9311, 9365, 9378, 9391, 9405, 9426, 9441, 9454, 9466, 9488, 9491, 9501, 9503, 9505, 9508, 9517, 9521, 9524, 9557, 9558, 9563, 9573, 9588, 9589, 9599, 9618, 9619, 9621, 9630, 9636, 9664, 9669, 9673, 9697, 9698, 9706, 9714, 9734, 9739, 9746, 9765, 9766, 9797, 9801, 9813, 9822, 9834, 9836, 9856, 9857, 9863, 9865, 9870, 9899, 9917, 9918, 9919, 9920, 9925, 9948, 9954, 9964, 9980, 9990, 9992, 9995, 9997, 10009, 10013, 10016, 10017, 10027, 10034, 10037, 10054, 10056, 10057, 10067, 10070, 10078, 10082, 10083, 10086, 10091, 10092] not in index'

In [14]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# --- Custom Feature Engineering Function ---
def categorize_job_title(title):
    """
    Categorizes job titles into broader, more manageable groups based on keywords.
    """
    title = title.lower()
    if 'data scientist' in title:
        return 'Data Scientist'
    elif 'data engineer' in title:
        return 'Data Engineer'
    elif 'machine learning' in title or 'ml ' in title:
        return 'ML Engineer/Scientist'
    elif 'data analyst' in title:
        return 'Data Analyst'
    elif 'analytics' in title or 'bi ' in title:
        return 'BI/Analytics'
    elif 'data architect' in title:
        return 'Data Architect'
    elif 'manager' in title or 'lead' in title or 'head' in title or 'director' in title:
        return 'Managerial'
    else:
        return 'Other'

# --- 8. Robust Feature Engineering, Retraining, and Evaluation ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 8 ---")
print("\n1. Applying robust feature engineering...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Explicitly convert 'remote_ratio' to a numeric type
df['remote_ratio'] = pd.to_numeric(df['remote_ratio'], errors='coerce')
df.dropna(subset=['remote_ratio'], inplace=True)

# Apply the new job title categorization
df['job_category'] = df['job_title'].apply(categorize_job_title)

# Define all features and target, using the new 'job_category' column
features = ['work_year', 'experience_level', 'employment_type', 'job_category',
            'employee_residence', 'company_location', 'company_size', 'remote_ratio']

# --- Validation Step ---
# Check if all required features are in the DataFrame before proceeding
missing_features = [col for col in features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    # Raise an error to stop execution
    raise KeyError("Missing required features in the DataFrame.")
# --- End of Validation Step ---

X = df[features]
y_original = df['salary_in_usd']
y_log = np.log1p(y_original)

# One-hot encode all categorical features, including the new 'job_category'
categorical_features = X.select_dtypes(include='object').columns.tolist()
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

print(f"Features processed. Shape of the final feature matrix: {X_encoded.shape}")

# Splitting the data
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X_encoded, y_log, test_size=0.2, random_state=42
)

print("\n2. Training the XGBoost Model with engineered features...")
# Using the best parameters from our tuning in Stage 5
xgbr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=616,
    learning_rate=0.089,
    max_depth=5,
    random_state=42,
    n_jobs=-1
)

xgbr.fit(X_train, y_train_log)
print("Training complete.")

print("\n3. Making predictions and evaluating the new model...")
y_pred_log = xgbr.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test_log)

mae = mean_absolute_error(y_test_original, y_pred)
r2 = r2_score(y_test_log, y_pred_log)

print("\n--- New Model Performance (with Robust Feature Engineering) ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R-squared (R²): {r2:.4f}")

print("\nStage 8: Robust Feature Engineering, Retraining and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 8 ---

1. Applying robust feature engineering...
Features processed. Shape of the final feature matrix: (10093, 180)

2. Training the XGBoost Model with engineered features...
Training complete.

3. Making predictions and evaluating the new model...

--- New Model Performance (with Robust Feature Engineering) ---
Mean Absolute Error (MAE): $43,856.90
R-squared (R²): 0.4393

Stage 8: Robust Feature Engineering, Retraining and Evaluation Complete.


In [15]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Dense, concatenate, Embedding, Flatten
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# --- 9. Deep Learning with a Feedforward Neural Network (FFNN) ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 9 ---")
print("\n1. Preparing data for a deep learning model...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define all features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Create a dictionary to map each categorical feature to an integer index
for col in categorical_features:
    X[col] = pd.Categorical(X[col])
    X[col] = X[col].cat.codes

# Log-transform the target variable
y_log = np.log1p(y)

# Split the data
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# 2. Building the FFNN model with embedding layers
print("\n2. Building the Feedforward Neural Network...")

# Determine the embedding size for each categorical feature
embedding_sizes = []
for col in categorical_features:
    num_unique_values = len(df[col].unique())
    embedding_size = min(50, (num_unique_values // 2) + 1)
    embedding_sizes.append((num_unique_values, embedding_size))

# Build the model
input_layers = []
embedding_layers = []

# Create an input and embedding layer for each categorical feature
for i, col in enumerate(categorical_features):
    input_layer = Input(shape=(1,), name=f'input_{col}')
    embedding_layer = Embedding(input_dim=embedding_sizes[i][0],
                                output_dim=embedding_sizes[i][1],
                                name=f'embedding_{col}')(input_layer)
    flatten_layer = Flatten(name=f'flatten_{col}')(embedding_layer)
    input_layers.append(input_layer)
    embedding_layers.append(flatten_layer)

# Create an input layer for numeric features
numeric_input = Input(shape=(len(numeric_features),), name='numeric_input')
input_layers.append(numeric_input)

# Concatenate all embedding and numeric inputs
all_layers = concatenate(embedding_layers + [numeric_input])

# Add dense layers for the main part of the network
dense1 = Dense(128, activation='relu')(all_layers)
dense2 = Dense(64, activation='relu')(dense1)
output = Dense(1, activation='linear')(dense2)

# Create the final model
model = Model(inputs=input_layers, outputs=output)

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae'])
print("Model built and compiled.")
model.summary()

# 3. Training the FFNN model
print("\n3. Training the FFNN model...")
# Prepare data for the model inputs
X_train_inputs = {f'input_{col}': X_train[col].values for col in categorical_features}
X_train_inputs['numeric_input'] = X_train[numeric_features].values

X_test_inputs = {f'input_{col}': X_test[col].values for col in categorical_features}
X_test_inputs['numeric_input'] = X_test[numeric_features].values

# Use Early Stopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    X_train_inputs,
    y_train_log,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)
print("Training complete.")

# 4. Evaluating the FFNN model
print("\n4. Making predictions and evaluating the new model...")
y_pred_log = model.predict(X_test_inputs).flatten()
y_pred = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test_log)

mae = mean_absolute_error(y_test_original, y_pred)
r2 = r2_score(y_test_log, y_pred_log)

print("\n--- New Model Performance (with FFNN) ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R-squared (R²): {r2:.4f}")

print("\nStage 9: Deep Learning with FFNN, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 9 ---

1. Preparing data for a deep learning model...

2. Building the Feedforward Neural Network...
Model built and compiled.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pd.Categorical(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pd.Categorical(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 


3. Training the FFNN model...
Epoch 1/100
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 143.9102 - mae: 7.2321 - val_loss: 0.2673 - val_mae: 0.4044
Epoch 2/100
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.2988 - mae: 0.4294 - val_loss: 0.2519 - val_mae: 0.3855
Epoch 3/100
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.2454 - mae: 0.3903 - val_loss: 0.2317 - val_mae: 0.3842
Epoch 4/100
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.2546 - mae: 0.4013 - val_loss: 0.5109 - val_mae: 0.6247
Epoch 5/100
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.2959 - mae: 0.4358 - val_loss: 0.1952 - val_mae: 0.3468
Epoch 6/100
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3158 - mae: 0.4355 - val_loss: 0.1903 - val_mae: 0.3418
Epoch 7/100
[1m202/202[0m [32m━━━━━━━━━━━━━━━━

In [16]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Dense, concatenate, Embedding, Flatten, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# --- 9. Deep Learning with a Feedforward Neural Network (FFNN) (v2) ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 9 (v2) ---")
print("\n1. Preparing data for a deep learning model...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define all features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Create a dictionary to map each categorical feature to an integer index
for col in categorical_features:
    X[col] = pd.Categorical(X[col])
    X[col] = X[col].cat.codes

# Log-transform the target variable
y_log = np.log1p(y)

# Split the data
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# 2. Building a more robust FFNN model with embedding layers
print("\n2. Building a more robust Feedforward Neural Network...")

# Determine the embedding size for each categorical feature
embedding_sizes = []
for col in categorical_features:
    num_unique_values = len(df[col].unique())
    # Slightly larger embedding size to allow for more complex representations
    embedding_size = min(75, (num_unique_values // 2) + 1)
    embedding_sizes.append((num_unique_values, embedding_size))

# Build the model
input_layers = []
embedding_layers = []

# Create an input and embedding layer for each categorical feature
for i, col in enumerate(categorical_features):
    input_layer = Input(shape=(1,), name=f'input_{col}')
    embedding_layer = Embedding(input_dim=embedding_sizes[i][0],
                                output_dim=embedding_sizes[i][1],
                                name=f'embedding_{col}')(input_layer)
    flatten_layer = Flatten(name=f'flatten_{col}')(embedding_layer)
    input_layers.append(input_layer)
    embedding_layers.append(flatten_layer)

# Create an input layer for numeric features
numeric_input = Input(shape=(len(numeric_features),), name='numeric_input')
input_layers.append(numeric_input)

# Concatenate all embedding and numeric inputs
all_layers = concatenate(embedding_layers + [numeric_input])

# Add a deeper stack of dense layers for the main part of the network
dense1 = Dense(256, activation='relu')(all_layers)
# Add a dropout layer to prevent overfitting
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.3)(dense2)
dense3 = Dense(64, activation='relu')(dropout2)
output = Dense(1, activation='linear')(dense3)

# Create the final model
model = Model(inputs=input_layers, outputs=output)

# Compile the model with a slightly adjusted learning rate
optimizer = Adam(learning_rate=0.0005)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae'])
print("Model built and compiled.")
model.summary()

# 3. Training the FFNN model
print("\n3. Training the FFNN model...")
# Prepare data for the model inputs
X_train_inputs = {f'input_{col}': X_train[col].values for col in categorical_features}
X_train_inputs['numeric_input'] = X_train[numeric_features].values

X_test_inputs = {f'input_{col}': X_test[col].values for col in categorical_features}
X_test_inputs['numeric_input'] = X_test[numeric_features].values

# Use Early Stopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    X_train_inputs,
    y_train_log,
    epochs=100,
    batch_size=64, # Increased batch size for potentially faster training
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)
print("Training complete.")

# 4. Evaluating the FFNN model
print("\n4. Making predictions and evaluating the new model...")
y_pred_log = model.predict(X_test_inputs).flatten()
y_pred = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test_log)

mae = mean_absolute_error(y_test_original, y_pred)
r2 = r2_score(y_test_log, y_pred_log)

print("\n--- New Model Performance (with FFNN) ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R-squared (R²): {r2:.4f}")

print("\nStage 9: Deep Learning with FFNN, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 9 (v2) ---

1. Preparing data for a deep learning model...

2. Building a more robust Feedforward Neural Network...
Model built and compiled.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pd.Categorical(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pd.Categorical(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 


3. Training the FFNN model...
Epoch 1/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 1918.3317 - mae: 31.4022 - val_loss: 49.1664 - val_mae: 6.9824
Epoch 2/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 144.8699 - mae: 9.4492 - val_loss: 40.5122 - val_mae: 6.3436
Epoch 3/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 62.9369 - mae: 6.1734 - val_loss: 15.4604 - val_mae: 3.8976
Epoch 4/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 36.5425 - mae: 4.7219 - val_loss: 19.7428 - val_mae: 4.4125
Epoch 5/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 24.6124 - mae: 3.8827 - val_loss: 23.7074 - val_mae: 4.8412
Epoch 6/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 18.4567 - mae: 3.3225 - val_loss: 19.1245 - val_mae: 4.3420
Epoch 7/100
[1m101/101[0m [32m━━

In [17]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Dense, concatenate, Embedding, Flatten, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# --- 9. Deep Learning with a Feedforward Neural Network (FFNN) (v3) ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 9 (v3) ---")
print("\n1. Preparing data for a deep learning model...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define all features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Create consistent integer codes for all categorical features before splitting
for col in categorical_features:
    unique_values = df[col].unique()
    value_to_int = {value: i for i, value in enumerate(unique_values)}
    # Add a code for unseen values
    value_to_int['<unseen>'] = len(unique_values)
    X[col] = X[col].apply(lambda x: value_to_int.get(x, value_to_int['<unseen>']))

# Log-transform the target variable
y_log = np.log1p(y)

# Split the data
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# 2. Building a more robust FFNN model with embedding layers
print("\n2. Building a more robust Feedforward Neural Network...")

# Determine the embedding size for each categorical feature
embedding_sizes = []
for col in categorical_features:
    num_unique_values = len(df[col].unique()) + 1  # +1 for the unseen category
    embedding_size = min(75, (num_unique_values // 2) + 1)
    embedding_sizes.append((num_unique_values, embedding_size))

# Build the model
input_layers = []
embedding_layers = []

# Create an input and embedding layer for each categorical feature
for i, col in enumerate(categorical_features):
    input_layer = Input(shape=(1,), name=f'input_{col}')
    embedding_layer = Embedding(input_dim=embedding_sizes[i][0],
                                output_dim=embedding_sizes[i][1],
                                name=f'embedding_{col}')(input_layer)
    flatten_layer = Flatten(name=f'flatten_{col}')(embedding_layer)
    input_layers.append(input_layer)
    embedding_layers.append(flatten_layer)

# Create an input layer for numeric features
numeric_input = Input(shape=(len(numeric_features),), name='numeric_input')
input_layers.append(numeric_input)

# Concatenate all embedding and numeric inputs
all_layers = concatenate(embedding_layers + [numeric_input])

# Add a deeper stack of dense layers for the main part of the network
dense1 = Dense(256, activation='relu')(all_layers)
# Add a dropout layer to prevent overfitting
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.3)(dense2)
dense3 = Dense(64, activation='relu')(dropout2)
output = Dense(1, activation='linear')(dense3)

# Create the final model
model = Model(inputs=input_layers, outputs=output)

# Compile the model with a slightly adjusted learning rate
optimizer = Adam(learning_rate=0.0005)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae'])
print("Model built and compiled.")
model.summary()

# 3. Training the FFNN model
print("\n3. Training the FFNN model...")
# Prepare data for the model inputs
X_train_inputs = {f'input_{col}': X_train[col].values for col in categorical_features}
X_train_inputs['numeric_input'] = X_train[numeric_features].values

X_test_inputs = {f'input_{col}': X_test[col].values for col in categorical_features}
X_test_inputs['numeric_input'] = X_test[numeric_features].values

# Use Early Stopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    X_train_inputs,
    y_train_log,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)
print("Training complete.")

# 4. Evaluating the FFNN model
print("\n4. Making predictions and evaluating the new model...")
y_pred_log = model.predict(X_test_inputs).flatten()
y_pred = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test_log)

mae = mean_absolute_error(y_test_original, y_pred)
r2 = r2_score(y_test_log, y_pred_log)

print("\n--- New Model Performance (with FFNN) ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R-squared (R²): {r2:.4f}")

print("\nStage 9: Deep Learning with FFNN, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 9 (v3) ---

1. Preparing data for a deep learning model...

2. Building a more robust Feedforward Neural Network...
Model built and compiled.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].apply(lambda x: value_to_int.get(x, value_to_int['<unseen>']))



3. Training the FFNN model...
Epoch 1/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 4067.0151 - mae: 46.4185 - val_loss: 61.8068 - val_mae: 7.8333
Epoch 2/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 334.1782 - mae: 14.2502 - val_loss: 0.2876 - val_mae: 0.4194
Epoch 3/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 145.6368 - mae: 9.3501 - val_loss: 5.7157 - val_mae: 2.3269
Epoch 4/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 80.4981 - mae: 6.9631 - val_loss: 8.9504 - val_mae: 2.9422
Epoch 5/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 52.6705 - mae: 5.5463 - val_loss: 10.6289 - val_mae: 3.2176
Epoch 6/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 36.3701 - mae: 4.5229 - val_loss: 10.0181 - val_mae: 3.1217
Epoch 7/100
[1m101/101[0m [32m━━━

In [18]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Dense, concatenate, Embedding, Flatten, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# --- 9. Deep Learning with a Feedforward Neural Network (FFNN) (v4) ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 9 (v4) ---")
print("\n1. Preparing data for a deep learning model...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define all features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Create consistent integer codes for all categorical features before splitting
for col in categorical_features:
    unique_values = df[col].unique()
    value_to_int = {value: i for i, value in enumerate(unique_values)}
    # Add a code for unseen values
    value_to_int['<unseen>'] = len(unique_values)
    X[col] = X[col].apply(lambda x: value_to_int.get(x, value_to_int['<unseen>']))

# Log-transform the target variable
y_log = np.log1p(y)

# Split the data
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# 2. Building a more robust FFNN model with embedding layers
print("\n2. Building a more robust Feedforward Neural Network...")

# Determine the embedding size for each categorical feature
embedding_sizes = []
for col in categorical_features:
    num_unique_values = len(df[col].unique()) + 1  # +1 for the unseen category
    embedding_size = min(75, (num_unique_values // 2) + 1)
    embedding_sizes.append((num_unique_values, embedding_size))

# Build the model
input_layers = []
embedding_layers = []

# Create an input and embedding layer for each categorical feature
for i, col in enumerate(categorical_features):
    input_layer = Input(shape=(1,), name=f'input_{col}')
    embedding_layer = Embedding(input_dim=embedding_sizes[i][0],
                                output_dim=embedding_sizes[i][1],
                                name=f'embedding_{col}')(input_layer)
    flatten_layer = Flatten(name=f'flatten_{col}')(embedding_layer)
    input_layers.append(input_layer)
    embedding_layers.append(flatten_layer)

# Create an input layer for numeric features
numeric_input = Input(shape=(len(numeric_features),), name='numeric_input')
input_layers.append(numeric_input)

# Concatenate all embedding and numeric inputs
all_layers = concatenate(embedding_layers + [numeric_input])

# Add a deeper stack of dense layers for the main part of the network
dense1 = Dense(256, activation='relu')(all_layers)
# Add a dropout layer to prevent overfitting
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.3)(dense2)
dense3 = Dense(64, activation='relu')(dropout2)
output = Dense(1, activation='linear')(dense3)

# Create the final model
model = Model(inputs=input_layers, outputs=output)

# Compile the model with a smaller learning rate
optimizer = Adam(learning_rate=0.0001)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae'])
print("Model built and compiled.")
model.summary()

# 3. Training the FFNN model
print("\n3. Training the FFNN model...")
# Prepare data for the model inputs
X_train_inputs = {f'input_{col}': X_train[col].values for col in categorical_features}
X_train_inputs['numeric_input'] = X_train[numeric_features].values

X_test_inputs = {f'input_{col}': X_test[col].values for col in categorical_features}
X_test_inputs['numeric_input'] = X_test[numeric_features].values

# Use Early Stopping to prevent overfitting and give the model more time to converge
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True
)

history = model.fit(
    X_train_inputs,
    y_train_log,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)
print("Training complete.")

# 4. Evaluating the FFNN model
print("\n4. Making predictions and evaluating the new model...")
y_pred_log = model.predict(X_test_inputs).flatten()
y_pred = np.expm1(y_pred_log)
y_test_original = np.expm1(y_test_log)

mae = mean_absolute_error(y_test_original, y_pred)
r2 = r2_score(y_test_log, y_pred_log)

print("\n--- New Model Performance (with FFNN) ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R-squared (R²): {r2:.4f}")

print("\nStage 9: Deep Learning with FFNN, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 9 (v4) ---

1. Preparing data for a deep learning model...

2. Building a more robust Feedforward Neural Network...
Model built and compiled.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].apply(lambda x: value_to_int.get(x, value_to_int['<unseen>']))



3. Training the FFNN model...
Epoch 1/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 5448.3696 - mae: 57.5944 - val_loss: 1.4144 - val_mae: 1.0089
Epoch 2/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1347.2443 - mae: 28.9655 - val_loss: 3.4390 - val_mae: 1.7814
Epoch 3/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 807.3432 - mae: 22.3979 - val_loss: 10.1750 - val_mae: 3.0940
Epoch 4/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 492.0014 - mae: 17.5209 - val_loss: 0.3482 - val_mae: 0.4528
Epoch 5/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 366.9929 - mae: 15.1039 - val_loss: 8.5291 - val_mae: 2.8656
Epoch 6/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 268.9146 - mae: 12.8903 - val_loss: 18.1614 - val_mae: 4.2259
Epoch 7/100
[1m101/101[0m 

In [19]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# --- 10. Classification with XGBoost ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 10 ---")
print("\n1. Preparing data for a classification model...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
# These bins are chosen to create a reasonably balanced set of classes
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# One-hot encode the categorical features
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# Use LabelEncoder to convert the new salary tiers into integers for XGBoost
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the final feature matrix: {X_encoded.shape}")

# Splitting the data
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X_encoded, y_tier_encoded, test_size=0.2, random_state=42
)

print("\n2. Training the XGBoost Classifier Model...")
xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

xgb_clf.fit(X_train, y_train_tier)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with XGBoost Classifier) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 10: Classification, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 10 ---

1. Preparing data for a classification model...
Features processed and new target variable created.
Shape of the final feature matrix: (10093, 327)

2. Training the XGBoost Classifier Model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete.

3. Making predictions and evaluating the new classification model...

--- New Model Performance (with XGBoost Classifier) ---
Accuracy: 0.6949

Classification Report:
              precision    recall  f1-score   support

        High       0.73      0.89      0.80      1210
         Low       0.71      0.51      0.59       167
      Medium       0.56      0.37      0.45       642

    accuracy                           0.69      2019
   macro avg       0.67      0.59      0.61      2019
weighted avg       0.68      0.69      0.67      2019


Stage 10: Classification, Training and Evaluation Complete.


In [20]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# --- 11. Addressing Class Imbalance with SMOTE ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 11 ---")
print("\n1. Preparing data for a classification model with SMOTE...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# One-hot encode the categorical features
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# Use LabelEncoder to convert the new salary tiers into integers for XGBoost
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X_encoded.shape}")

# Splitting the data
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X_encoded, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply SMOTE to the training data to handle class imbalance
print("\nApplying SMOTE to balance the training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_tier)

print("SMOTE applied. Resampled training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the XGBoost Classifier Model with balanced data...")
xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss'
)

xgb_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with XGBoost Classifier and SMOTE) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 11: Classification with SMOTE, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 11 ---

1. Preparing data for a classification model with SMOTE...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 327)

Applying SMOTE to balance the training data...
SMOTE applied. Resampled training data shapes:
Features: (14994, 327)
Target: (14994,)

2. Training the XGBoost Classifier Model with balanced data...
Training complete.

3. Making predictions and evaluating the new classification model...

--- New Model Performance (with XGBoost Classifier and SMOTE) ---
Accuracy: 0.6424

Classification Report:
              precision    recall  f1-score   support

        High       0.81      0.70      0.75      1210
         Low       0.42      0.66      0.51       167
      Medium       0.48      0.52      0.50       642

    accuracy                           0.64      2019
   macro avg       0.57      0.63      0.59      2019
weighted avg       0.67      0.64      0.65     

In [21]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# --- 12. Hyperparameter Tuning for Classification ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 12 ---")
print("\n1. Preparing data for a classification model with SMOTE...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# One-hot encode the categorical features
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# Use LabelEncoder to convert the new salary tiers into integers for XGBoost
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X_encoded.shape}")

# Splitting the data
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X_encoded, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply SMOTE to the training data to handle class imbalance
print("\nApplying SMOTE to balance the training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_tier)

print("SMOTE applied. Resampled training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Hyperparameter tuning the XGBoost Classifier Model...")

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}

# Initialize the XGBoost Classifier
xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss'
)

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=1, scoring='accuracy')

# Fit the grid search to the resampled training data
grid_search.fit(X_train_resampled, y_train_resampled)
print("Hyperparameter tuning complete.")
print(f"Best parameters found: {grid_search.best_params_}")

# 3. Making predictions and evaluating the new classification model with best parameters...
best_xgb_clf = grid_search.best_estimator_
y_pred_tier = best_xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with Tuned XGBoost Classifier and SMOTE) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 12: Hyperparameter Tuning, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 12 ---

1. Preparing data for a classification model with SMOTE...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 327)

Applying SMOTE to balance the training data...
SMOTE applied. Resampled training data shapes:
Features: (14994, 327)
Target: (14994,)

2. Hyperparameter tuning the XGBoost Classifier Model...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Hyperparameter tuning complete.
Best parameters found: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}

--- New Model Performance (with Tuned XGBoost Classifier and SMOTE) ---
Accuracy: 0.6399

Classification Report:
              precision    recall  f1-score   support

        High       0.80      0.71      0.75      1210
         Low       0.41      0.57      0.48       167
      Medium       0.47      0.53      0.50       642

    accuracy                           0.64      2019
   macro avg   

In [22]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# --- 13. Classification with LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 13 ---")
print("\n1. Preparing data for a classification model with SMOTE...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# One-hot encode the categorical features
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X_encoded.shape}")

# Splitting the data
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X_encoded, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply SMOTE to the training data to handle class imbalance
print("\nApplying SMOTE to balance the training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_tier)

print("SMOTE applied. Resampled training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with balanced data...")
# Initialize the LightGBM Classifier
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM Classifier and SMOTE) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 13: Classification with LightGBM, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 13 ---

1. Preparing data for a classification model with SMOTE...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 327)

Applying SMOTE to balance the training data...
SMOTE applied. Resampled training data shapes:
Features: (14994, 327)
Target: (14994,)

2. Training the LightGBM Classifier Model with balanced data...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 189
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 92
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training complete.

3. Making predictions and evaluating

In [25]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 14. Target Encoding for LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 14 ---")
print("\n1. Preparing data with Target Encoding...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply SMOTE to the training data to handle class imbalance
print("\nApplying SMOTE to balance the training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_tier)

# Apply TargetEncoder to the categorical features on the resampled training data
# and transform the test data
print("\nApplying TargetEncoder to categorical features...")
encoder = ce.TargetEncoder(cols=categorical_features)
X_train_encoded = encoder.fit_transform(X_train_resampled, y_train_resampled)
X_test_encoded = encoder.transform(X_test)

print("Target encoding applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_encoded.shape}")
print(f"Target: {y_train_resampled.shape}")
print(f"Test features: {X_test_encoded.shape}")

print("\n2. Training the LightGBM Classifier Model with encoded data...")
# Initialize the LightGBM Classifier
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_encoded, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_encoded)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM Classifier and Target Encoding) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 14: Target Encoding, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 14 ---

1. Preparing data with Target Encoding...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying SMOTE to balance the training data...


ValueError: could not convert string to float: 'MI'

After the installation is complete, please run the cell again.

In [26]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 14. Target Encoding for LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 14 ---")
print("\n1. Preparing data with Target Encoding...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply TargetEncoder to the categorical features on the training data
# and transform the test data
print("\nApplying TargetEncoder to categorical features...")
encoder = ce.TargetEncoder(cols=categorical_features)
X_train_encoded = encoder.fit_transform(X_train, y_train_tier)
X_test_encoded = encoder.transform(X_test)
print("Target encoding applied.")
print("\nSample of encoded training data:")
print(X_train_encoded.head())

# Apply SMOTE to the encoded training data to handle class imbalance
print("\nApplying SMOTE to balance the encoded training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with encoded data...")
# Initialize the LightGBM Classifier
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_encoded)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM Classifier and Target Encoding) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 14: Target Encoding, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 14 ---

1. Preparing data with Target Encoding...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying TargetEncoder to categorical features...
Target encoding applied.

Sample of encoded training data:
       experience_level  employment_type  job_title  employee_residence  \
1399           0.959294          0.68331   0.685135            0.590132   
12894          0.497468          0.68331   0.685135            0.590132   
16039          0.497468          0.68331   0.685135            1.262774   
11414          0.497468          0.68331   0.619593            0.590132   
15981          0.959294          0.68331   0.295148            0.968200   

       company_location  company_size  work_year  remote_ratio  
1399           0.593736      0.662574       2024             0  
12894          0.593736      0.662574       2023             0  
16039          1.266187      0.66

In [27]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 15. LightGBM Hyperparameter Tuning ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 15 ---")
print("\n1. Preparing data with Target Encoding...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply TargetEncoder to the categorical features on the training data
# and transform the test data
print("\nApplying TargetEncoder to categorical features...")
encoder = ce.TargetEncoder(cols=categorical_features)
X_train_encoded = encoder.fit_transform(X_train, y_train_tier)
X_test_encoded = encoder.transform(X_test)
print("Target encoding applied.")
print("\nSample of encoded training data:")
print(X_train_encoded.head())

# Apply SMOTE to the encoded training data to handle class imbalance
print("\nApplying SMOTE to balance the encoded training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Hyperparameter tuning the LightGBM Classifier Model...")

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 100]
}

# Initialize the LightGBM Classifier
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    random_state=42,
    n_jobs=-1
)

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=lgbm_clf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=1, scoring='accuracy')

# Fit the grid search to the resampled training data
grid_search.fit(X_train_resampled, y_train_resampled)
print("Hyperparameter tuning complete.")
print(f"Best parameters found: {grid_search.best_params_}")

# 3. Making predictions and evaluating the new classification model with best parameters...
best_lgbm_clf = grid_search.best_estimator_
y_pred_tier = best_lgbm_clf.predict(X_test_encoded)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with Tuned LightGBM Classifier and Target Encoding) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 15: Hyperparameter Tuning, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 15 ---

1. Preparing data with Target Encoding...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying TargetEncoder to categorical features...
Target encoding applied.

Sample of encoded training data:
       experience_level  employment_type  job_title  employee_residence  \
1399           0.959294          0.68331   0.685135            0.590132   
12894          0.497468          0.68331   0.685135            0.590132   
16039          0.497468          0.68331   0.685135            1.262774   
11414          0.497468          0.68331   0.619593            0.590132   
15981          0.959294          0.68331   0.295148            0.968200   

       company_location  company_size  work_year  remote_ratio  
1399           0.593736      0.662574       2024             0  
12894          0.593736      0.662574       2023             0  
16039          1.266187      0.66

In [28]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 16. Feature Engineering and LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 16 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: location_difference
# Calculate mean salary for company location and employee residence
avg_salary_company_loc = df.groupby('company_location')['salary_in_usd'].transform('mean')
avg_salary_employee_res = df.groupby('employee_residence')['salary_in_usd'].transform('mean')
df['location_difference'] = avg_salary_company_loc - avg_salary_employee_res
numeric_features.append('location_difference')

# Create the feature matrix X and target vector y
X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply TargetEncoder to the categorical features on the training data
# and transform the test data
print("\nApplying TargetEncoder to categorical features...")
encoder = ce.TargetEncoder(cols=categorical_features)
X_train_encoded = encoder.fit_transform(X_train, y_train_tier)
X_test_encoded = encoder.transform(X_test)
print("Target encoding applied.")

# Apply SMOTE to the encoded training data to handle class imbalance
print("\nApplying SMOTE to balance the encoded training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_encoded)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and Engineered Features) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 16: Feature Engineering, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 16 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 9)

Applying TargetEncoder to categorical features...
Target encoding applied.

Applying SMOTE to balance the encoded training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 9)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1828
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 9
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start trainin

In [29]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 17. Job Title Grouping and LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 17 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: location_difference
# Calculate mean salary for company location and employee residence
avg_salary_company_loc = df.groupby('company_location')['salary_in_usd'].transform('mean')
avg_salary_employee_res = df.groupby('employee_residence')['salary_in_usd'].transform('mean')
df['location_difference'] = avg_salary_company_loc - avg_salary_employee_res
numeric_features.append('location_difference')

# Engineer another new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Replace the original job_title with the new grouped one and update the categorical features list
categorical_features.remove('job_title')
categorical_features.append('job_title_grouped')

# Create the feature matrix X and target vector y
X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply TargetEncoder to the categorical features on the training data
# and transform the test data
print("\nApplying TargetEncoder to categorical features...")
encoder = ce.TargetEncoder(cols=categorical_features)
X_train_encoded = encoder.fit_transform(X_train, y_train_tier)
X_test_encoded = encoder.transform(X_test)
print("Target encoding applied.")

# Apply SMOTE to the encoded training data to handle class imbalance
print("\nApplying SMOTE to balance the encoded training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_encoded)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and Job Title Grouping) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 17: Job Title Grouping, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 17 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 9)

Applying TargetEncoder to categorical features...
Target encoding applied.

Applying SMOTE to balance the encoded training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 9)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000585 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1833
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 9
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start trainin

In [31]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 18. Feature Combination and LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 18 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: location_difference
# Calculate mean salary for company location and employee residence
avg_salary_company_loc = df.groupby('company_location')['salary_in_usd'].transform('mean')
avg_salary_employee_res = df.groupby('employee_residence')['salary_in_usd'].transform('mean')
df['location_difference'] = avg_salary_company_loc - avg_salary_employee_res
# Append 'location_difference' to numeric_features only once
numeric_features.append('location_difference')

# Engineer another new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)
df['experience_job_combo'] = df['experience_level'] + '_' + df['job_title_grouped']

# Update the categorical features list
categorical_features = ['employment_type', 'employee_residence',
                        'company_location', 'company_size', 'experience_job_combo'] # Added experience_job_combo

# Create the feature matrix X and target vector y
X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply TargetEncoder to the categorical features on the training data
# and transform the test data
print("\nApplying TargetEncoder to categorical features...")
encoder = ce.TargetEncoder(cols=categorical_features)
X_train_encoded = encoder.fit_transform(X_train, y_train_tier)
X_test_encoded = encoder.transform(X_test)
print("Target encoding applied.")

# Apply SMOTE to the encoded training data to handle class imbalance
print("\nApplying SMOTE to balance the encoded training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_encoded)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and Feature Combination) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 18: Feature Combination, Training and Evaluation Complete.")

--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 18 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying TargetEncoder to categorical features...
Target encoding applied.

Applying SMOTE to balance the encoded training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 8)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008003 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1576
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 8
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start trainin

In [32]:
 # Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 19. Feature Combination with Company Size and LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 19 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: location_difference
# Calculate mean salary for company location and employee residence
avg_salary_company_loc = df.groupby('company_location')['salary_in_usd'].transform('mean')
avg_salary_employee_res = df.groupby('employee_residence')['salary_in_usd'].transform('mean')
df['location_difference'] = avg_salary_company_loc - avg_salary_employee_res
numeric_features.append('location_difference')

# Engineer another new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer new combined feature: experience_job_company_combo
df['experience_job_company_combo'] = df['experience_level'] + '_' + df['job_title_grouped'] + '_' + df['company_size']

# Replace the original job_title and other features with the new combined feature
categorical_features = ['employment_type', 'employee_residence', 'company_location']
categorical_features.append('experience_job_company_combo')

# Create the feature matrix X and target vector y
X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Apply TargetEncoder to the categorical features on the training data
# and transform the test data
print("\nApplying TargetEncoder to categorical features...")
encoder = ce.TargetEncoder(cols=categorical_features)
X_train_encoded = encoder.fit_transform(X_train, y_train_tier)
X_test_encoded = encoder.transform(X_test)
print("Target encoding applied.")

# Apply SMOTE to the encoded training data to handle class imbalance
print("\nApplying SMOTE to balance the encoded training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_encoded)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and Feature Combination) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 19: Feature Combination with Company Size, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 19 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 7)

Applying TargetEncoder to categorical features...
Target encoding applied.

Applying SMOTE to balance the encoded training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 7)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050540 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1318
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 7
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Training complete.

3. Making predictions

In [33]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 20. Polynomial Features and LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 20 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: location_difference
# Calculate mean salary for company location and employee residence
avg_salary_company_loc = df.groupby('company_location')['salary_in_usd'].transform('mean')
avg_salary_employee_res = df.groupby('employee_residence')['salary_in_usd'].transform('mean')
df['location_difference'] = avg_salary_company_loc - avg_salary_employee_res
numeric_features.append('location_difference')

# Engineer another new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer new combined feature: experience_job_company_combo
df['experience_job_company_combo'] = df['experience_level'] + '_' + df['job_title_grouped'] + '_' + df['company_size']

# Replace the original job_title and other features with the new combined feature
# We will use this list to apply target encoding
categorical_features = ['employment_type', 'employee_residence', 'company_location', 'experience_job_company_combo']
# And this list for the polynomial features
numeric_features = ['work_year', 'remote_ratio', 'location_difference']

# Create the feature matrix X and target vector y
X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Separate categorical and numeric features
X_train_cat = X_train[categorical_features]
X_train_num = X_train[numeric_features]
X_test_cat = X_test[categorical_features]
X_test_num = X_test[numeric_features]

# Apply TargetEncoder to the categorical features on the training data
# and transform the test data
print("\nApplying TargetEncoder to categorical features...")
encoder = ce.TargetEncoder(cols=categorical_features)
X_train_cat_encoded = encoder.fit_transform(X_train_cat, y_train_tier)
X_test_cat_encoded = encoder.transform(X_test_cat)
print("Target encoding applied.")

# Generate polynomial features from numeric data
print("\nGenerating Polynomial Features...")
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_num)
X_test_poly = poly.transform(X_test_num)
print(f"Shape of training polynomial features: {X_train_poly.shape}")

# Combine encoded categorical and new polynomial features
X_train_combined = np.hstack((X_train_cat_encoded, X_train_poly))
X_test_combined = np.hstack((X_test_cat_encoded, X_test_poly))

print(f"Combined training data shape: {X_train_combined.shape}")

# Apply SMOTE to the combined training data to handle class imbalance
print("\nApplying SMOTE to balance the combined training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_combined, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_combined)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and Polynomial Features) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 20: Polynomial Features, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 20 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 7)

Applying TargetEncoder to categorical features...
Target encoding applied.

Generating Polynomial Features...
Shape of training polynomial features: (8074, 9)
Combined training data shape: (8074, 13)

Applying SMOTE to balance the combined training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 13)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3265
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 13
[LightGBM] 




--- New Model Performance (with LightGBM and Polynomial Features) ---
Accuracy: 0.6399

Classification Report:
              precision    recall  f1-score   support

        High       0.78      0.74      0.76      1210
         Low       0.41      0.55      0.47       167
      Medium       0.47      0.47      0.47       642

    accuracy                           0.64      2019
   macro avg       0.55      0.59      0.57      2019
weighted avg       0.65      0.64      0.64      2019


Stage 20: Polynomial Features, Training and Evaluation Complete.


In [34]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 21. One-Hot Encoding and Mean Salary Feature ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 21 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: location_difference
# Calculate mean salary for company location and employee residence
avg_salary_company_loc = df.groupby('company_location')['salary_in_usd'].transform('mean')
avg_salary_employee_res = df.groupby('employee_residence')['salary_in_usd'].transform('mean')
df['location_difference'] = avg_salary_company_loc - avg_salary_employee_res
numeric_features.append('location_difference')

# Engineer another new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer new combined feature: experience_job_company_combo
df['experience_job_company_combo'] = df['experience_level'] + '_' + df['job_title_grouped'] + '_' + df['company_size']

# Calculate the mean salary for each grouped job title
job_title_mean_salary = df.groupby('job_title_grouped')['salary_in_usd'].transform('mean')
df['job_title_mean_salary'] = job_title_mean_salary
numeric_features.append('job_title_mean_salary')

# Replace the original job_title and other features with the new combined feature
# We will use this list to apply One-Hot encoding
categorical_features = ['employment_type', 'employee_residence', 'company_location', 'experience_job_company_combo']

# Create the feature matrix X and target vector y
X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Separate categorical and numeric features
X_train_cat = X_train[categorical_features]
X_train_num = X_train[numeric_features]
X_test_cat = X_test[categorical_features]
X_test_num = X_test[numeric_features]

# Use ColumnTransformer to apply OneHotEncoder to categorical features
# and leave numeric features untouched
print("\nApplying OneHotEncoder to categorical features...")
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)
print("One-Hot encoding applied.")

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")

# Apply SMOTE to the preprocessed training data to handle class imbalance
print("\nApplying SMOTE to balance the preprocessed training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and One-Hot Encoding) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 21: One-Hot Encoding and Mean Salary Feature, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 21 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying OneHotEncoder to categorical features...
One-Hot encoding applied.
Preprocessed training data shape: (8074, 269)

Applying SMOTE to balance the preprocessed training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 269)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.137474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6087
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 162
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start train




--- New Model Performance (with LightGBM and One-Hot Encoding) ---
Accuracy: 0.6394

Classification Report:
              precision    recall  f1-score   support

        High       0.78      0.74      0.76      1210
         Low       0.43      0.52      0.47       167
      Medium       0.46      0.48      0.47       642

    accuracy                           0.64      2019
   macro avg       0.56      0.58      0.57      2019
weighted avg       0.65      0.64      0.64      2019


Stage 21: One-Hot Encoding and Mean Salary Feature, Training and Evaluation Complete.


In [35]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 22. New Feature Engineering and LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 22 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer new numeric features based on location mean salaries
df['avg_salary_company_loc'] = df.groupby('company_location')['salary_in_usd'].transform('mean')
df['avg_salary_employee_res'] = df.groupby('employee_residence')['salary_in_usd'].transform('mean')
numeric_features.extend(['avg_salary_company_loc', 'avg_salary_employee_res'])

# Engineer a new categorical feature: job_location_combo
df['job_location_combo'] = df['job_title_grouped'] + '_' + df['company_location']

# Update the feature lists for preprocessing
categorical_features = ['experience_level', 'employment_type', 'company_size', 'job_location_combo']
numeric_features = ['work_year', 'remote_ratio', 'avg_salary_company_loc', 'avg_salary_employee_res']

# Create the feature matrix X and target vector y
X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Use ColumnTransformer to apply OneHotEncoder to categorical features
# and leave numeric features untouched
print("\nApplying OneHotEncoder to categorical features...")
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)
print("One-Hot encoding applied.")

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")

# Apply SMOTE to the preprocessed training data to handle class imbalance
print("\nApplying SMOTE to balance the preprocessed training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and New Feature Engineering) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 22: New Feature Engineering, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 22 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying OneHotEncoder to categorical features...
One-Hot encoding applied.
Preprocessed training data shape: (8074, 259)

Applying SMOTE to balance the preprocessed training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 259)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4314
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 109
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start train




--- New Model Performance (with LightGBM and New Feature Engineering) ---
Accuracy: 0.6469

Classification Report:
              precision    recall  f1-score   support

        High       0.78      0.75      0.76      1210
         Low       0.45      0.53      0.49       167
      Medium       0.48      0.49      0.48       642

    accuracy                           0.65      2019
   macro avg       0.57      0.59      0.58      2019
weighted avg       0.65      0.65      0.65      2019


Stage 22: New Feature Engineering, Training and Evaluation Complete.


In [36]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 23. Remote Work and Company Size Combo and LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 23 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer new numeric features based on location mean salaries
df['avg_salary_company_loc'] = df.groupby('company_location')['salary_in_usd'].transform('mean')
df['avg_salary_employee_res'] = df.groupby('employee_residence')['salary_in_usd'].transform('mean')
numeric_features = ['work_year', 'avg_salary_company_loc', 'avg_salary_employee_res']

# Engineer a new categorical feature: job_location_combo
df['job_location_combo'] = df['job_title_grouped'] + '_' + df['company_location']

# Engineer a new categorical feature: remote_company_size_combo
df['remote_company_size_combo'] = df['remote_ratio'].astype(str) + '_' + df['company_size']

# Update the feature lists for preprocessing
categorical_features = ['experience_level', 'employment_type', 'job_location_combo', 'remote_company_size_combo']
numeric_features = ['work_year', 'avg_salary_company_loc', 'avg_salary_employee_res']

# Create the feature matrix X and target vector y
X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Use ColumnTransformer to apply OneHotEncoder to categorical features
# and leave numeric features untouched
print("\nApplying OneHotEncoder to categorical features...")
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)
print("One-Hot encoding applied.")

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")

# Apply SMOTE to the preprocessed training data to handle class imbalance
print("\nApplying SMOTE to balance the preprocessed training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and New Feature Engineering) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 23: Remote Work and Company Size Combo, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 23 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 7)

Applying OneHotEncoder to categorical features...
One-Hot encoding applied.
Preprocessed training data shape: (8074, 264)

Applying SMOTE to balance the preprocessed training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 264)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.189361 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4658
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 114
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start train




--- New Model Performance (with LightGBM and New Feature Engineering) ---
Accuracy: 0.6498

Classification Report:
              precision    recall  f1-score   support

        High       0.78      0.75      0.76      1210
         Low       0.45      0.55      0.50       167
      Medium       0.48      0.49      0.48       642

    accuracy                           0.65      2019
   macro avg       0.57      0.60      0.58      2019
weighted avg       0.66      0.65      0.65      2019


Stage 23: Remote Work and Company Size Combo, Training and Evaluation Complete.


In [38]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 24. Years of Experience Proxy and LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 24 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer new numeric features based on location mean salaries
df['avg_salary_company_loc'] = df.groupby('company_location')['salary_in_usd'].transform('mean')
df['avg_salary_employee_res'] = df.groupby('employee_residence')['salary_in_usd'].transform('mean')

# Engineer a new categorical feature: job_location_combo
df['job_location_combo'] = df['job_title_grouped'] + '_' + df['company_location']

# Engineer a new categorical feature: remote_company_size_combo
df['remote_company_size_combo'] = df['remote_ratio'].astype(str) + '_' + df['company_size']

# Engineer a new numeric feature: years_of_experience_proxy
df['years_of_experience_proxy'] = 2024 - df['work_year']

# Update the feature lists for preprocessing
categorical_features = ['experience_level', 'employment_type', 'job_location_combo', 'remote_company_size_combo']
numeric_features = ['avg_salary_company_loc', 'avg_salary_employee_res', 'years_of_experience_proxy']

# Create the feature matrix X and target vector y
X = df[categorical_features + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Use ColumnTransformer to apply OneHotEncoder to categorical features
# and leave numeric features untouched
print("\nApplying OneHotEncoder to categorical features...")
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)
print("One-Hot encoding applied.")

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")

# Apply SMOTE to the preprocessed training data to handle class imbalance
print("\nApplying SMOTE to balance the preprocessed training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and New Feature Engineering) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 24: Years of Experience Proxy, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 24 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 7)

Applying OneHotEncoder to categorical features...
One-Hot encoding applied.
Preprocessed training data shape: (8074, 264)

Applying SMOTE to balance the preprocessed training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 264)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.096430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4658
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 114
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start train




--- New Model Performance (with LightGBM and New Feature Engineering) ---
Accuracy: 0.6498

Classification Report:
              precision    recall  f1-score   support

        High       0.78      0.75      0.76      1210
         Low       0.45      0.55      0.50       167
      Medium       0.48      0.49      0.48       642

    accuracy                           0.65      2019
   macro avg       0.57      0.60      0.58      2019
weighted avg       0.66      0.65      0.65      2019


Stage 24: Years of Experience Proxy, Training and Evaluation Complete.


In [39]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 25. Targeted Encoding on Job Titles and LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 25 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer a new numeric feature: years_of_experience_proxy
df['years_of_experience_proxy'] = 2024 - df['work_year']

# Update the feature lists for preprocessing
categorical_features_ohe = ['experience_level', 'employment_type', 'company_size', 'employee_residence', 'company_location']
categorical_features_target_enc = ['job_title_grouped']
numeric_features = ['remote_ratio', 'years_of_experience_proxy']

# Create the feature matrix X and target vector y
X = df[categorical_features_ohe + categorical_features_target_enc + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Use ColumnTransformer to apply different encoders to different columns
print("\nApplying One-Hot and Target Encoding to features...")
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features_ohe),
        ('target_encoder', ce.TargetEncoder(cols=categorical_features_target_enc), categorical_features_target_enc)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train_tier)
# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)
print("Encoding applied.")

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")

# Apply SMOTE to the preprocessed training data to handle class imbalance
print("\nApplying SMOTE to balance the preprocessed training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new features...")
# Initialize the LightGBM Classifier with the best parameters found previously
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with LightGBM and Targeted Encoding) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 25: Targeted Encoding on Job Titles, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 25 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying One-Hot and Target Encoding to features...
Encoding applied.
Preprocessed training data shape: (8074, 171)

Applying SMOTE to balance the preprocessed training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 171)
Target: (14994,)

2. Training the LightGBM Classifier Model with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4126
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 93
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training fro




--- New Model Performance (with LightGBM and Targeted Encoding) ---
Accuracy: 0.6374

Classification Report:
              precision    recall  f1-score   support

        High       0.78      0.74      0.76      1210
         Low       0.41      0.50      0.45       167
      Medium       0.46      0.48      0.47       642

    accuracy                           0.64      2019
   macro avg       0.55      0.57      0.56      2019
weighted avg       0.65      0.64      0.64      2019


Stage 25: Targeted Encoding on Job Titles, Training and Evaluation Complete.


In [41]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce
from scipy.stats import randint as sp_randint, uniform as sp_uniform

# --- 26. Hyperparameter Tuning on LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 26 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer a new numeric feature: years_of_experience_proxy
df['years_of_experience_proxy'] = 2024 - df['work_year']

# Update the feature lists for preprocessing
categorical_features_ohe = ['experience_level', 'employment_type', 'company_size', 'employee_residence', 'company_location']
categorical_features_target_enc = ['job_title_grouped']
numeric_features = ['remote_ratio', 'years_of_experience_proxy']

# Create the feature matrix X and target vector y
X = df[categorical_features_ohe + categorical_features_target_enc + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Use ColumnTransformer to apply different encoders to different columns
print("\nApplying One-Hot and Target Encoding to features...")
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features_ohe),
        ('target_encoder', ce.TargetEncoder(cols=categorical_features_target_enc), categorical_features_target_enc)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train_tier)
# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)
print("Encoding applied.")

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")

# Apply SMOTE to the preprocessed training data to handle class imbalance
print("\nApplying SMOTE to balance the preprocessed training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Hyperparameter Tuning for the LightGBM Classifier Model...")

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'n_estimators': sp_randint(100, 1000),
    'learning_rate': sp_uniform(0.01, 0.2),
    'num_leaves': sp_randint(20, 150)
}

# Initialize the LightGBM Classifier
lgbm_clf = lgb.LGBMClassifier(objective='multiclass', num_class=3, random_state=42, n_jobs=-1)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgbm_clf,
    param_distributions=param_dist,
    n_iter=50,  # Number of parameter settings that are sampled
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    random_state=42,
    verbose=1
)

# Fit RandomizedSearchCV to the resampled training data
random_search.fit(X_train_resampled, y_train_resampled)
print("Hyperparameter tuning complete.")
print(f"Best parameters found: {random_search.best_params_}")

print("\n3. Making predictions and evaluating the tuned classification model...")
# Use the best estimator found by the search to make predictions
best_lgbm_clf = random_search.best_estimator_
y_pred_tier = best_lgbm_clf.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with Hyperparameter Tuning) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 26: Hyperparameter Tuning, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 26 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying One-Hot and Target Encoding to features...
Encoding applied.
Preprocessed training data shape: (8074, 171)

Applying SMOTE to balance the preprocessed training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 171)
Target: (14994,)

2. Hyperparameter Tuning for the LightGBM Classifier Model...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 

In [42]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 27. Regression to Classification Approach with LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 27 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer a new numeric feature: years_of_experience_proxy
df['years_of_experience_proxy'] = 2024 - df['work_year']

# Update the feature lists for preprocessing
categorical_features_ohe = ['experience_level', 'employment_type', 'company_size', 'employee_residence', 'company_location']
categorical_features_target_enc = ['job_title_grouped']
numeric_features = ['remote_ratio', 'years_of_experience_proxy']

# Create the feature matrix X and target vector y
X = df[categorical_features_ohe + categorical_features_target_enc + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
# Note: For regression, we split on the original continuous target `y`
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Use ColumnTransformer to apply different encoders to different columns
print("\nApplying One-Hot and Target Encoding to features...")
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features_ohe),
        ('target_encoder', ce.TargetEncoder(cols=categorical_features_target_enc), categorical_features_target_enc)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train)
# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)
print("Encoding applied.")

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")

# --- Since this is a regression problem, we don't use SMOTE to oversample the target. ---

print("\n2. Training the LightGBM Regressor Model...")
# Initialize the LightGBM Regressor
lgbm_reg = lgb.LGBMRegressor(
    objective='regression_l1', # Use L1 objective for robustness to outliers
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_reg.fit(X_train_preprocessed, y_train)
print("Training complete.")

print("\n3. Making predictions and evaluating the new regression model...")
# Make predictions on the test set
y_pred_reg = lgbm_reg.predict(X_test_preprocessed)

# Now, we convert the regression predictions into our salary tiers for classification evaluation
# We use the same bins as before
y_pred_tier_reg = pd.cut(y_pred_reg, bins=bins, labels=labels, right=False)
y_test_tier = pd.cut(y_test, bins=bins, labels=labels, right=False)
y_pred_tier_encoded = le.fit_transform(y_pred_tier_reg)
y_test_tier_encoded = le.transform(y_test_tier)

# Evaluate the performance using classification metrics
accuracy = accuracy_score(y_test_tier_encoded, y_pred_tier_encoded)

print("\n--- New Model Performance (with Regression to Classification Approach) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier_encoded, y_pred_tier_encoded, target_names=le.classes_))

print("\nStage 27: Regression to Classification, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 27 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying One-Hot and Target Encoding to features...
Encoding applied.
Preprocessed training data shape: (8074, 171)

2. Training the LightGBM Regressor Model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001800 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 8074, number of used features: 33
[LightGBM] [Info] Start training from score 138900.000000
Training complete.

3. Making predictions and evaluating the new regression model...





--- New Model Performance (with Regression to Classification Approach) ---
Accuracy: 0.6919

Classification Report:
              precision    recall  f1-score   support

        High       0.74      0.88      0.80      1210
         Low       0.66      0.47      0.55       167
      Medium       0.56      0.39      0.46       642

    accuracy                           0.69      2019
   macro avg       0.65      0.58      0.60      2019
weighted avg       0.67      0.69      0.67      2019


Stage 27: Regression to Classification, Training and Evaluation Complete.


In [43]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
import category_encoders as ce

# --- 28. Weighted Regression to Classification Approach with LightGBM ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 28 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer a new numeric feature: years_of_experience_proxy
df['years_of_experience_proxy'] = 2024 - df['work_year']

# Update the feature lists for preprocessing
categorical_features_ohe = ['experience_level', 'employment_type', 'company_size', 'employee_residence', 'company_location']
categorical_features_target_enc = ['job_title_grouped']
numeric_features = ['remote_ratio', 'years_of_experience_proxy']

# Create the feature matrix X and target vector y
X = df[categorical_features_ohe + categorical_features_target_enc + numeric_features]
y = df[target]

# Define salary bins and create a new target variable for weighting
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
_, _, y_train_tier, _ = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)

# Use ColumnTransformer to apply different encoders to different columns
print("\nApplying One-Hot and Target Encoding to features...")
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features_ohe),
        ('target_encoder', ce.TargetEncoder(cols=categorical_features_target_enc), categorical_features_target_enc)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train)
# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)
print("Encoding applied.")

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")

# --- Creating a sample weight array to address class imbalance ---
print("\nCreating sample weights to address class imbalance...")
class_counts = pd.Series(y_train_tier).value_counts()
total_samples = len(y_train_tier)
class_weights = total_samples / (len(class_counts) * class_counts)

# Map the weights to the training data
sample_weights = np.array([class_weights[tier] for tier in y_train_tier])

print("\n2. Training the LightGBM Regressor Model with sample weights...")
# Initialize the LightGBM Regressor
lgbm_reg = lgb.LGBMRegressor(
    objective='regression_l1',
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

# Fit the model with the sample weights
lgbm_reg.fit(X_train_preprocessed, y_train, sample_weight=sample_weights)
print("Training complete.")

print("\n3. Making predictions and evaluating the new regression model...")
# Make predictions on the test set
y_pred_reg = lgbm_reg.predict(X_test_preprocessed)

# Now, we convert the regression predictions into our salary tiers for classification evaluation
# We use the same bins as before
y_pred_tier_reg = pd.cut(y_pred_reg, bins=bins, labels=labels, right=False)
y_test_tier = pd.cut(y_test, bins=bins, labels=labels, right=False)
y_pred_tier_encoded = le.fit_transform(y_pred_tier_reg)
y_test_tier_encoded = le.transform(y_test_tier)

# Evaluate the performance using classification metrics
accuracy = accuracy_score(y_test_tier_encoded, y_pred_tier_encoded)

print("\n--- New Model Performance (with Weighted Regression) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier_encoded, y_pred_tier_encoded, target_names=le.classes_))

print("\nStage 28: Weighted Regression, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 28 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 8)

Applying One-Hot and Target Encoding to features...
Encoding applied.
Preprocessed training data shape: (8074, 171)

Creating sample weights to address class imbalance...

2. Training the LightGBM Regressor Model with sample weights...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 8074, number of used features: 33
[LightGBM] [Info] Start training from score 92280.000000
Training complete.

3. Making predictions and evaluating the new regression model...

--- New Model Performance (with Weighted Regression) -



In [44]:
# Import necessary libraries for this stage
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import category_encoders as ce

# --- 29. Statistical Feature Engineering and LightGBM Classifier ---
print("--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 29 ---")
print("\n1. Feature Engineering and Data Preparation...")

# Re-create the data preparation steps for a self-contained example
df = pd.read_csv('salaries.csv')
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Define features and target
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence',
                        'company_location', 'company_size']
numeric_features = ['work_year', 'remote_ratio']
target = 'salary_in_usd'

# Check if all required features are in the DataFrame before proceeding
all_features = categorical_features + numeric_features + [target]
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    print(f"Error: The following required features are missing from the dataset: {missing_features}")
    print("Please check your 'salaries.csv' file.")
    raise KeyError("Missing required features in the DataFrame.")

# Engineer a new feature: Grouping job titles
# Create a dictionary to map similar job titles to a general category
title_mapping = {
    # Data Scientists
    'Data Scientist': 'Data Scientist', 'Principal Data Scientist': 'Data Scientist',
    'Applied Data Scientist': 'Data Scientist', 'AI Scientist': 'Data Scientist',
    'Staff Data Scientist': 'Data Scientist', 'Research Scientist': 'Data Scientist',
    'Data Science Consultant': 'Data Scientist', 'Lead Data Scientist': 'Data Scientist',
    'Data Science Lead': 'Data Scientist',
    # Data Engineers
    'Data Engineer': 'Data Engineer', 'Lead Data Engineer': 'Data Engineer',
    'Principal Data Engineer': 'Data Engineer', 'Data Engineering Manager': 'Data Engineer',
    'Data Engineering Specialist': 'Data Engineer', 'Staff Data Engineer': 'Data Engineer',
    'Cloud Data Engineer': 'Data Engineer', 'Director of Data Engineering': 'Data Engineer',
    # Data Analysts
    'Data Analyst': 'Data Analyst', 'Lead Data Analyst': 'Data Analyst',
    'Business Intelligence Analyst': 'Data Analyst', 'Principal Data Analyst': 'Data Analyst',
    'Data Analytics Manager': 'Data Analyst', 'Data Analytics Lead': 'Data Analyst',
    'Data Analytics Specialist': 'Data Analyst',
    # Machine Learning Specialists
    'Machine Learning Engineer': 'Machine Learning Engineer', 'ML Engineer': 'Machine Learning Engineer',
    'Principal Machine Learning Engineer': 'Machine Learning Engineer', 'Machine Learning Scientist': 'Machine Learning Engineer',
    'Machine Learning Manager': 'Machine Learning Engineer', 'Applied Machine Learning Scientist': 'Machine Learning Engineer',
    'Head of Machine Learning': 'Machine Learning Engineer',
    # Other Roles
    'Data Architect': 'Data Architect', 'Head of Data': 'Head of Data',
    'Data Science Manager': 'Data Science Manager', 'Director of Data Science': 'Data Science Manager',
    'Head of Data Science': 'Data Science Manager', 'Analytics Engineer': 'Analytics Engineer',
    'BI Analyst': 'Business Intelligence Analyst', 'BI Developer': 'Business Intelligence Analyst',
    'Business Intelligence Engineer': 'Business Intelligence Analyst', 'ETL Developer': 'Data Engineer',
    'Computer Vision Engineer': 'Computer Vision Engineer', 'NLP Engineer': 'NLP Engineer',
    'Research Engineer': 'Research Engineer', 'Financial Data Analyst': 'Data Analyst'
}
df['job_title_grouped'] = df['job_title'].apply(
    lambda x: title_mapping.get(x, 'Other')
)

# Engineer a new numeric feature: years_of_experience_proxy
df['years_of_experience_proxy'] = 2024 - df['work_year']

# Engineer new binary features for top locations
df['is_us_company'] = (df['company_location'] == 'US').astype(int)
df['is_gb_company'] = (df['company_location'] == 'GB').astype(int)
df['is_us_residence'] = (df['employee_residence'] == 'US').astype(int)
df['is_gb_residence'] = (df['employee_residence'] == 'GB').astype(int)

# Update the feature lists for preprocessing
categorical_features_ohe = ['experience_level', 'employment_type', 'company_size']
categorical_features_target_enc = ['job_title_grouped']
numeric_features = ['remote_ratio', 'years_of_experience_proxy', 'is_us_company',
                    'is_gb_company', 'is_us_residence', 'is_gb_residence']

# Create the feature matrix X and target vector y
X = df[categorical_features_ohe + categorical_features_target_enc + numeric_features]
y = df[target]

# Define salary bins and create a new target variable
bins = [0, 60000, 120000, np.inf]
labels = ['Low', 'Medium', 'High']
df['salary_tier'] = pd.cut(df['salary_in_usd'], bins=bins, labels=labels, right=False)

# Use LabelEncoder to convert the new salary tiers into integers for LightGBM
le = LabelEncoder()
y_tier_encoded = le.fit_transform(df['salary_tier'])

print("Features processed and new target variable created.")
print(f"Shape of the initial feature matrix: {X.shape}")

# Splitting the data before encoding to prevent data leakage
X_train, X_test, y_train_tier, y_test_tier = train_test_split(
    X, y_tier_encoded, test_size=0.2, random_state=42
)
_, _, y_train_continuous, _ = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Use ColumnTransformer to apply different encoders to different columns
print("\nApplying One-Hot and Target Encoding to features...")
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features_ohe),
        ('target_encoder', ce.TargetEncoder(cols=categorical_features_target_enc), categorical_features_target_enc)
    ],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train_continuous)
# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)
print("Encoding applied.")

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")

# Apply SMOTE to the preprocessed training data to handle class imbalance
print("\nApplying SMOTE to balance the preprocessed training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train_tier)
print("SMOTE applied. Resampled and encoded training data shapes:")
print(f"Features: {X_train_resampled.shape}")
print(f"Target: {y_train_resampled.shape}")

print("\n2. Training the LightGBM Classifier Model with new statistical features...")
# Initialize the LightGBM Classifier
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=100,
    random_state=42,
    n_jobs=-1
)

lgbm_clf.fit(X_train_resampled, y_train_resampled)
print("Training complete.")

print("\n3. Making predictions and evaluating the new classification model...")
y_pred_tier = lgbm_clf.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test_tier, y_pred_tier)

print("\n--- New Model Performance (with Statistical Feature Engineering) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_tier, y_pred_tier, target_names=le.classes_))

print("\nStage 29: Statistical Feature Engineering, Training and Evaluation Complete.")


--- MLPayGrade Advanced Track: Cleaned Notebook Code - Stage 29 ---

1. Feature Engineering and Data Preparation...
Features processed and new target variable created.
Shape of the initial feature matrix: (10093, 10)

Applying One-Hot and Target Encoding to features...
Encoding applied.
Preprocessed training data shape: (8074, 18)

Applying SMOTE to balance the preprocessed training data...
SMOTE applied. Resampled and encoded training data shapes:
Features: (14994, 18)
Target: (14994,)

2. Training the LightGBM Classifier Model with new statistical features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025987 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3239
[LightGBM] [Info] Number of data points in the train set: 14994, number of used features: 18
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start t




--- New Model Performance (with Statistical Feature Engineering) ---
Accuracy: 0.6340

Classification Report:
              precision    recall  f1-score   support

        High       0.78      0.74      0.76      1210
         Low       0.38      0.43      0.40       167
      Medium       0.46      0.48      0.47       642

    accuracy                           0.63      2019
   macro avg       0.54      0.55      0.54      2019
weighted avg       0.64      0.63      0.64      2019


Stage 29: Statistical Feature Engineering, Training and Evaluation Complete.
