# Job Description Multi-Label Classification
This project implements a machine learning pipeline to classify job descriptions into multiple categories based on responses to a set of predefined questions. The model is trained using a multi-label classification approach, leveraging XGBoost classifier wrapped in a MultiOutputClassifier to handle the multiple outputs. The project also includes hyperparameter tuning using GridSearchCV to optimize model performance.

In [11]:
# pip is a package manager for Python libraries, and we use it to install the required libraries for this project.
!pip install pandas scikit-learn seaborn xgboost
# Import necessary libraries for data manipulation, feature extraction, model training, and evaluation
import pandas as pd  # For data manipulation and analysis
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into TF-IDF features
from sklearn.multioutput import MultiOutputClassifier  # For multi-label classification
from sklearn.model_selection import train_test_split, GridSearchCV  # For splitting data and hyperparameter tuning
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # For evaluating model performance
import xgboost as xgb  # XGBoost classifier
from sklearn.preprocessing import LabelEncoder  # For encoding target labels

# Step 1: Load the dataset
# The dataset is expected to be a CSV file with job descriptions and responses to multiple questions.
file_path = 'C:/Users/nosao/Desktop/Maxwell-Text Classification/Target Response/data/Target Response DB.csv'  # Update with your actual file path
df = pd.read_csv(file_path)

# Step 2: Encode the responses
# Responses are encoded as integers for model compatibility.
response_columns = ['Question 7', 'Question 8', 'Question 9', 'Question 10', 'Question 11']
df_encoded = df.copy()

# Map response labels 'A', 'B', 'C', 'D' to integers 0, 1, 2, 3 respectively
response_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
reverse_mapping = {v: k for k, v in response_mapping.items()}  # Reverse mapping for decoding predictions later
for col in response_columns:
    df_encoded[col] = df_encoded[col].map(response_mapping)

# Step 3: Split data into features and labels
# 'Job description' is the feature, and the responses to the questions are the labels.
X = df_encoded['Job description']  # Features: job descriptions
y = df_encoded[response_columns]  # Labels: encoded responses to each question

# Encode each label column to ensure class labels are continuous and start from 0
label_encoders = {}
for col in response_columns:
    le = LabelEncoder()  # Initialize a LabelEncoder for each target column
    y[col] = le.fit_transform(y[col])  # Encode the target column
    label_encoders[col] = le  # Store the LabelEncoder for later use

# Step 4: Text Preprocessing and Vectorization
# Convert the job descriptions to TF-IDF features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')  # Use bigrams and remove common English stopwords
X_vectorized = vectorizer.fit_transform(X)

# Step 5: Split data into training and testing sets
# The data is split into training (80%) and testing (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Step 6: Define the XGBClassifier model
# XGBoost is a powerful gradient boosting model. We use it here as the base estimator.
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Step 7: Define the MultiOutputClassifier
# MultiOutputClassifier allows us to apply the XGBoost model to multiple output labels (i.e., multiple questions).
chain_model = MultiOutputClassifier(xgb_model)

# Step 8: Define the hyperparameters grid
# GridSearchCV will search over these hyperparameters to find the best model.
param_grid = {
    'estimator__n_estimators': [100, 200],  # Number of boosting rounds
    'estimator__learning_rate': [0.01, 0.1],  # Learning rate for boosting
    'estimator__max_depth': [3, 5, 7]  # Maximum depth of a tree
}

# Step 9: Initialize and perform Grid Search
# GridSearchCV is used to perform an exhaustive search over the hyperparameters.
grid_search_xgb = GridSearchCV(chain_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)  # Fit the grid search to the training data

# Get the best model from the grid search
best_xgb_model = grid_search_xgb.best_estimator_

# Step 10: Predict on the test set
# Use the best model to make predictions on the test set.
y_pred_xgb = best_xgb_model.predict(X_test)

# Convert predictions to a DataFrame for easy manipulation and decoding
predictions_xgb = pd.DataFrame(y_pred_xgb, columns=response_columns)

# Decode the predictions back to the original labels (A, B, C, D)
for col in response_columns:
    predictions_xgb[col] = label_encoders[col].inverse_transform(predictions_xgb[col])

# Step 11: Initialize metric dictionaries
# These dictionaries will store evaluation metrics for each question.
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': []
}

# Decode y_test for evaluation
for col in response_columns:
    y_test[col] = label_encoders[col].inverse_transform(y_test[col])

# Step 12: Calculate evaluation metrics for each question
# Metrics calculated include accuracy, precision, recall, and F1 score.
for i, col in enumerate(response_columns):
    metrics['accuracy'].append(accuracy_score(y_test[col], predictions_xgb[col]))
    metrics['precision'].append(precision_score(y_test[col], predictions_xgb[col], average='macro', zero_division=1))
    metrics['recall'].append(recall_score(y_test[col], predictions_xgb[col], average='macro', zero_division=1))
    metrics['f1'].append(f1_score(y_test[col], predictions_xgb[col], average='macro', zero_division=1))

# Step 13: Average the metrics across all questions
# Calculate the average of each metric across all questions.
avg_metrics = {metric: sum(values) / len(values) for metric, values in metrics.items()}

# Step 14: Adjust predictions based on a custom rule (assuming adjust_predictions is defined elsewhere)
# Adjusted predictions may be modified based on a specific business rule or condition.
# Define a function to adjust predictions based on a custom rule
# This function adjusts the predictions such that once an 'A' is found, all preceding responses are set to 'D'.
def adjust_predictions(predictions):
    for index, row in predictions.iterrows():
        main_focus_found = False
        for col in reversed(predictions.columns):
            if main_focus_found:
                predictions.at[index, col] = 'D'  # Mark all preceding questions as 'D'
            if row[col] == 'A':
                main_focus_found = True
    return predictions

# Step 15: Adjust predictions using the custom rule
adjusted_predictions = adjust_predictions(predictions_xgb.copy())

# Step 16: Example prediction for a new job description
# Predict responses for a new job description.
new_description = ["To provide of an effective Joinery resource to ensure the University fabric is efficiently maintained..."]
new_description_vectorized = vectorizer.transform(new_description)
predictions_new = best_xgb_model.predict(new_description_vectorized)

# Convert and decode the predictions for the new description
predictions_new_df = pd.DataFrame(predictions_new, columns=response_columns)
for col in response_columns:
    predictions_new_df[col] = label_encoders[col].inverse_transform(predictions_new_df[col])

# Adjust the new predictions based on the custom rule
adjusted_predictions_new = adjust_predictions(predictions_new_df.copy())

# Step 17: Decode the numeric predictions back to their original labels
# Convert the adjusted predictions into the original response labels (A, B, C, D).
decoded_predictions = {col: reverse_mapping[val] for col, val in adjusted_predictions_new.iloc[0].items()}
print("Predicted Responses:", decoded_predictions)

# Step 18: Print evaluation metrics
# Display the averaged accuracy, precision, recall, and F1 score.
print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Precision: {avg_metrics['precision']:.4f}")
print(f"Recall: {avg_metrics['recall']:.4f}")
print(f"F1 Score: {avg_metrics['f1']:.4f}")







[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[col] = le.fit_transform(y[col])  # Encode the target column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[col] = le.fit_transform(y[col])  # Encode the target column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

Predicted Responses: {'Question 7': 'B', 'Question 8': 'C', 'Question 9': 'D', 'Question 10': 'C', 'Question 11': 'B'}
Accuracy: 0.6333
Precision: 0.7040
Recall: 0.6167
F1 Score: 0.5197


In [13]:
# Step 16: Example prediction for a new job description
# Predict responses for a new job description.
new_description = ["To lead the procurement and sourcing function for Leeds Trinity University, ensuring all goods and services are sourced centrally or through agreed partners & approved routes, and that there is effective negotiation and supplier management both in the purchase and the monitoring of supplier performance. To liaise with the relevant purchasing consortia to achieve best value, to benchmark where appropriate and to market test certain activities."]
new_description_vectorized = vectorizer.transform(new_description)
predictions_new = best_xgb_model.predict(new_description_vectorized)

# Convert and decode the predictions for the new description
predictions_new_df = pd.DataFrame(predictions_new, columns=response_columns)
for col in response_columns:
    predictions_new_df[col] = label_encoders[col].inverse_transform(predictions_new_df[col])

# Adjust the new predictions based on the custom rule
adjusted_predictions_new = adjust_predictions(predictions_new_df.copy())

# Step 17: Decode the numeric predictions back to their original labels
# Convert the adjusted predictions into the original response labels (A, B, C, D).
decoded_predictions = {col: reverse_mapping[val] for col, val in adjusted_predictions_new.iloc[0].items()}
print("Predicted Responses:", decoded_predictions)

# Step 18: Print evaluation metrics
# Display the averaged accuracy, precision, recall, and F1 score.
print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Precision: {avg_metrics['precision']:.4f}")
print(f"Recall: {avg_metrics['recall']:.4f}")
print(f"F1 Score: {avg_metrics['f1']:.4f}")


Predicted Responses: {'Question 7': 'B', 'Question 8': 'B', 'Question 9': 'D', 'Question 10': 'B', 'Question 11': 'B'}
Accuracy: 0.6333
Precision: 0.7040
Recall: 0.6167
F1 Score: 0.5197
