In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
file_path = 'Target Response DB.csv'
df = pd.read_csv(file_path)

# Encode the responses
response_columns = ['Question 7', 'Question 8', 'Question 9', 'Question 10', 'Question 11']
df_encoded = df.copy()

# Encoding A, B, C, D to 0, 1, 2, 3
response_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
reverse_mapping = {v: k for k, v in response_mapping.items()}
for col in response_columns:
    df_encoded[col] = df_encoded[col].map(response_mapping)

# Split data into features and labels
X = df_encoded['Job description']
y = df_encoded[response_columns]

# Text Preprocessing and Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')  # Using bigrams and removing stop words
X_vectorized = vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Define the model
base_model = GradientBoostingClassifier(random_state=42)

# Define the hyperparameters grid
param_grid = {
    'estimator__n_estimators': [50, 100],
    'estimator__learning_rate': [0.05, 0.1],
    'estimator__max_depth': [3, 5]
}

# Initialize Grid Search
grid_search = GridSearchCV(MultiOutputClassifier(base_model), param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Convert predictions to DataFrame for easy manipulation
predictions = pd.DataFrame(y_pred, columns=response_columns)
for col in response_columns:
    predictions[col] = predictions[col].map(reverse_mapping)

# Initialize metric dictionaries
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': []
}

# Calculate evaluation metrics for each question
for i, col in enumerate(response_columns):
    y_test_col = y_test[col].map(reverse_mapping)
    metrics['accuracy'].append(accuracy_score(y_test_col, predictions[col]))
    metrics['precision'].append(precision_score(y_test_col, predictions[col], average='macro', zero_division=1))
    metrics['recall'].append(recall_score(y_test_col, predictions[col], average='macro', zero_division=1))
    metrics['f1'].append(f1_score(y_test_col, predictions[col], average='macro', zero_division=1))

# Average the metrics across all questions
avg_metrics = {metric: sum(values) / len(values) for metric, values in metrics.items()}

def adjust_predictions(predictions):
    for index, row in predictions.iterrows():
        main_focus_found = False
        for col in reversed(predictions.columns):
            if main_focus_found:
                # Mark all preceding questions as 'D'
                predictions.at[index, col] = 'D'
            if row[col] == 'A':
                main_focus_found = True
    return predictions

# Adjust predictions based on the rule
adjusted_predictions = adjust_predictions(predictions.copy())

# Example prediction
new_description = ["To provide of an effective Joinery resource to ensure the University fabric is efficiently maintained..."]
new_description_vectorized = vectorizer.transform(new_description)
predictions_new = best_model.predict(new_description_vectorized)

# Decode the predictions
predictions_new_df = pd.DataFrame(predictions_new, columns=response_columns)
for col in response_columns:
    predictions_new_df[col] = predictions_new_df[col].map(reverse_mapping)

# Adjust the new predictions
adjusted_predictions_new = adjust_predictions(predictions_new_df.copy())

# Print the job description and its predicted responses
print("Job Description:", new_description[0])
print("Predicted Responses:", adjusted_predictions_new.iloc[0].to_dict())

# Print evaluation metrics
print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Precision: {avg_metrics['precision']:.4f}")
print(f"Recall: {avg_metrics['recall']:.4f}")
print(f"F1 Score: {avg_metrics['f1']:.4f}")




Job Description: To provide of an effective Joinery resource to ensure the University fabric is efficiently maintained...
Predicted Responses: {'Question 7': 'B', 'Question 8': 'C', 'Question 9': 'D', 'Question 10': 'B', 'Question 11': 'B'}
Accuracy: 0.6333
Precision: 0.6434
Recall: 0.6146
F1 Score: 0.5297


ValueError: 
All the 36 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 450, in fit
    super().fit(X, Y, sample_weight, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 216, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
             ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 597, in __init__
    self.results = batch()
                   ^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 123, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 49, in _fit_estimator
    estimator.fit(X, y, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [1 2 3]

--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 450, in fit
    super().fit(X, Y, sample_weight, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 216, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
             ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 597, in __init__
    self.results = batch()
                   ^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 123, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 49, in _fit_estimator
    estimator.fit(X, y, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [1 3]


In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = 'Target Response DB.csv'
df = pd.read_csv(file_path)

# Encode the responses
response_columns = ['Question 7', 'Question 8', 'Question 9', 'Question 10', 'Question 11']
df_encoded = df.copy()

# Encoding A, B, C, D to 0, 1, 2, 3
response_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
reverse_mapping = {v: k for k, v in response_mapping.items()}
for col in response_columns:
    df_encoded[col] = df_encoded[col].map(response_mapping)

# Encode each label column
for col in response_columns:
    le = LabelEncoder()
    y_train[col] = le.fit_transform(y_train[col])
    y_test[col] = le.transform(y_test[col])

# Re-run the model training and evaluation
grid_search_xgb.fit(X_train, y_train)

# Split data into features and labels
X = df_encoded['Job description']
y = df_encoded[response_columns]

# Text Preprocessing and Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')  # Using bigrams and removing stop words
X_vectorized = vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Define the model
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Define the hyperparameters grid
param_grid = {
    'estimator__n_estimators': [100, 200],
    'estimator__learning_rate': [0.01, 0.1],
    'estimator__max_depth': [3, 5, 7]
}

# Initialize Grid Search
grid_search_xgb = GridSearchCV(MultiOutputClassifier(xgb_model), param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)

# Best model from grid search
best_xgb_model = grid_search_xgb.best_estimator_

# Predict on the test set
y_pred_xgb = best_xgb_model.predict(X_test)

# Convert predictions to DataFrame for easy manipulation
predictions_xgb = pd.DataFrame(y_pred_xgb, columns=response_columns)
for col in response_columns:
    predictions_xgb[col] = predictions_xgb[col].map(reverse_mapping)

# Initialize metric dictionaries
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': []
}

# Calculate evaluation metrics for each question
for i, col in enumerate(response_columns):
    y_test_col = y_test[col].map(reverse_mapping)
    metrics['accuracy'].append(accuracy_score(y_test_col, predictions_xgb[col]))
    metrics['precision'].append(precision_score(y_test_col, predictions_xgb[col], average='macro', zero_division=1))
    metrics['recall'].append(recall_score(y_test_col, predictions_xgb[col], average='macro', zero_division=1))
    metrics['f1'].append(f1_score(y_test_col, predictions_xgb[col], average='macro', zero_division=1))

# Average the metrics across all questions
avg_metrics = {metric: sum(values) / len(values) for metric, values in metrics.items()}

def adjust_predictions(predictions):
    for index, row in predictions.iterrows():
        main_focus_found = False
        for col in reversed(predictions.columns):
            if main_focus_found:
                # Mark all preceding questions as 'D'
                predictions.at[index, col] = 'D'
            if row[col] == 'A':
                main_focus_found = True
    return predictions

# Adjust predictions based on the rule
adjusted_predictions = adjust_predictions(predictions_xgb.copy())

# Example prediction
new_description = ["To provide of an effective Joinery resource to ensure the University fabric is efficiently maintained..."]
new_description_vectorized = vectorizer.transform(new_description)
predictions_new = best_xgb_model.predict(new_description_vectorized)

# Decode the predictions
predictions_new_df = pd.DataFrame(predictions_new, columns=response_columns)
for col in response_columns:
    predictions_new_df[col] = predictions_new_df[col].map(reverse_mapping)

# Adjust the new predictions
adjusted_predictions_new = adjust_predictions(predictions_new_df.copy())

# Print the job description and its predicted responses
print("Job Description:", new_description[0])
print("Predicted Responses:", adjusted_predictions_new.iloc[0].to_dict())

# Print evaluation metrics
print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Precision: {avg_metrics['precision']:.4f}")
print(f"Recall: {avg_metrics['recall']:.4f}")
print(f"F1 Score: {avg_metrics['f1']:.4f}")


24 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 450, in fit
    super().fit(X, Y, sample_weight, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 216, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\utils\parallel

ValueError: 
All the 36 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 450, in fit
    super().fit(X, Y, sample_weight, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 216, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
             ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 597, in __init__
    self.results = batch()
                   ^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 123, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 49, in _fit_estimator
    estimator.fit(X, y, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [1 2 3]

--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 450, in fit
    super().fit(X, Y, sample_weight, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 216, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
             ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 597, in __init__
    self.results = batch()
                   ^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\joblib\parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\utils\parallel.py", line 123, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\sklearn\multioutput.py", line 49, in _fit_estimator
    estimator.fit(X, y, **fit_params)
  File "c:\Users\nosao\anaconda3\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\nosao\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [1 3]


In [9]:
for col in response_columns:
    print(f"Classes in {col}: {y_train[col].unique()}")


Classes in Question 7: [1 3 2]
Classes in Question 8: [1 3 2]
Classes in Question 9: [3 0 1]
Classes in Question 10: [2 1 3]
Classes in Question 11: [1 2]


In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = 'Target Response DB.csv'
df = pd.read_csv(file_path)

# Encode the responses
response_columns = ['Question 7', 'Question 8', 'Question 9', 'Question 10', 'Question 11']
df_encoded = df.copy()

# Encoding A, B, C, D to 0, 1, 2, 3
response_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
reverse_mapping = {v: k for k, v in response_mapping.items()}
for col in response_columns:
    df_encoded[col] = df_encoded[col].map(response_mapping)

# Split data into features and labels
X = df_encoded['Job description']
y = df_encoded[response_columns]

# Encode each label column to ensure class labels are continuous and start from 0
label_encoders = {}
for col in response_columns:
    le = LabelEncoder()
    y[col] = le.fit_transform(y[col])
    label_encoders[col] = le

# Text Preprocessing and Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')  # Using bigrams and removing stop words
X_vectorized = vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Define the XGBClassifier model
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Define the MultiOutputClassifier with the XGBClassifier
chain_model = MultiOutputClassifier(xgb_model)

# Define the hyperparameters grid
param_grid = {
    'estimator__n_estimators': [100, 200],
    'estimator__learning_rate': [0.01, 0.1],
    'estimator__max_depth': [3, 5, 7]
}

# Initialize Grid Search
grid_search_xgb = GridSearchCV(chain_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)

# Best model from grid search
best_xgb_model = grid_search_xgb.best_estimator_

# Predict on the test set
y_pred_xgb = best_xgb_model.predict(X_test)

# Convert predictions to DataFrame for easy manipulation
predictions_xgb = pd.DataFrame(y_pred_xgb, columns=response_columns)

# Decode the predictions back to the original labels
for col in response_columns:
    predictions_xgb[col] = label_encoders[col].inverse_transform(predictions_xgb[col])

# Initialize metric dictionaries
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': []
}

# Decode y_test for evaluation
for col in response_columns:
    y_test[col] = label_encoders[col].inverse_transform(y_test[col])

# Calculate evaluation metrics for each question
for i, col in enumerate(response_columns):
    metrics['accuracy'].append(accuracy_score(y_test[col], predictions_xgb[col]))
    metrics['precision'].append(precision_score(y_test[col], predictions_xgb[col], average='macro', zero_division=1))
    metrics['recall'].append(recall_score(y_test[col], predictions_xgb[col], average='macro', zero_division=1))
    metrics['f1'].append(f1_score(y_test[col], predictions_xgb[col], average='macro', zero_division=1))

# Average the metrics across all questions
avg_metrics = {metric: sum(values) / len(values) for metric, values in metrics.items()}


# Adjust predictions based on the rule
adjusted_predictions = adjust_predictions(predictions_xgb.copy())

# Example prediction
new_description = ["To provide of an effective Joinery resource to ensure the University fabric is efficiently maintained..."]
new_description_vectorized = vectorizer.transform(new_description)
predictions_new = best_xgb_model.predict(new_description_vectorized)

# Decode the predictions for the new description
predictions_new_df = pd.DataFrame(predictions_new, columns=response_columns)
for col in response_columns:
    predictions_new_df[col] = label_encoders[col].inverse_transform(predictions_new_df[col])

# Adjust the new predictions
adjusted_predictions_new = adjust_predictions(predictions_new_df.copy())


# Decode the numeric predictions back to their original labels
decoded_predictions = {col: reverse_mapping[val] for col, val in adjusted_predictions_new.iloc[0].items()}
print("Predicted Responses:", decoded_predictions)

# Print evaluation metrics
print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Precision: {avg_metrics['precision']:.4f}")
print(f"Recall: {avg_metrics['recall']:.4f}")
print(f"F1 Score: {avg_metrics['f1']:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[col] = le.fit_transform(y[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[col] = le.fit_transform(y[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[col] = le.fit_transform(y[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Predicted Responses: {'Question 7': 'B', 'Question 8': 'C', 'Question 9': 'D', 'Question 10': 'C', 'Question 11': 'B'}
Accuracy: 0.6333
Precision: 0.7040
Recall: 0.6167
F1 Score: 0.5197
