#Job Description Classification and Response Prediction

This notebook demonstrates how to preprocess job descriptions, train machine learning models (Logistic Regression, SVM, Random Forest), and predict responses to job descriptions using different classifiers


In [4]:
# Import necessary libraries
# pip is a package manager for Python libraries, and we use it to install the required libraries for this project.
!pip install nltk pandas scikit-learn

# Import essential modules and functions
import pandas as pd  # For data manipulation and analysis
from sklearn.model_selection import train_test_split, GridSearchCV  # For splitting the data and performing grid search for hyperparameter tuning
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text data into TF-IDF features
from sklearn.linear_model import LogisticRegression  # Logistic Regression classifier
from sklearn.svm import SVC  # Support Vector Classifier
from sklearn.ensemble import RandomForestClassifier  # Random Forest Classifier
from sklearn.pipeline import Pipeline  # For creating machine learning pipelines
from sklearn.metrics import classification_report, accuracy_score  # For evaluating model performance
from nltk.corpus import stopwords  # For accessing a list of common stopwords
from nltk.stem import WordNetLemmatizer  # For lemmatizing words (reducing words to their base form)
from nltk.tokenize import word_tokenize  # For tokenizing text into words
import nltk  # Natural Language Toolkit (NLTK) for text processing

# Download necessary datasets from NLTK
# These datasets include:
# - 'punkt': used for tokenizing text into sentences and words
# - 'stopwords': a list of common English stopwords to be removed during text processing
# - 'wordnet': a lexical database for lemmatization
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset into a Pandas DataFrame
# The dataset a CSV file containing job descriptions and their corresponding labels.
data = pd.read_csv('single_response.csv')

# Preprocessing function to clean and prepare the text data
def preprocess_text(text):
    """
    Preprocesses a given text by:
    - Converting the text to lowercase
    - Tokenizing the text into words
    - Lemmatizing each word (converting words to their base form)
    - Removing stopwords and non-alphabetic tokens
    - Returning the processed text as a single string
    
    Args:
    text (str): The input text to preprocess.
    
    Returns:
    str: The preprocessed text.
    """
    lemmatizer = WordNetLemmatizer()  # Initialize the WordNet lemmatizer
    tokens = word_tokenize(text.lower())  # Convert text to lowercase and tokenize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stopwords.words('english')]  # Lemmatize and remove stopwords
    return ' '.join(tokens)  # Join tokens back into a single string

# Apply the preprocessing function to the job descriptions
# The column 'Job description' is expected to contain the raw text data.
data['description'] = data['Job description'].apply(preprocess_text)

# Print unique labels in the dataset to check for discrepancies
# This step helps ensure that all labels are accounted for and correctly mapped.
print("Unique labels in the dataset before mapping:", data['Label'].unique())

# Update the label mapping to match the labels in your dataset
# This dictionary maps each label to a unique integer, which is required for model training.
label_mapping = {
    'Response A': 0,
    'Response B': 1,
    'Response C': 2,
    'Response D': 3,
    'Response E': 4,  # Adjust these mappings based on your dataset's labels
    'Response F': 5   # Add or remove as necessary
}

# Apply the label mapping to the 'Label' column
data['Label'] = data['Label'].map(label_mapping)

# Check for any missing values in the 'Label' column after mapping
if data['Label'].isnull().any():
    # If there are missing values, print a message and the corresponding rows for further investigation.
    print("There are missing values in the Label column. Please check the data.")
    print(data[data['Label'].isnull()])
else:
    # Split the data into training and testing sets
    # X_train, X_test: Training and testing features (job descriptions)
    # y_train, y_test: Training and testing labels
    X_train, X_test, y_train, y_test = train_test_split(data['description'], data['Label'], test_size=0.2, random_state=42)

    # Define pipelines for different classifiers
    # A pipeline sequentially applies a list of transforms and a final estimator.
    pipelines = {
        'Logistic Regression': Pipeline([
            ('tfidf', TfidfVectorizer()),  # Converts text data to TF-IDF features
            ('clf', LogisticRegression(class_weight='balanced'))  # Logistic Regression classifier with balanced class weights
        ]),
        'SVM': Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', SVC(class_weight='balanced', probability=True))  # Support Vector Machine with probability estimates
        ]),
        'Random Forest': Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', RandomForestClassifier(class_weight='balanced'))  # Random Forest classifier
        ])
    }

    # Define parameter grids for hyperparameter tuning using GridSearchCV
    # These grids specify the hyperparameters to be tuned for each classifier.
    param_grids = {
        'Logistic Regression': {
            'tfidf__max_df': [0.8, 0.9, 1.0],  # Maximum document frequency for terms
            'tfidf__min_df': [1, 2, 3],  # Minimum document frequency for terms
            'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],  # N-gram ranges to consider
            'clf__C': [0.1, 1, 10],  # Inverse regularization strength for Logistic Regression
            'clf__solver': ['liblinear', 'lbfgs']  # Solvers to use for Logistic Regression
        },
        'SVM': {
            'tfidf__max_df': [0.8, 0.9, 1.0],
            'tfidf__min_df': [1, 2, 3],
            'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
            'clf__C': [0.1, 1, 10],  # Regularization parameter for SVM
            'clf__kernel': ['linear', 'rbf']  # Kernel types to use in the algorithm
        },
        'Random Forest': {
            'tfidf__max_df': [0.8, 0.9, 1.0],
            'tfidf__min_df': [1, 2, 3],
            'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
            'clf__n_estimators': [50, 100, 200]  # Number of trees in the forest
        }
    }

    # Initialize variables to keep track of the best model
    best_models = {}
    best_accuracy = 0
    best_model_name = None
    best_model = None

    # Perform GridSearchCV for each model in the pipelines dictionary
    for model_name in pipelines:
        print(f"Training {model_name}...")
        grid_search = GridSearchCV(pipelines[model_name], param_grids[model_name], cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train)  # Fit the model using the training data
        best_models[model_name] = grid_search.best_estimator_  # Store the best estimator for each model
        accuracy = grid_search.best_score_  # Retrieve the best accuracy score
        print(f"{model_name} best params: {grid_search.best_params_}")
        print(f"{model_name} best accuracy: {accuracy}")

        # Evaluate each model on the test set
        y_pred = best_models[model_name].predict(X_test)
        print(f'\n{model_name} classification report on test set:')
        print(classification_report(y_test, y_pred))

        # Check if this model has the highest accuracy so far
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model_name = model_name
            best_model = best_models[model_name]

    # Output the best model and its accuracy
    print(f"\nBest model is {best_model_name} with accuracy of {best_accuracy}")

    # Function to predict responses for new job descriptions using the best model
    def predict_responses(job_descriptions):
        """
        Predicts the response category for a list of job descriptions.
        
        Args:
        job_descriptions (list): A list of job descriptions to classify.
        
        Returns:
        list: Predicted response categories corresponding to the input job descriptions.
        """
        processed_descriptions = [preprocess_text(desc) for desc in job_descriptions]  # Preprocess the job descriptions
        predictions = best_model.predict(processed_descriptions)  # Predict using the best model
        reverse_label_mapping = {v: k for k, v in label_mapping.items()}  # Reverse the label mapping for readable output
        return [reverse_label_mapping[prediction] for prediction in predictions]  # Return human-readable predictions

    # Example usage of the prediction function
    job_descriptions = [
        "Manage university financial reports and budget forecasting.",
        "Assist in organizing office files and managing schedules."
    ]
    predicted_responses = predict_responses(job_descriptions)
    for job_desc, response in zip(job_descriptions, predicted_responses):
        print(f'Job Description: {job_desc}\nPredicted Response: {response}\n')




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nosao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nosao\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nosao\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unique labels in the dataset before mapping: ['Response C' 'Response B' 'Response A' 'Response D']
Training Logistic Regression...
Fitting 5 folds for each of 162 candidates, totalling 810 fits




Logistic Regression best params: {'clf__C': 10, 'clf__solver': 'lbfgs', 'tfidf__max_df': 0.8, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}
Logistic Regression best accuracy: 0.9385507246376813

Logistic Regression classification report on test set:
              precision    recall  f1-score   support

           1       0.71      0.83      0.77        18
           2       0.90      0.87      0.88        30
           3       1.00      0.78      0.88         9

    accuracy                           0.84        57
   macro avg       0.87      0.83      0.84        57
weighted avg       0.86      0.84      0.84        57

Training SVM...
Fitting 5 folds for each of 162 candidates, totalling 810 fits




SVM best params: {'clf__C': 10, 'clf__kernel': 'linear', 'tfidf__max_df': 0.8, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 1)}
SVM best accuracy: 0.9211594202898551

SVM classification report on test set:
              precision    recall  f1-score   support

           1       0.73      0.89      0.80        18
           2       0.93      0.87      0.90        30
           3       1.00      0.78      0.88         9

    accuracy                           0.86        57
   macro avg       0.89      0.84      0.86        57
weighted avg       0.88      0.86      0.86        57

Training Random Forest...
Fitting 5 folds for each of 81 candidates, totalling 405 fits




Random Forest best params: {'clf__n_estimators': 100, 'tfidf__max_df': 0.8, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}
Random Forest best accuracy: 0.9208695652173912

Random Forest classification report on test set:
              precision    recall  f1-score   support

           1       0.75      1.00      0.86        18
           2       1.00      0.87      0.93        30
           3       1.00      0.78      0.88         9

    accuracy                           0.89        57
   macro avg       0.92      0.88      0.89        57
weighted avg       0.92      0.89      0.90        57


Best model is Logistic Regression with accuracy of 0.9385507246376813
Job Description: Manage university financial reports and budget forecasting.
Predicted Response: Response C

Job Description: Assist in organizing office files and managing schedules.
Predicted Response: Response C



In [11]:
    # Function to predict the response for multiple job descriptions using the best model
def predict_responses(job_descriptions):
  processed_descriptions = [preprocess_text(desc) for desc in job_descriptions]
  predictions = best_model.predict(processed_descriptions)
  reverse_label_mapping = {v: k for k, v in label_mapping.items()}
  return [reverse_label_mapping[prediction] for prediction in predictions]  

# Example usage
job_descriptions = [
"Manage university financial reports and budget forecasting.",
"Assist in organizing office files and managing schedules.",
"To plan, manage and deliver student recruitment events based on-campus and virtually..",
"	To monitor, review and report on the impact and effectiveness of all student recruitment events…",
"The Director of PMO & Strategic Change will lead the delivery of the University’s strategic change portfolio and Programme Management Offices. The Director will work in partnership with senior leaders and key stakeholders across the institution to ensure the delivery of the strategic change programme that underpins delivery of the University’s strategic plan.",
"To contribute to continuing improvements to management and financial systems and to the maintenance of effective administration."

]
predicted_responses = predict_responses(job_descriptions)
for job_desc, response in zip(job_descriptions, predicted_responses):
  print(f'Job Description: {job_desc}\nPredicted Response: {response}\n')

Job Description: Manage university financial reports and budget forecasting.
Predicted Response: Response C

Job Description: Assist in organizing office files and managing schedules.
Predicted Response: Response C

Job Description: To plan, manage and deliver student recruitment events based on-campus and virtually..
Predicted Response: Response C

Job Description: 	To monitor, review and report on the impact and effectiveness of all student recruitment events…
Predicted Response: Response B

Job Description: The Director of PMO & Strategic Change will lead the delivery of the University’s strategic change portfolio and Programme Management Offices. The Director will work in partnership with senior leaders and key stakeholders across the institution to ensure the delivery of the strategic change programme that underpins delivery of the University’s strategic plan.
Predicted Response: Response A

Job Description: To contribute to continuing improvements to management and financial syst