# Employee Feedback Sentiment Analysis

In [1]:
!pip install transformers torch numpy pandas scikit-learn spacy textstat nltk

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.0 textstat-0.7.4


### Response sentiment analysis

In [2]:
from transformers import pipeline

# Load the sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

# Example feedback data
feedback_data = [
    "I really enjoy the team activities and the work environment.",
    "I feel like my work has a deep impact, but it's stressful.",
    "I'm content with the management, but I don't find my tasks very engaging.",
    "I really appreciate the new flexible work policies.",
    "The management needs to be more transparent about decisions.",
    "This is the worst work environment I've experienced.",
    "Great, thanks.",
    "As an employee of this esteemed organization, I am consistently inspired by our collective commitment to excellence and innovation. It's truly gratifying to witness how our team's diverse expertise and unwavering dedication to delivering exceptional results shape the trajectory of both the company and the industry at large. The collaborative culture we foster encourages the exchange of bold ideas, while maintaining a keen focus on continuous improvement and operational efficiency. I am proud to be part of a forward-thinking organization that not only values personal growth but also invests in creating meaningful and lasting impact in the communities we serve."
]

# Analyze sentiment
sentiment_results = [sentiment_analyzer(feedback) for feedback in feedback_data]

# Display sentiment results
for i, result in enumerate(sentiment_results):
    print(f"Feedback: {feedback_data[i]}")
    print(f"Sentiment: {result[0]['label']}, Score: {result[0]['score']}\n")

# Leave only numeric sentiment results
sentiment_results_clean = [result[0]['score'] for result in sentiment_results]
print(sentiment_results_clean)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



Feedback: I really enjoy the team activities and the work environment.
Sentiment: POSITIVE, Score: 0.9998564720153809

Feedback: I feel like my work has a deep impact, but it's stressful.
Sentiment: POSITIVE, Score: 0.9902501702308655

Feedback: I'm content with the management, but I don't find my tasks very engaging.
Sentiment: NEGATIVE, Score: 0.997974693775177

Feedback: I really appreciate the new flexible work policies.
Sentiment: POSITIVE, Score: 0.9998334646224976

Feedback: The management needs to be more transparent about decisions.
Sentiment: NEGATIVE, Score: 0.9981219172477722

Feedback: This is the worst work environment I've experienced.
Sentiment: NEGATIVE, Score: 0.9998093247413635

Feedback: Great, thanks.
Sentiment: POSITIVE, Score: 0.9998507499694824

Feedback: As an employee of this esteemed organization, I am consistently inspired by our collective commitment to excellence and innovation. It's truly gratifying to witness how our team's diverse expertise and unwaveri

### Embeddings Creation

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import KMeans
import numpy as np

# Load tokenizer and model for embeddings
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

# Function to get sentence embeddings
def get_embeddings(text_list):
    embeddings = []
    for text in text_list:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
        outputs = model(**inputs)
        # Take the mean of the last hidden state
        embedding = torch.mean(outputs.last_hidden_state, dim=1).detach().numpy()
        embeddings.append(embedding.flatten())
    return np.array(embeddings)

# Get embeddings for feedback data
feedback_embeddings = get_embeddings(feedback_data)

# Perform topic modeling with KMeans
num_topics = 3  # Define the number of topics you want to extract
kmeans = KMeans(n_clusters=num_topics, random_state=42)
kmeans.fit(feedback_embeddings)

# Print topic assignments
for i, label in enumerate(kmeans.labels_):
    print(f"Feedback: {feedback_data[i]} -> Topic {label}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Feedback: I really enjoy the team activities and the work environment. -> Topic 2
Feedback: I feel like my work has a deep impact, but it's stressful. -> Topic 0
Feedback: I'm content with the management, but I don't find my tasks very engaging. -> Topic 0
Feedback: I really appreciate the new flexible work policies. -> Topic 2
Feedback: The management needs to be more transparent about decisions. -> Topic 0
Feedback: This is the worst work environment I've experienced. -> Topic 0
Feedback: Great, thanks. -> Topic 1
Feedback: As an employee of this esteemed organization, I am consistently inspired by our collective commitment to excellence and innovation. It's truly gratifying to witness how our team's diverse expertise and unwavering dedication to delivering exceptional results shape the trajectory of both the company and the industry at large. The collaborative culture we foster encourages the exchange of bold ideas, while maintaining a keen focus on continuous improvement and operat

### Conductivity score

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Predefined meaningful feedback for comparison (templates)
meaningful_feedback_templates = [
    "The policies in place are effective and beneficial for the team.",
    "Management should improve transparency for better team morale.",
    "The work environment is supportive and collaborative.",
    "The CEO's leadership is generally effective, though there are areas where more employee engagement could be beneficial.",
    "I feel reasonably satisfied with my job, although there are some aspects that could be improved.",
    "My work has a purpose, though it's not always clear how my contributions fit into the bigger picture.",
    "I feel okay at work most of the time, though there are both positive and challenging days.",
    "There are some stressful aspects to my job, but I am generally able to manage them effectively.",
    "My compensation seems fair for the most part, though there could be adjustments to better reflect my responsibilities.",
    "I feel that some people at work are supportive, though it would be helpful to have a stronger network of encouragement.",
    "I feel appreciated by some colleagues, though not everyone may recognize my contributions.",
    "I feel I can trust some of my coworkers, though trust could be stronger across the entire team.",
    "I feel somewhat connected to my company, but there’s room for building a stronger sense of belonging.",
    "My manager provides some support, though there could be more consistent guidance to help me succeed.",
    "The work environment is generally inclusive, though there are occasional gaps in ensuring everyone feels respected.",
    "My job has some flexibility in terms of time and location, though additional options could improve my work-life balance.",
    "I feel neutral about most of my work tasks; some are energizing, while others are just routine.",
    "I am making progress towards my goals, though there are some areas where additional support could help.",
    "There are occasional learning opportunities, though they’re not as frequent as I would like.",

    "The CEO demonstrates a strong commitment to both company success and employee well-being, which is something I wholeheartedly support.",
    "I am highly satisfied with my job as it offers a balance of challenging tasks and personal fulfillment.",
    "My work aligns with a clear purpose, which motivates me to give my best each day.",
    "I generally feel positive and content at work, which greatly enhances my productivity and overall outlook.",
    "High stress levels are a frequent part of my job, which can sometimes feel overwhelming and impact my work-life balance.",
    "I believe my compensation is fair and aligns with the value I bring to the company.",
    "There is a strong support network at my workplace, with colleagues and leaders who provide encouragement when needed.",
    "I feel genuinely valued by my coworkers, who see and appreciate me as a person beyond just my role.",
    "Trust is an important aspect of my workplace, and I feel comfortable relying on my colleagues and leaders.",
    "There is a strong sense of belonging in my company, which makes me feel connected and valued as part of the team.",
    "My manager is actively invested in my success, providing guidance and resources to help me excel.",
    "Our work environment is inclusive and respectful, making it a safe space for all team members to contribute.",
    "My role offers the flexibility in timing and location that I need to manage both work and personal commitments effectively.",
    "Most of my tasks are engaging and give me a boost of energy, which helps me stay motivated and productive.",
    "I am making good progress towards achieving my goals, which keeps me focused and driven at work.",
    "There is always an opportunity to learn and grow in my role, which adds value to my professional development.",

    "The CEO's actions often seem disconnected from employee needs, which makes it difficult for me to fully support their leadership.",
    "I am not fully satisfied with my job, as it lacks the fulfillment and growth opportunities I am seeking.",
    "My work often feels purposeless, making it hard to stay motivated and engaged.",
    "I rarely feel genuinely happy at work, which affects my overall attitude and productivity.",
    "I frequently feel stressed at work, and this high level of pressure impacts my well-being.",
    "I feel that I am underpaid for the work I do, which leaves me feeling undervalued.",
    "Support and encouragement from colleagues are lacking, and I often feel isolated in my role.",
    "I don't feel appreciated as a person, and my contributions often go unnoticed.",
    "Trust among my colleagues is low, making it difficult to rely on others within the company.",
    "I often feel disconnected from my team, which makes it hard to establish a sense of belonging.",
    "My manager does not provide the support I need to succeed, which limits my potential for growth.",
    "The work environment lacks inclusivity, and respect for different backgrounds is not consistently practiced.",
    "I struggle with the lack of flexibility in my job, which makes it hard to balance my work and personal life.",
    "Many of my tasks drain my energy, and I rarely feel enthusiastic about my daily responsibilities.",
    "I am not achieving most of my goals at work, which makes me feel stagnant in my role.",
    "Opportunities to learn are infrequent, and I rarely feel that I'm growing professionally in this position."
]
template_embeddings = get_embeddings(meaningful_feedback_templates)

# Function to calculate meaningfulness score
def conductivity_score(feedback_embeddings, template_embeddings):
    scores = []
    for feedback in feedback_embeddings:
        # Calculate similarity to the templates and average the score
        similarities = cosine_similarity([feedback], template_embeddings).flatten()
        meaningfulness = similarities.mean()
        scores.append(meaningfulness)
    return scores

# Calculate meaningfulness scores for feedback
meaningfulness_scores = conductivity_score(feedback_embeddings, template_embeddings)

for (score, feedback) in zip(meaningfulness_scores, feedback_data):
    print(f"Feedback: {feedback}")
    print(f"Meaninfulness Score: {score:.2f}\n")

Feedback: I really enjoy the team activities and the work environment.
Meaninfulness Score: 0.78

Feedback: I feel like my work has a deep impact, but it's stressful.
Meaninfulness Score: 0.82

Feedback: I'm content with the management, but I don't find my tasks very engaging.
Meaninfulness Score: 0.82

Feedback: I really appreciate the new flexible work policies.
Meaninfulness Score: 0.79

Feedback: The management needs to be more transparent about decisions.
Meaninfulness Score: 0.76

Feedback: This is the worst work environment I've experienced.
Meaninfulness Score: 0.76

Feedback: Great, thanks.
Meaninfulness Score: 0.57

Feedback: As an employee of this esteemed organization, I am consistently inspired by our collective commitment to excellence and innovation. It's truly gratifying to witness how our team's diverse expertise and unwavering dedication to delivering exceptional results shape the trajectory of both the company and the industry at large. The collaborative culture we f

### Complexity Analysis

In [6]:
import nltk
nltk.download('punkt')  # Download the punkt tokenizer model
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

import spacy
import textstat
from transformers import pipeline
from nltk import word_tokenize, pos_tag, FreqDist
from nltk.corpus import stopwords
import math

# Initialize Spacy model for syntactic analysis
nlp = spacy.load('en_core_web_sm')

# Use HuggingFace's transformers pipeline for NER or sentiment analysis
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Lexical Richness: Type-Token Ratio
def type_token_ratio(text):
    tokens = word_tokenize(text.lower())
    types = set(tokens)
    return len(types) / len(tokens) if len(tokens) > 0 else 0

# Lexical Density: Content words vs function words
def lexical_density(text):
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text)
    content_words = [word for word in tokens if word not in stop_words]
    return len(content_words) / len(tokens) if len(tokens) > 0 else 0

# Semantic Depth: Using NER to check for complex terms or entities
def semantic_depth(text):
    ner_results = ner_pipeline(text)
    return len(ner_results)  # Count of named entities

# Syntactic Complexity: Average sentence length and parse tree depth
def syntactic_complexity(text):
    doc = nlp(text)
    sentence_lengths = [len(sentence) for sentence in doc.sents]
    avg_sentence_length = sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0

    # Measure syntactic depth by counting the maximum depth of syntactic trees
    max_depth = max([len(list(token.subtree)) for token in doc])  # Maximum depth of dependency tree
    return avg_sentence_length, max_depth

# Combine all factors into a sophistication score
def sophistication_score(text):
    # Lexical richness
    ttr = type_token_ratio(text)
    lexical_d = lexical_density(text)

    # Semantic depth
    sem_depth = semantic_depth(text)

    # Syntactic complexity
    avg_sentence_length, syntactic_depth = syntactic_complexity(text)

    # Calculate final score (normalized for simplicity)
    score = (ttr * 0.25 + lexical_d * 0.25 + sem_depth * 0.25 + (avg_sentence_length + syntactic_depth) * 0.25)
    score = max(0, min(100, score))  # Keep it within range [0, 100]

    return score

complexity_scores = [sophistication_score(feedback) for feedback in feedback_data]

# Example of integrating text complexity into analysis
for (score, feedback) in zip(complexity_scores, feedback_data):
    print(f"Feedback: {feedback}")
    print(f"Complexity Score: {score:.2f}\n")

normal_complexity = (complexity_scores - np.min(complexity_scores)) / (np.max(complexity_scores) - np.min(complexity_scores) + 1e-5)
print(normal_complexity)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

Feedback: I really enjoy the team activities and the work environment.
Complexity Score: 5.91

Feedback: I feel like my work has a deep impact, but it's stressful.
Complexity Score: 7.92

Feedback: I'm content with the management, but I don't find my tasks very engaging.
Complexity Score: 8.90

Feedback: I really appreciate the new flexible work policies.
Complexity Score: 4.97

Feedback: The management needs to be more transparent about decisions.
Complexity Score: 5.40

Feedback: This is the worst work environment I've experienced.
Complexity Score: 5.45

Feedback: Great, thanks.
Complexity Score: 2.50

Feedback: As an employee of this esteemed organization, I am consistently inspired by our collective commitment to excellence and innovation. It's truly gratifying to witness how our team's diverse expertise and unwavering dedication to delivering exceptional results shape the trajectory of both the company and the industry at large. The collaborative culture we foster encourages the 

In [7]:
# import textstat

# # Function to add text complexity check
# def complexity_analysis(feedback):
#     return textstat.flesch_reading_ease(feedback)

# # Calculate text complexity scores
# complexity_scores = [complexity_analysis(feedback) for feedback in feedback_data]

# # Example of integrating text complexity into analysis
# for (score, feedback) in zip(complexity_scores, feedback_data):
#     print(f"Feedback: {feedback}")
#     print(f"Complexity Score: {score:.2f}\n")

### Weighted Scoring

In [11]:
def weighted_score(sentiment_results, conductivity_scores, complexity_scores, alpha=0.5, beta=0.3, gamma=0.2):
    """
    Calculates a weighted score using sentiment, conductivity, and complexity scores.

    Parameters:
    - sentiment_results: List of sentiment analysis results (dicts).
    - conductivity_scores: List of pre-calculated conductivity scores (floats).
    - complexity_scores: List of pre-calculated complexity scores (floats).
    - alpha, beta, gamma: Weights for sentiment, conductivity, and complexity, respectively.

    Returns:
    - List of combined weighted scores (floats).
    """
    # Normalize conductivity and complexity scores to a range of 0 to 1
    norm_conductivity_scores = (conductivity_scores - np.min(conductivity_scores)) / (np.max(conductivity_scores) - np.min(conductivity_scores) + 1e-5)
    norm_complexity_scores = (complexity_scores - np.min(complexity_scores)) / (np.max(complexity_scores) - np.min(complexity_scores) + 1e-5)

    # # Invert complexity scores so that higher complexity yields a higher contribution
    # inverted_complexity_scores = 1 - norm_complexity_scores

    combined_scores = []
    for i, result in enumerate(sentiment_results):
        sentiment_label = result[0]['label']
        sentiment_score = result[0]['score']

        # Adjust sentiment score based on label
        if sentiment_label == 'POSITIVE':
            sentiment_weighted = sentiment_score  # Positive score as is
        elif sentiment_label == 'NEGATIVE':
            sentiment_weighted = 1 - sentiment_score  # Negative score inverted
        else:
            sentiment_weighted = sentiment_score * 0.5  # Neutral (less impactful)

        # Ensure the sentiment score is scaled between 0 and 1
        sentiment_weighted = max(0, min(1, sentiment_weighted))

        # Calculate final weighted score using the provided formula
        final_score = (alpha * sentiment_weighted +
                       beta * norm_conductivity_scores[i] +
                       gamma * norm_complexity_scores[i])

        # Normalize the final score to ensure it stays between 0 and 1
        final_score = max(0, min(1, final_score))

        combined_scores.append(final_score)

    return combined_scores

# Call the function
weighted_scores = weighted_score(sentiment_results, meaningfulness_scores, complexity_scores, alpha = 0.1, beta=0.5, gamma=0.4)
for i, score in enumerate(weighted_scores):
    print(f"Feedback {i+1}: {feedback_data[i]}")
    print(f"Sensitivity Score (raw) = {sentiment_results[i][0]['label']}: {sentiment_results[i][0]['score']:.4f}")
    print(f"Conductivity Score (raw) = {meaningfulness_scores[i]:.2f}")
    print(f"Complexity Score (raw) = {complexity_scores[i]:.2f}")
    print(f"-----------------= RESULT =-----------------")
    print(f"Weighted Score (normalized) = {score:.2f}\n")

Feedback 1: I really enjoy the team activities and the work environment.
Sensitivity Score (raw) = POSITIVE: 0.9999
Conductivity Score (raw) = 0.78
Complexity Score (raw) = 5.91
-----------------= RESULT =-----------------
Weighted Score (normalized) = 0.63

Feedback 2: I feel like my work has a deep impact, but it's stressful.
Sensitivity Score (raw) = POSITIVE: 0.9903
Conductivity Score (raw) = 0.82
Complexity Score (raw) = 7.92
-----------------= RESULT =-----------------
Weighted Score (normalized) = 0.76

Feedback 3: I'm content with the management, but I don't find my tasks very engaging.
Sensitivity Score (raw) = NEGATIVE: 0.9980
Conductivity Score (raw) = 0.82
Complexity Score (raw) = 8.90
-----------------= RESULT =-----------------
Weighted Score (normalized) = 0.70

Feedback 4: I really appreciate the new flexible work policies.
Sensitivity Score (raw) = POSITIVE: 0.9998
Conductivity Score (raw) = 0.79
Complexity Score (raw) = 4.97
-----------------= RESULT =----------------

### Outlier Detection

In [12]:
def detect_outliers(weighted_scores, method="iqr", threshold=1.5):
    """
    Detects outliers based on the weighted scores using the specified method.

    Parameters:
    - weighted_scores: List of combined weighted scores (floats).
    - method: The method to use for outlier detection ('iqr' or 'z-score').
    - threshold: The threshold for outlier detection (default is 1.5 for IQR).

    Returns:
    - List of booleans indicating whether each score is an outlier (True) or not (False).
    """
    adjusted_scores = weighted_scores.copy()

    if method == "iqr":
        # Calculate the first and third quartile (Q1, Q3)
        Q1 = np.percentile(weighted_scores, 25)
        Q3 = np.percentile(weighted_scores, 75)
        IQR = Q3 - Q1

        # Define the lower and upper bounds for outliers
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        # Adjust outliers
        for i, score in enumerate(weighted_scores):
            if score < lower_bound:
                # Set to 0.1 if closer to the lower bound
                adjusted_scores[i] = 0.1
            elif score > upper_bound:
                # Set to 0.9 if closer to the upper bound
                adjusted_scores[i] = 0.9

    elif method == "z-score":
        # Calculate the mean and standard deviation
        mean = np.mean(weighted_scores)
        std_dev = np.std(weighted_scores)

        # Adjust outliers based on the z-score
        for i, score in enumerate(weighted_scores):
            z_score = abs((score - mean) / (std_dev + 1e-5))
            if z_score > threshold:
                # Check if the score is closer to 0.1 or 0.9 and adjust accordingly
                if score < mean:
                    adjusted_scores[i] = 0.1
                else:
                    adjusted_scores[i] = 0.9

    else:
        raise ValueError("Invalid method specified. Use 'iqr' or 'z-score'.")

    return adjusted_scores

# Display results
outliers = detect_outliers(weighted_scores)

for i, score in enumerate(outliers):
    print(f"Original Score: {weighted_scores[i]} - Adjusted Score: {score}")

Original Score: 0.6330691469021907 - Adjusted Score: 0.6330691469021907
Original Score: 0.7637617955164586 - Adjusted Score: 0.7637617955164586
Original Score: 0.697506448817217 - Adjusted Score: 0.697506448817217
Original Score: 0.6078516655755482 - Adjusted Score: 0.6078516655755482
Original Score: 0.459872359362268 - Adjusted Score: 0.459872359362268
Original Score: 0.4660499389023341 - Adjusted Score: 0.4660499389023341
Original Score: 0.09998507499694825 - Adjusted Score: 0.09998507499694825
Original Score: 0.8993017628719056 - Adjusted Score: 0.8993017628719056


### Separation into themes

In [13]:
# Load a pre-trained zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define the categories for classification
categories = [
    "Happiness: How enjoyable people find their day-to-day life at work",
    "Purpose: How meaningful people find their work",
    "Satisfaction: How content people feel with the way things are at work",
    "Stress-free: How manageable people find their work stress"
]

def categorize_feedback(feedback_texts):
    """
    Classifies feedback texts into predefined categories.

    Parameters:
    - feedback_texts: List of feedback texts to be classified.

    Returns:
    - List of dictionaries with category scores for each feedback.
    """
    categorized_results = []
    for feedback in feedback_texts:
        result = classifier(feedback, categories, multi_label=True)
        # Prepare a dictionary mapping each category to its score
        feedback_categories = dict(zip(result['labels'], result['scores']))
        categorized_results.append(feedback_categories)

    return categorized_results

# Example usage
feedback_texts = [
    "I really enjoy the team activities and the work environment.",
    "I feel like my work has a deep impact, but it's stressful.",
    "I'm content with the management, but I don't find my tasks very engaging.",
    "I really appreciate the new flexible work policies.",
    "The management needs to be more transparent about decisions.",
    "This is the worst work environment I've experienced.",
    "Great, thanks."
]

categorized_feedbacks = categorize_feedback(feedback_texts)

# Display categorized feedback
for i, feedback in enumerate(feedback_texts):
    print(f"Feedback: \"{feedback}\"")
    print("Category scores:")
    for category, score in categorized_feedbacks[i].items():
        print(f"  {category}: {score:.2f}")
    print()

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Feedback: "I really enjoy the team activities and the work environment."
Category scores:
  Satisfaction: How content people feel with the way things are at work: 0.99
  Happiness: How enjoyable people find their day-to-day life at work: 0.96
  Stress-free: How manageable people find their work stress: 0.47
  Purpose: How meaningful people find their work: 0.42

Feedback: "I feel like my work has a deep impact, but it's stressful."
Category scores:
  Purpose: How meaningful people find their work: 0.24
  Satisfaction: How content people feel with the way things are at work: 0.02
  Happiness: How enjoyable people find their day-to-day life at work: 0.01
  Stress-free: How manageable people find their work stress: 0.00

Feedback: "I'm content with the management, but I don't find my tasks very engaging."
Category scores:
  Satisfaction: How content people feel with the way things are at work: 0.98
  Happiness: How enjoyable people find their day-to-day life at work: 0.66
  Stress-free: H

In [17]:
import json
import numpy as np

def select_best_categories(categorized_feedbacks, feedback_texts, threshold_method='percentile', percentile=50, top_n=3):
    """
    Selects the best categories from categorized feedback based on dynamic thresholds
    and formats the output as a JSON object with cleaned category labels.

    Parameters:
    - categorized_feedbacks: List of dictionaries with category scores for each feedback.
    - feedback_texts: List of feedback text corresponding to the scores.
    - threshold_method: Method to determine the threshold ('percentile' or 'static').
    - percentile: The percentile threshold to use for selecting categories (for 'percentile' method).
    - top_n: The maximum number of categories to select for each feedback.

    Returns:
    - JSON formatted string with feedback and cleaned selected categories.
    """
    selected_feedback_data = []

    for i, feedback_categories in enumerate(categorized_feedbacks):
        # Get all the category scores
        category_scores = list(feedback_categories.values())

        if threshold_method == 'percentile':
            # Calculate the dynamic threshold based on the given percentile
            dynamic_threshold = np.percentile(category_scores, percentile)
            selected_categories = [category.split(':')[0].strip() for category, score in feedback_categories.items() if score >= dynamic_threshold]

        elif threshold_method == 'static':
            # Use a static threshold (e.g., 0.8)
            static_threshold = 0.8
            selected_categories = [category.split(':')[0].strip() for category, score in feedback_categories.items() if score >= static_threshold]

        if not selected_categories:
            # If no category meets the threshold, choose the top N categories based on the highest scores
            sorted_categories = sorted(feedback_categories.items(), key=lambda x: x[1], reverse=True)
            selected_categories = [category.split(':')[0].strip() for category, score in sorted_categories[:top_n]]

        # Append to the result list as a dictionary
        selected_feedback_data.append({
            "feedback": feedback_texts[i],
            "categories": selected_categories
        })

    # Convert to JSON format
    return json.dumps(selected_feedback_data, indent=2)

# # Get the JSON formatted result
# json_result = select_best_categories(categorized_feedbacks, feedback_data)
# print(json_result)

# Example with percentile thresholding
output = select_best_categories(categorized_feedbacks, feedback_data, threshold_method='percentile', percentile=60)
print("DYNAMIC PERCENTILE\n")
print(output)
print()

# Example with static threshold
output_static = select_best_categories(categorized_feedbacks, feedback_data, threshold_method='static')
print("STATIC PERCENTILE\n")
print(output_static)
print()

DYNAMIC PERCENTILE

[
  {
    "feedback": "I really enjoy the team activities and the work environment.",
    "categories": [
      "Satisfaction",
      "Happiness"
    ]
  },
  {
    "feedback": "I feel like my work has a deep impact, but it's stressful.",
    "categories": [
      "Purpose",
      "Satisfaction"
    ]
  },
  {
    "feedback": "I'm content with the management, but I don't find my tasks very engaging.",
    "categories": [
      "Satisfaction",
      "Happiness"
    ]
  },
  {
    "feedback": "I really appreciate the new flexible work policies.",
    "categories": [
      "Satisfaction",
      "Stress-free"
    ]
  },
  {
    "feedback": "The management needs to be more transparent about decisions.",
    "categories": [
      "Satisfaction",
      "Purpose"
    ]
  },
  {
    "feedback": "This is the worst work environment I've experienced.",
    "categories": [
      "Satisfaction",
      "Happiness"
    ]
  },
  {
    "feedback": "Great, thanks.",
    "categories": 