In [1]:
###Getting synonyms for words related to each category: work-life balance, culture and values, career opportunities, compensation and benefits, senior management
import nltk
from nltk.corpus import wordnet
from gensim.models import KeyedVectors

# Make sure to download NLTK resources if not already installed
nltk.download('wordnet')
nltk.download('omw-1.4')

# Define initial words for each factor
factor_words = {
    "work_life_balance": [
        "work life", "working hour", "overtime", "working on the weekend", "balance",
        "private life", "family", "maternity", "paternity", "vacation", "annual leave",
        "refresh", "monthly leave", "pto", "paid time", "holiday", "schedule",
    ],
    "culture_values": [
        "culture", "values", "organizational culture", "firm culture", "atmosphere",
        "oppression", "friend", "free food", "free lunch", "politics", "stress", "cowork"
    ],
    "career_opportunities": [
        "career opportunity", "advancement", "career growth", "promotion", "career development",
        "development program", "train", "skills"
    ],
    "compensation_benefits": [
        "compensation", "benefits", "pay", "remuneration", "wage", "money", "salary", "401k",
        "fund", "package", "reimbursement", "tuition", "insurance", "lay off", "fire"
    ],
    "senior_management": [
        "management", "leadership", "upper management", "team leader", "executives", "CEO"
    ]
}

!pip install gensim

import gensim.downloader as api

# Download the pre-trained model
word_vectors = api.load('word2vec-google-news-300')

# Save the model locally
word_vectors.save("word2vec.model")

word_vectors = KeyedVectors.load("word2vec.model")

# Function to get synonyms using WordNet
def get_wordnet_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())  # Add synonym lemma to set
    return synonyms

# Function to get similar words using Word2Vec
def get_word2vec_similar_words(word, topn=10):
    try:
        return word_vectors.most_similar(word, topn=topn)
    except KeyError:
        # If the word isn't in the Word2Vec vocabulary, return an empty list
        return []

# Function to expand factor keywords using both WordNet and Word2Vec
def expand_keywords(factor_words):
    expanded_words = {}
    for category, words in factor_words.items():
        expanded_words[category] = set()

        for word in words:
            # Get WordNet synonyms for the word
            wordnet_synonyms = get_wordnet_synonyms(word)
            expanded_words[category].update(wordnet_synonyms)

            # Get Word2Vec similar words for the word
            word2vec_similar_words = get_word2vec_similar_words(word)
            for similar_word, _ in word2vec_similar_words:
                expanded_words[category].add(similar_word)

        # Convert to list and remove duplicates (set already removes duplicates)
        expanded_words[category] = list(expanded_words[category])

    return expanded_words

# Expand the keywords for all factors
expanded_factor_keywords = expand_keywords(factor_words)

# Print the expanded keywords for each factor
for category, words in expanded_factor_keywords.items():
    print(f"{category}: {words[:10]}...")  # Printing the first 10 words for brevity


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


work_life_balance: ['home', 'folk', 'vacation', 'kinsfolk', 'symmetricalness', 'midwifery', 'paternity_suit', 'poise', 'vacations', 'PTO']...
culture_values: ['cultural_milieu', 'cultivation', 'esteem', 'vibe', 'buddy', 'ideals', 'atmospheres', 'tension', 'ethos', 'civilization']...
career_opportunities: ['promotion', 'trail', 'take_aim', 'acquirement', 'atomically_precise_manufacturing', 'locomotive', 'Nuria_Ortiz', 'bus', 'attainment', 'promotions']...
compensation_benefits: ['benefit', 'advantages', 'signing_bonus', 'investment_firm', 'Fuwei_BOPET_film', 'engage', 'http://www.viropharma.com/docs/pulvules_pi.pdf', 'Accrued_liabilities_Compensation', 'money', 'reimburses']...
senior_management: ['Chief_Operating_Officer', 'manangement', 'chief_operating_officer', 'executive', 'Execs', 'managementand', 'bankers', 'Managing_Director', 'corporate_chieftains', 'managment']...


In [2]:
for category, words in expanded_factor_keywords.items():
    print(f"{category}: {words}...")

work_life_balance: ['home', 'folk', 'vacation', 'kinsfolk', 'symmetricalness', 'midwifery', 'paternity_suit', 'poise', 'vacations', 'PTO', 'docket', 'nonconference_schedule', 'mother', 'shedule', 'pregnancy', 'syndicate', 'proportion', 'itinerary', 'class', 'mower_conditioner', 'Zyprexa_Cymbalta_Humalog', 'rest', 'house', 'crime_syndicate', 'rejuvenate', 'refresh', 'Paternity', 'paternity', 'unappropriated_fund', 'Olshey_joked', 'equilibrium', 'honeymoon', 'review', 'counterweight', 'gestation', 'updating', 'refreshen', 'obstetrics', 'Garrett_Hartley_readies', 'balancing', 'ballscrews', 'menage', 'mob', 'sked', 'Xmas', 'nonconference_slate', 'relatives', 'maternity', 'Maternity', 'correspondence', 'balance_wheel', 'Watch_6News', 'remainder', 'family', 'BROOKLYN_MICH', 'delicate_balance', 'equipoise', 'OT', 'Boeing_machinist_earns', 'tweak', 'spousal', 'parentage', 'powershift_transmission', 'counterpoise', 'holiday', 'electro_hydraulic', 'variator', 'holidays', 'siblings', 'maternal_qu

In [3]:
# Function to get synonyms for individual words
def get_wordnet_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())  # Add synonym lemma to set
    return synonyms

# Function to handle multi-word phrases
def get_phrase_synonyms(phrase):
    words = phrase.split()  # Split phrase into individual words
    phrase_synonyms = set()
    for word in words:
        phrase_synonyms.update(get_wordnet_synonyms(word))  # Get synonyms for each word
    return phrase_synonyms

for category, words in factor_words.items():
    print(f"Synonyms for category '{category}':")
    for word in words:
        synonyms = get_phrase_synonyms(word)
        print(f"  {word}: {synonyms}")
    print("\n")

Synonyms for category 'work_life_balance':
  work life: {'put_to_work', 'exercise', 'exploit', 'life_story', 'mold', 'crop', 'biography', 'ferment', 'work', 'knead', 'bring', 'play', 'lick', 'life', 'do_work', 'mould', 'work_on', 'solve', 'sprightliness', 'employment', 'run', 'form', 'lifetime', 'body_of_work', 'act', 'function', 'life-time', 'make', 'wreak', 'puzzle_out', 'turn', 'process', 'shape', 'forge', 'sour', 'liveliness', 'cultivate', 'work_out', 'aliveness', 'life_sentence', 'spirit', 'piece_of_work', 'life_history', 'make_for', 'animation', 'influence', 'go', 'figure_out', 'living', 'act_upon', 'lifespan', 'study', 'oeuvre', 'operate', 'workplace'}
  working hour: {'put_to_work', 'exercise', 'exploit', 'mold', 'crop', 'hr', 'ferment', 'work', 'knead', 'bring', 'play', 'lick', 'workings', 'do_work', 'mould', 'work_on', 'solve', 'run', 'running', 'form', 'act', 'function', 'working', 'make', 'wreak', 'puzzle_out', 'turn', 'minute', 'on_the_job', 'process', 'shape', '60_minutes

In [1]:

from collections import Counter

# Define your categories
categories = {
    "work_life_balance": [
        'work life', 'fellowship', 'holidays', 'overtime', 'schedule', 'honeymoon',
        'maternity',  'home',    'festive', 'remote',
        'pregnancy',  'motherhood',   'pto','wlb', 'flexible', 'work-life',
          'vacay',   'power_takeoff',  'wfh',
        'Paternity', 'household', 'vacation', 'children',
         'getaway',  'private_life', 'weekend','hybrid'
    ],
    "culture_values": [
        'culture',  'oppress', 'anxiety', 'Stress', 'tenseness',
            'principles', 'colleagues',
             'civilisation',
         'subjugation', 'focus',  'bureaucracy'
        'atmoshpere',     'politics', 'family',
          'time_value',
         'vibe',  'morals',  'coaching','mentor'
         'traditions',  'diversity',
        'ambiance', 'injustice', 'persecution', 'free_food', 'environment',
        'free_lunch', 'party',  'communication',  'training', 'community'
    ],
    "career_opportunities": [
        'opportunities', 'groom',
         'publicity',  'development',
        'educate', 'promotion', 'skill',  'growth',
        'program', 'technology','learn','network'
    ],
    "compensation_benefits": [
        'compensation', 'benefits',  'wages',
         'tuition',  'Salary', 'stipend',
        'pay',   'layoff',
        'vast_sums',  'Monies',  'fund',
        'package', 'insurance', 'savings', 'allowance',
        'reimbursement', 'severance',  'bonus', 'reimbursed',
        'remunerations',  'Money', 'raise', 'give_the_axe', 'increment',
         'fire', 'hike', 'medical','perk', 'subsidy', 'pension',
        'Medicaid', 'investment_trust', 'fuel',  '401K'
    ],
    "senior_management": [
         'mgmt',
          'leader',
         'CFO',  'officer',
         'manage',   'COO', 'Execs',
        'organizational_structure',   'CEOs',
        'Chairman', 'director'
    ]
}

# Convert all words to lowercase and deduplicate within each category
for category, words in categories.items():
    categories[category] = list(set([word.lower() for word in words]))

# Check for duplicates across categories
all_words = [word for words in categories.values() for word in words]
duplicates = [word for word, count in Counter(all_words).items() if count > 1]

# Print results
print("Updated Categories with Lowercase Words and Deduplication:")
for category, words in categories.items():
    print(f"{category}: {sorted(words)}\n")

print("Duplicates Across Categories:")
print(duplicates)

Updated Categories with Lowercase Words and Deduplication:
work_life_balance: ['children', 'fellowship', 'festive', 'flexible', 'getaway', 'holidays', 'home', 'honeymoon', 'household', 'hybrid', 'maternity', 'motherhood', 'overtime', 'paternity', 'power_takeoff', 'pregnancy', 'private_life', 'pto', 'remote', 'schedule', 'vacation', 'vacay', 'weekend', 'wfh', 'wlb', 'work life', 'work-life']

culture_values: ['ambiance', 'anxiety', 'bureaucracyatmoshpere', 'civilisation', 'coaching', 'colleagues', 'communication', 'community', 'culture', 'diversity', 'environment', 'family', 'focus', 'free_food', 'free_lunch', 'injustice', 'mentortraditions', 'morals', 'oppress', 'party', 'persecution', 'politics', 'principles', 'stress', 'subjugation', 'tenseness', 'time_value', 'training', 'vibe']

career_opportunities: ['development', 'educate', 'groom', 'growth', 'learn', 'network', 'opportunities', 'program', 'promotion', 'publicity', 'skill', 'technology']

compensation_benefits: ['401k', 'allowan

In [2]:
# prompt: read this file /content/glassdoor_reviews_COPY_FULL.xlsx

import pandas as pd
import nltk
from nltk.corpus import wordnet
from gensim.models import KeyedVectors
import gensim.downloader as api
from collections import Counter

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:

import pandas as pd
df = pd.read_excel('/content/glassdoor_reviews_COPY_FULL.xlsx')
df.head()


Unnamed: 0,Review ID,Company Name,Overall Rating,Header,Job Title,Employment Status,Employment Tenure,Location,Recommend,CEO Approval,Business Outlook,Pros,Cons,Review Date
0,1,Deloitte,3,Good learning,Senior manager,Current employee,more than 10 years,Mumbai,Yes,Yes,Yes,Lot of learning and to work on latest technolo...,Work life fit and bureaucracy,"Oct 28, 2024"
1,2,Deloitte,3,Work environment,Senior tax manager,Current employee,more than 10 years,Portugal,No,No Data,No,Acess to information and a lot of learning cou...,Environment very competitive between colleague...,"Oct 28, 2024"
2,3,Deloitte,4,Good work culture,Analyst,Current employee,less than 1 year,Hyderābād,Yes,Yes,Yes,Flexi and nice people communication,Less salary and also growth is slow,"Oct 28, 2024"
3,4,Deloitte,4,Good place,Senior developer,Current employee,,Bengaluru,No Data,No Data,No Data,Good working culture in the office,Less salary and less increments,"Oct 28, 2024"
4,5,Deloitte,3,Life at deloitte,Tax consultant,Former employee,,Pune,No Data,No Data,No Data,"Great working environment, Good mentoring",Too many working hours and no extra payment,"Oct 28, 2024"


In [4]:

# Combine "Pros" and "Cons" with appropriate labels
df["review_text"] = (
     df["Pros"].fillna("").str.lower().str.replace("page", "", case=False).str.strip() +
    " cons: " + df["Cons"].fillna("").str.lower().str.replace("page", "", case=False).str.strip()
)

# Display the result
print(df[["review_text"]])

                                             review_text
0      lot of learning and to work on latest technolo...
1      acess to information and a lot of learning cou...
2      flexi and nice people communication cons: less...
3      good working culture in the office cons: less ...
4      great working environment, good mentoring cons...
...                                                  ...
99424  * opportunities to work with reputed clients. ...
99425  competitive environment great l&d process good...
99426  good team, excellent clients, interesting work...
99427  flexible timing & approach. employee friendly ...
99428  - professional working environment. - provided...

[99429 rows x 1 columns]


In [5]:
# Ensure all rows of the column are fully visible
pd.set_option('display.max_colwidth', None)

# Print the first 5 rows of the "review_text" column
print(df["review_text"].head(5))


0                        lot of learning and to work on latest technologies cons: work life fit and bureaucracy
1    acess to information and a lot of learning courses. cons: environment very competitive between colleagues.
2                                 flexi and nice people communication cons: less salary and also growth is slow
3                                      good working culture in the office cons: less salary and less increments
4                   great working environment, good mentoring cons: too many working hours and no extra payment
Name: review_text, dtype: object


In [6]:
df.head()

Unnamed: 0,Review ID,Company Name,Overall Rating,Header,Job Title,Employment Status,Employment Tenure,Location,Recommend,CEO Approval,Business Outlook,Pros,Cons,Review Date,review_text
0,1,Deloitte,3,Good learning,Senior manager,Current employee,more than 10 years,Mumbai,Yes,Yes,Yes,Lot of learning and to work on latest technologies,Work life fit and bureaucracy,"Oct 28, 2024",lot of learning and to work on latest technologies cons: work life fit and bureaucracy
1,2,Deloitte,3,Work environment,Senior tax manager,Current employee,more than 10 years,Portugal,No,No Data,No,Acess to information and a lot of learning courses.,Environment very competitive between colleagues.,"Oct 28, 2024",acess to information and a lot of learning courses. cons: environment very competitive between colleagues.
2,3,Deloitte,4,Good work culture,Analyst,Current employee,less than 1 year,Hyderābād,Yes,Yes,Yes,Flexi and nice people communication,Less salary and also growth is slow,"Oct 28, 2024",flexi and nice people communication cons: less salary and also growth is slow
3,4,Deloitte,4,Good place,Senior developer,Current employee,,Bengaluru,No Data,No Data,No Data,Good working culture in the office,Less salary and less increments,"Oct 28, 2024",good working culture in the office cons: less salary and less increments
4,5,Deloitte,3,Life at deloitte,Tax consultant,Former employee,,Pune,No Data,No Data,No Data,"Great working environment, Good mentoring",Too many working hours and no extra payment,"Oct 28, 2024","great working environment, good mentoring cons: too many working hours and no extra payment"


In [7]:
print(categories)

{'work_life_balance': ['household', 'schedule', 'honeymoon', 'private_life', 'fellowship', 'festive', 'vacation', 'pregnancy', 'weekend', 'work life', 'getaway', 'pto', 'home', 'holidays', 'paternity', 'flexible', 'overtime', 'power_takeoff', 'vacay', 'wlb', 'maternity', 'children', 'hybrid', 'work-life', 'motherhood', 'remote', 'wfh'], 'culture_values': ['free_lunch', 'coaching', 'morals', 'community', 'party', 'stress', 'ambiance', 'politics', 'oppress', 'focus', 'principles', 'time_value', 'bureaucracyatmoshpere', 'colleagues', 'mentortraditions', 'diversity', 'injustice', 'family', 'training', 'tenseness', 'environment', 'communication', 'subjugation', 'vibe', 'culture', 'civilisation', 'anxiety', 'persecution', 'free_food'], 'career_opportunities': ['publicity', 'groom', 'learn', 'development', 'promotion', 'technology', 'network', 'skill', 'opportunities', 'program', 'growth', 'educate'], 'compensation_benefits': ['layoff', 'give_the_axe', 'tuition', 'fund', '401k', 'compensation

In [8]:
# Replace underscores with spaces in categories for matching
categories = {
    key: [phrase.replace("_", " ") for phrase in phrases]
    for key, phrases in categories.items()
}
print(categories)

{'work_life_balance': ['household', 'schedule', 'honeymoon', 'private life', 'fellowship', 'festive', 'vacation', 'pregnancy', 'weekend', 'work life', 'getaway', 'pto', 'home', 'holidays', 'paternity', 'flexible', 'overtime', 'power takeoff', 'vacay', 'wlb', 'maternity', 'children', 'hybrid', 'work-life', 'motherhood', 'remote', 'wfh'], 'culture_values': ['free lunch', 'coaching', 'morals', 'community', 'party', 'stress', 'ambiance', 'politics', 'oppress', 'focus', 'principles', 'time value', 'bureaucracyatmoshpere', 'colleagues', 'mentortraditions', 'diversity', 'injustice', 'family', 'training', 'tenseness', 'environment', 'communication', 'subjugation', 'vibe', 'culture', 'civilisation', 'anxiety', 'persecution', 'free food'], 'career_opportunities': ['publicity', 'groom', 'learn', 'development', 'promotion', 'technology', 'network', 'skill', 'opportunities', 'program', 'growth', 'educate'], 'compensation_benefits': ['layoff', 'give the axe', 'tuition', 'fund', '401k', 'compensation

In [9]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [72]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

def check_phrase_in_sentence(sentence, phrase):
    # Process the sentence and the phrase
    doc_sentence = nlp(sentence.lower())
    doc_phrase = nlp(phrase.lower())

    # Lemmatize both sentence and phrase words and check for semantic similarity
    sentence_lemmas = [token.lemma_ for token in doc_sentence]
    phrase_lemmas = [token.lemma_ for token in doc_phrase]

    # Check if the entire phrase exists in the lemmatized sentence as a sequence
    sentence_text = " ".join(sentence_lemmas)
    phrase_text = " ".join(phrase_lemmas)

    return phrase_text in sentence_text
    # Check if the lemmatized phrase words exist in the lemmatized sentence
    for word in phrase_lemmas:
        if word not in sentence_lemmas:
            return False  # Word not found
    return True  # All words found (in some form)


In [13]:
!pip install rapidfuzz
import pandas as pd
from transformers import pipeline
from rapidfuzz import fuzz, process

Collecting rapidfuzz
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/3.1 MB[0m [31m36.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m2.7/3.1 MB[0m [31m36.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.10.1


In [87]:
# Function to compute sentiment scores
def calculate_category_sentiment(review, category_keywords):
    """
    Calculate sentiment scores for a review for specific category keywords.
    Uses fuzzy matching to find relevant text fragments and RoBERTa for sentiment scoring.
    """
    scores = []
    pro_con = review.split("cons:", 1)

    for keyword in category_keywords:
        #for pros
        result = check_phrase_in_sentence(pro_con[0], keyword)
        if result:
            scores.append(1) #polarity)
            print(f"PROS: Phrase : {keyword}, sentence: {review}")
        else:
          scores.append(0)
        #for cons
        result = check_phrase_in_sentence(pro_con[1], keyword)
        if result:
            scores.append(-1) #polarity)
            print(f"CONS: Phrase : {keyword}, sentence: {review}")
        else:
          scores.append(0)




    return sum(scores) if scores else 0  # Return the total polarity score


# Apply the function to the DataFrame
def score_reviews(df, categories):
    for category, keywords in categories.items():
        df[category + "_sentiment"] = df["review_text"][:2000].apply(
            lambda x: calculate_category_sentiment(x.lower(), keywords)
        )
    return df



# Calculate sentiment scores for each category
df = score_reviews(df, categories)

# Display the DataFrame with added sentiment columns
pd.set_option('display.max_columns', None)  # Ensure all columns are visible
#print(df)

CONS: Phrase : work life, sentence: lot of learning and to work on latest technologies cons: work life fit and bureaucracy
PROS: Phrase : wlb, sentence: generally, good people to work with. better wlb than most consulting firms. "networking" based staffing model (can be a pro or a con) cons: lots of bureaucracy. strategy projects have poor work life balance.
CONS: Phrase : work life, sentence: generally, good people to work with. better wlb than most consulting firms. "networking" based staffing model (can be a pro or a con) cons: lots of bureaucracy. strategy projects have poor work life balance.
CONS: Phrase : overtime, sentence: loved working here learn a lot cons: not heinz paid for overtime
CONS: Phrase : home, sentence: well established organization prestigous work place cons: long hours overtimes are often work from home not for beginners
CONS: Phrase : overtime, sentence: well established organization prestigous work place cons: long hours overtimes are often work from home not

In [90]:
df.head()

Unnamed: 0,Review ID,Company Name,Overall Rating,Header,Job Title,Employment Status,Employment Tenure,Location,Recommend,CEO Approval,Business Outlook,Pros,Cons,Review Date,review_text,work_life_balance_sentiment,culture_values_sentiment,career_opportunities_sentiment,compensation_benefits_sentiment,senior_management_sentiment
0,1,Deloitte,3,Good learning,Senior manager,Current employee,more than 10 years,Mumbai,Yes,Yes,Yes,Lot of learning and to work on latest technologies,Work life fit and bureaucracy,"Oct 28, 2024",lot of learning and to work on latest technologies cons: work life fit and bureaucracy,-1.0,0.0,2.0,0.0,0.0
1,2,Deloitte,3,Work environment,Senior tax manager,Current employee,more than 10 years,Portugal,No,No Data,No,Acess to information and a lot of learning courses.,Environment very competitive between colleagues.,"Oct 28, 2024",acess to information and a lot of learning courses. cons: environment very competitive between colleagues.,0.0,-2.0,1.0,0.0,0.0
2,3,Deloitte,4,Good work culture,Analyst,Current employee,less than 1 year,Hyderābād,Yes,Yes,Yes,Flexi and nice people communication,Less salary and also growth is slow,"Oct 28, 2024",flexi and nice people communication cons: less salary and also growth is slow,0.0,1.0,-1.0,-1.0,0.0
3,4,Deloitte,4,Good place,Senior developer,Current employee,,Bengaluru,No Data,No Data,No Data,Good working culture in the office,Less salary and less increments,"Oct 28, 2024",good working culture in the office cons: less salary and less increments,0.0,1.0,0.0,-2.0,0.0
4,5,Deloitte,3,Life at deloitte,Tax consultant,Former employee,,Pune,No Data,No Data,No Data,"Great working environment, Good mentoring",Too many working hours and no extra payment,"Oct 28, 2024","great working environment, good mentoring cons: too many working hours and no extra payment",0.0,1.0,0.0,-1.0,0.0


In [94]:
display(df.iloc[1000:1200, [2,4,5,14,15,16,17,18,19]] )

Unnamed: 0,Overall Rating,Job Title,Employment Status,review_text,work_life_balance_sentiment,culture_values_sentiment,career_opportunities_sentiment,compensation_benefits_sentiment,senior_management_sentiment
1000,3,Sr consultant,Former employee,pay was at that time for a starter. started as a sr consultant because of my technical background. cons: have to be involved in company engagements. those engagements often have nothing to do with your role unless you are lucky. not the ideal for a high performer computer engineer who focuses on his career and bring value to his team.,0.0,-1.0,0.0,1.0,0.0
1001,4,Consultant,Current employee,lots of learning opportunities with different types of project across different industries. exposure to new technologies cons: there can be long hours at times.,0.0,0.0,3.0,0.0,0.0
1002,4,Manager,Former employee,work flexibility. work from home. cons: working hours - depending on project,1.0,0.0,0.0,0.0,0.0
1003,3,Audit senior,Former employee,"very flexible working hours, no controls on that, even working for up to 6 weeks abroad. big clients. weekly free drinks. cons: the culture in the amsterdam office wasn't the nicest. that's why i eventually quit. big differentiation between expats and locals, basically two own groups.",1.0,-1.0,0.0,0.0,0.0
1004,4,Senior project associate,Current employee,"good people, learning and development. cons: slow progress. low salary. promotion system can be improved",0.0,0.0,1.0,-1.0,0.0
...,...,...,...,...,...,...,...,...,...
1195,5,Senior consultant,Current employee,lots of opportunities for personal and professional development. cons: unconventional working hours to fulfill customers needs,0.0,0.0,2.0,0.0,0.0
1196,5,Financial analyst,Current employee,good working environment. flexible working. cons: nothing as such to discuss.,1.0,1.0,0.0,0.0,0.0
1197,5,Analyst,Current employee,good environment supportive leadership good to start career cons: repetitive work. not much exposure,0.0,1.0,0.0,0.0,1.0
1198,4,Senior accountant,Former employee,worked here for a few years after finishing college and learned a lot cons: you work crazy hours and will be up late,0.0,0.0,1.0,0.0,0.0


In [92]:
display(
    (df[
    (df['work_life_balance_sentiment'] == 0) &
    (df['culture_values_sentiment'] == 0) &
    (df['career_opportunities_sentiment'] == 0)&
    (df['compensation_benefits_sentiment'] == 0)&
    (df['senior_management_sentiment'] == 0)
 ]).iloc[:, [14,15,16,17,18,19]]
)


Unnamed: 0,review_text,work_life_balance_sentiment,culture_values_sentiment,career_opportunities_sentiment,compensation_benefits_sentiment,senior_management_sentiment
5,"interesting clients, exposure to top field experts and partners. cons: results driven and high attrition rate. long hours.",0.0,0.0,0.0,0.0,0.0
11,"reputable, big name, good place to grow cons: very fast paced, feels like you are just a number at times",0.0,0.0,0.0,0.0,0.0
16,great people to work with cons: the workload could get heavy at times,0.0,0.0,0.0,0.0,0.0
44,good company to start off your career after grad cons: expected to work for long hours,0.0,0.0,0.0,0.0,0.0
54,good place to work in portugal cons: nothing much to add here,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1957,good work place at deloitte cons: nothing to mention here about it.,0.0,0.0,0.0,0.0,0.0
1958,nice most of the time cons: not the biggest cons here,0.0,0.0,0.0,0.0,0.0
1979,"everything is in balance, group is supportive too cons: as an intern you are not response for advance parts 215",0.0,0.0,0.0,0.0,0.0
1980,good people to work with and was good experience cons: work was challenging but good expereince,0.0,0.0,0.0,0.0,0.0


In [93]:
mydf = df.iloc[:2000, [2,4,5,14,15,16,17,18,19]]
# Write DataFrame to an Excel file
mydf.to_excel("combined_scores_2k.xlsx", index=False)

In [10]:
import spacy
from multiprocessing import Pool

# Load spaCy model once and disable unnecessary components
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

mydf = df[:1000].copy()
# Pre-lemmatize keywords
def lemmatize_keywords(keywords):
    return [" ".join([token.lemma_ for token in nlp(keyword.lower())]) for keyword in keywords]

# Optimized check_phrase_in_sentence
def check_phrase_in_sentence(sentence, lemmatized_keywords):
    # Process the sentence
    sentence_lemmas = " ".join([token.lemma_ for token in nlp(sentence.lower())])
    # Check if any lemmatized keyword exists in the sentence
    return any(keyword in sentence_lemmas for keyword in lemmatized_keywords)

# Optimized calculate_category_sentiment
def calculate_category_sentiment(review, category_keywords):
    try:
        pros, cons = review.split("cons:", 1)
    except ValueError:
        pros, cons = review, ""

    # Pre-lemmatize keywords
    lemmatized_keywords = lemmatize_keywords(category_keywords)

    # Check keywords in pros and cons
    score = sum(1 for kw in lemmatized_keywords if check_phrase_in_sentence(pros, [kw]))
    score -= sum(1 for kw in lemmatized_keywords if check_phrase_in_sentence(cons, [kw]))

    return score

# Pre-lemmatize keywords for all categories
preprocessed_categories = {cat: lemmatize_keywords(keywords) for cat, keywords in categories.items()}

# Define a helper for multiprocessing
def process_row(row):
    review = row["review_text"]
    return {cat + "_sentiment": calculate_category_sentiment(review, keywords)
            for cat, keywords in preprocessed_categories.items()}

# Use multiprocessing to process rows
with Pool() as pool:
    sentiment_scores = pool.map(process_row, mydf[:1000].to_dict("records"))

# Add sentiment scores to the DataFrame
for cat in categories.keys():
    mydf[cat + "_sentiment"] = [scores[cat + "_sentiment"] for scores in sentiment_scores]


In [11]:

display(mydf.iloc[:120, [14,15,16,17,18,19]] )

Unnamed: 0,review_text,work_life_balance_sentiment,culture_values_sentiment,career_opportunities_sentiment,compensation_benefits_sentiment,senior_management_sentiment
0,lot of learning and to work on latest technologies cons: work life fit and bureaucracy,-1,0,2,0,0
1,acess to information and a lot of learning courses. cons: environment very competitive between colleagues.,0,-2,1,0,0
2,flexi and nice people communication cons: less salary and also growth is slow,0,1,-1,-1,0
3,good working culture in the office cons: less salary and less increments,0,1,0,-2,0
4,"great working environment, good mentoring cons: too many working hours and no extra payment",0,1,0,-1,0
...,...,...,...,...,...,...
115,great training great cfe resource cons: office politics and senior management playing favorites,0,0,0,0,-1
116,good benefits decent holidays better work cons: long working hours lot of pressure,1,0,0,1,0
117,all good in terms of work life balance cons: sometime need to do overtime due to us timing,0,0,0,0,0
118,very mental health and well-being focussed cons: a lot of working from home,-1,1,0,0,0


In [12]:
!pip install dask[complete]


Collecting lz4>=4.3.2 (from dask[complete])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting dask-expr<1.2,>=1.1 (from dask[complete])
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
Collecting distributed==2024.10.0 (from dask[complete])
  Downloading distributed-2024.10.0-py3-none-any.whl.metadata (3.3 kB)
Collecting sortedcontainers>=2.0.5 (from distributed==2024.10.0->dask[complete])
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting tblib>=1.6.0 (from distributed==2024.10.0->dask[complete])
  Downloading tblib-3.0.0-py3-none-any.whl.metadata (25 kB)
Collecting zict>=3.0.0 (from distributed==2024.10.0->dask[complete])
  Downloading zict-3.0.0-py2.py3-none-any.whl.metadata (899 bytes)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
Collecting dask-expr<1.2,>=1.1 (from dask

In [13]:
import spacy
import dask.dataframe as dd
import pandas as pd

mydf = df[:1000].copy()

# Load spaCy model once and disable unnecessary components
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Pre-lemmatize keywords
def lemmatize_keywords(keywords):
    return [" ".join([token.lemma_ for token in nlp(keyword.lower())]) for keyword in keywords]

# Optimized check_phrase_in_sentence
def check_phrase_in_sentence(sentence, lemmatized_keywords):
    # Process the sentence
    sentence_lemmas = " ".join([token.lemma_ for token in nlp(sentence.lower())])
    # Check if any lemmatized keyword exists in the sentence
    return any(keyword in sentence_lemmas for keyword in lemmatized_keywords)

# Optimized calculate_category_sentiment
def calculate_category_sentiment(review, lemmatized_keywords):
    try:
        pros, cons = review.split("cons:", 1)
    except ValueError:
        pros, cons = review, ""

    # Check keywords in pros and cons
    score = sum(1 for kw in lemmatized_keywords if check_phrase_in_sentence(pros, [kw]))
    score -= sum(1 for kw in lemmatized_keywords if check_phrase_in_sentence(cons, [kw]))

    return score

# Function to process a single review safely
def process_review(row, preprocessed_categories):
    if pd.isna(row["review_text"]):  # Skip missing values
        return {cat + "_sentiment": 0 for cat in preprocessed_categories.keys()}

    review = row["review_text"]
    return {
        cat + "_sentiment": calculate_category_sentiment(review, lemmatized_keywords)
        for cat, lemmatized_keywords in preprocessed_categories.items()
    }

# Main function to score reviews with Dask
def score_reviews_dask(df, categories):
    # Pre-lemmatize keywords for all categories
    preprocessed_categories = {cat: lemmatize_keywords(keywords) for cat, keywords in categories.items()}

    # Define a wrapper function to pass preprocessed categories
    def process_wrapper(row):
        return process_review(row, preprocessed_categories)

    # Convert to Dask DataFrame
    ddf = dd.from_pandas(df, npartitions=4)

    # Apply the processing function with explicit metadata
    meta = {cat + "_sentiment": "int" for cat in categories.keys()}
    results = ddf.apply(process_wrapper, axis=1, meta=meta)

    # Compute results to ensure all columns are present
    computed_results = results.compute()

    # Debugging: Print the columns of the computed results
    print(f"Computed columns: {computed_results}")
'''
    # Merge results back into the original DataFrame
    for category in categories.keys():
        if category + "_sentiment" in computed_results.columns:
            df[category + "_sentiment"] = computed_results[category + "_sentiment"]
        else:
            print(f"Warning: {category + '_sentiment'} column is missing in the results.")
'''

# Score reviews
score_reviews_dask(mydf, categories)



Computed columns: 0       {'work_life_balance_sentiment': -1, 'culture_values_sentiment': 0, 'career_opportunities_sentiment': 2, 'compensation_benefits_sentiment': 0, 'senior_management_sentiment': 0}
1       {'work_life_balance_sentiment': 0, 'culture_values_sentiment': -2, 'career_opportunities_sentiment': 1, 'compensation_benefits_sentiment': 0, 'senior_management_sentiment': 0}
2      {'work_life_balance_sentiment': 0, 'culture_values_sentiment': 1, 'career_opportunities_sentiment': -1, 'compensation_benefits_sentiment': -1, 'senior_management_sentiment': 0}
3       {'work_life_balance_sentiment': 0, 'culture_values_sentiment': 1, 'career_opportunities_sentiment': 0, 'compensation_benefits_sentiment': -2, 'senior_management_sentiment': 0}
4       {'work_life_balance_sentiment': 0, 'culture_values_sentiment': 1, 'career_opportunities_sentiment': 0, 'compensation_benefits_sentiment': -1, 'senior_management_sentiment': 0}
                                                              

In [39]:
mydf.columns


Index(['Review ID', 'Company Name', 'Overall Rating', 'Header', 'Job Title',
       'Employment Status', 'Employment Tenure', 'Location', 'Recommend',
       'CEO Approval', 'Business Outlook', 'Pros', 'Cons', 'Review Date',
       'review_text'],
      dtype='object')

In [23]:

display(mydf.iloc[:120, [14,15,16,17,18,19]] )

IndexError: positional indexers are out-of-bounds