In [None]:
import os
import re
import gc  # Garbage Collector interface
import math
import nltk
import spacy
import numpy as np
import pandas as pd
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Fetch every NLTK resource the notebook relies on so that a fresh environment
# survives "Restart Kernel -> Run All":
#   - 'stopwords' feeds stopwords.words('english')
#   - 'punkt' / 'punkt_tab' back word_tokenize (newer NLTK releases look for
#     'punkt_tab'; an NLTK version that does not know an identifier just
#     reports it and carries on)
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

#### Establishment of Workforce Training Vocabulary Set

In [None]:
# Specify the folder path containing your text files
folder_path = "E:\\workforce\\workforce_training_ds"

# Initialize a dictionary to store filtered words for each file
filtered_words_by_file = {}

# English letters in the order a, A, b, B, ..., z, Z (kept as a list under its
# original name), plus a frozenset view for constant-time membership tests
eng_l = [letter for lower in 'abcdefghijklmnopqrstuvwxyz' for letter in (lower, lower.upper())]
eng_letters = frozenset(eng_l)

# The stop-word list does not depend on the file being read, so build it once
stop_words = set(stopwords.words('english'))

# Iterate through the files in the folder; sorting makes the dictionary order
# (and everything derived from it) reproducible across runs and machines
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, filename)
    # Open and read the file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenize and preprocess the text
    words = word_tokenize(text.lower())

    # Remove stopwords
    filtered_words = [word for word in words if word not in stop_words]
    print(len(filtered_words))

    # Keep only tokens that start with an English letter; this drops numbers,
    # punctuation tokens and symbols. word[:1] is safe for an empty token.
    wanted = [word for word in filtered_words if word[:1] in eng_letters]

    # Store the filtered words for the current file in the dictionary
    filtered_words_by_file[filename] = wanted

# Now you have a dictionary where each key is a filename, and the value is a list of filtered words for that file
for filename, words in filtered_words_by_file.items():
    print(f"File: {filename}")

In [None]:
# Dictionary holding the specific number of words you want for each file
top_n_by_file = {
    'eco1.txt': 30000,
    'eco2.txt': 30000,
    'eco3.txt': 30000,
    'eco4.txt': 30000,
    'eco5.txt': 30000
}

# Initialize a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Initialize a dictionary to store TF-IDF results for each file
tfidf_results = {}
words_with_positive_tfidf = {}

# Per-file frames of top words; they are concatenated once after the loop.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every iteration.)
top_word_frames = []

for filename, words in filtered_words_by_file.items():
    if not words:
        # An empty document has no vocabulary and would make the vectorizer raise
        print(f"Skipping {filename}: no words left after filtering")
        continue

    # Combine the filtered words into a single string
    document = " ".join(words)

    # Fit and transform the TF-IDF vectorizer on the document.
    # NOTE(review): the vectorizer is re-fitted on a single document each time,
    # so the IDF part is the same for every term and the ranking is driven by
    # term frequency only - confirm this is the intended weighting.
    tfidf_matrix = tfidf_vectorizer.fit_transform([document])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    # Sort words by their TF-IDF scores (words become the index, column 0 holds the score)
    sorted_tfidf_df = tfidf_df.T.sort_values(by=0, ascending=False)

    # Store the sorted TF-IDF DataFrame in the results dictionary
    tfidf_results[filename] = sorted_tfidf_df

    # Print the top N words with the highest TF-IDF scores
    top_n = top_n_by_file.get(filename, 2000)
    print(f"Top {top_n} words in {filename}:")
    print(sorted_tfidf_df.head(top_n))

    # Retrieve the top words
    top_words = sorted_tfidf_df.head(top_n).reset_index()['index']  # Only retrieve the words, not the scores

    # Convert the series to a DataFrame
    top_words_df = top_words.to_frame(name='Word')

    # Queue these words for the combined result
    top_word_frames.append(top_words_df)

# Build the combined vocabulary in one step; with no input files the result is
# an empty DataFrame
if top_word_frames:
    all_words_results = pd.concat(top_word_frames, ignore_index=True)
else:
    all_words_results = pd.DataFrame()

#### Approach 01

`The code tokenizes each transcript line and identifies the positions of any synonyms. It then counts the labor-related words that appear fewer than 10 token positions away from a synonym on the same line, and calculates the labor risk indicator as the ratio of matched labor words to the total word count in the file.`

In [None]:
# Load the labor-related vocabulary into a set, one entry per line.
# Surrounding whitespace is stripped and blank lines are dropped, because the
# transcript tokens they are compared against never contain whitespace.
# NOTE(review): if this CSV starts with a header row (a later cell reads a
# 'Word' column from E:\workforce\labor_list_words.csv), the header text also
# ends up in the set - confirm whether it should be skipped.
with open('labor_list_words.csv', 'r', encoding='utf-8') as file:
    labor_word_set = {line.strip() for line in file if line.strip()}

# Load synonyms, one per line, removing duplicates while preserving file order
# (the same normalisation the Approach 02 cell applies)
with open('synonyms.txt', 'r', encoding='utf-8') as file:
    synonyms = file.read().splitlines()
synonyms = list(dict.fromkeys(synonyms))

# Load the spaCy model with disabled components for efficiency
nlp = spacy.load('en_core_web_sm', disable=["tagger", "ner", "parser"])

def compute_prisk_for_file(filepath, synonyms, labor_word_set, window=10):
    """Compute the labor-risk ratio of a single transcript file.

    Every line is stripped of punctuation and split on whitespace. A labor word
    counts as "matched" when at least one synonym occurs fewer than ``window``
    token positions away from it on the same line. Lines are scored
    independently, so proximity never spans a line break.

    Synonym positions and labor-word positions are both taken from the same
    whitespace token list. Earlier, synonym positions were spaCy token indices
    while labor words were indexed through ``str.split``; those two
    tokenizations are not guaranteed to line up (spaCy can, for instance, emit
    extra whitespace tokens), so the distance test compared positions from
    different index spaces.

    Matching is case-sensitive, and removing punctuation fuses hyphenated
    words ("well-being" becomes "wellbeing").

    Args:
        filepath: Path of a UTF-8 encoded text file.
        synonyms: Iterable of synonym strings used as anchors.
        labor_word_set: Container of labor-related words (membership is tested
            with ``in``).
        window: A labor word matches when ``abs(distance) < window``.
            Defaults to 10.

    Returns:
        ``matched labor words / total words`` for the file, or 0 when the file
        contains no words.
    """
    # Constant-time synonym lookups, whatever container the caller passed in
    synonym_set = set(synonyms)

    matched_words_count = 0
    total_word_count = 0

    with open(filepath, 'r', encoding='utf-8') as file:
        # Process the file line by line
        for line in file:
            # Clean the text by removing punctuation
            clean_line = re.sub(r'[^\w\s]', '', line)

            # Tokenize the cleaned line
            words = clean_line.split()
            total_word_count += len(words)

            # Positions of synonyms, in the same index space as `words`
            positions = [i for i, word in enumerate(words) if word in synonym_set]
            if not positions:
                # Without an anchor on this line nothing can match
                continue

            # Count the labor words that sit close enough to any synonym
            matched_words_count += sum(
                1
                for i, word in enumerate(words)
                if word in labor_word_set
                and any(abs(i - pos) < window for pos in positions)
            )

    # Calculate the risk indicator
    if total_word_count == 0:
        return 0
    return matched_words_count / total_word_count

def process_year(year, root, synonyms, labor_word_set):
    """Score every ``.txt`` transcript of one year and save the scores as CSV.

    Transcripts are read from ``<root>/<year>``. Each file is scored with
    ``compute_prisk_for_file``, the score is echoed to stdout, and all
    (filename, score) pairs are written to ``Lrisk_<year>.csv`` in the current
    working directory.

    Args:
        year: Year whose sub-folder of ``root`` is processed.
        root: Directory holding one sub-folder per year.
        synonyms: Synonym strings forwarded to ``compute_prisk_for_file``.
        labor_word_set: Labor words forwarded to ``compute_prisk_for_file``.
    """
    year_dir = os.path.join(root, str(year))
    rows = []

    # Score the text transcripts; anything else in the folder is ignored
    for filename in os.listdir(year_dir):
        if not filename.endswith(".txt"):
            continue
        Lrisk = compute_prisk_for_file(os.path.join(year_dir, filename), synonyms, labor_word_set)
        rows.append((filename, Lrisk))
        print(f"Filename: {filename}")
        print(f"Lrisk: {Lrisk}")
        print("--------")

    # Persist the per-file scores for this year
    pd.DataFrame(rows, columns=['Filename', 'Lrisk']).to_csv(f"Lrisk_{year}.csv", index=False)
    print(f"Results for {year} saved to Lrisk_{year}.csv")

# Directory that holds one sub-folder of transcripts per year
root = 'E:\\workforce\\'

# Run the Approach 01 scoring for every year from 2001 through 2023
for year in range(2001, 2023 + 1):
    process_year(year, root, synonyms, labor_word_set)

#### Approach 02

#### Based on [Deloitte's labor-related keywords](https://action.deloitte.com/insight/3087/what-is-workforce-risk-its-broader-than-you-think), vectorize the original labor_list_words and calculate cosine similarity, categorizing them into highly relevant (score 5) and generally relevant (score 3).

`1. exact_match_threshold and partial_match_threshold: These are similarity thresholds used to classify words. If a word's highest similarity to any word in labor_words is at least the exact_match_threshold, the word is scored 5. If that similarity is at least the partial_match_threshold but below the exact_match_threshold, it is scored 3. Otherwise, it is scored 0.`

`2. Classifying words: Using a for loop to iterate through each word in words_to_classify, the similarity with all words in labor_words is calculated. Based on the similarity values, words are classified into different scores. The word and its score are then added to the classified_words list.`

`3. classified_words_df: Convert classified_words into a DataFrame and name the columns 'Word' and 'Score' for better visualization and analysis.`

`4. score_distribution: Analyze the score distribution of the classified words by calculating the number of words for each score.`

In [None]:
# Reference labor vocabulary (attributed to Deloitte in the markdown above).
# Words from the CSV are compared against this list to decide scores 5 and 3.
# NOTE(review): several entries carry a trailing comma inside the string
# ("diversity,", "equity,", "trust,", "purpose,", "well-being,",
# "compensation,", "intelligence,", "cybersecurity,") and some words repeat
# ("workforce", "workers", "ability", "data", "expectations", "jobs").
# Presumably these are artifacts of splitting source sentences on whitespace -
# confirm they do not affect the vectorizer used downstream.
labor_words = [
    "skills", "talent", "availability", "ability", "address", "changing", 
    "workforce", "expectations", "disruptions", "location", 
    "workforce-related", "regulations", "compliance", "amplified", "voice", 
    "individual", "workers", "esg", "diversity,", "equity,", "inclusion", 
    "workforce", "trust,", "purpose,", "mission", "ability", "plan", "deploy", 
    "workers", "evolving", "organizational", "needs", "well-being,", 
    "compensation,", "rewards", "data", "technology", "responsible", "use", 
    "workforce", "artificial", "intelligence,", "cybersecurity,", "data", 
    "privacy", "industries", "industry", "unemployed", "layoffs", "bureau", 
    "workers", "employees", "employee", "payroll", "unemployment", "employer", 
    "quit", "work", "opportunities", "occupations", "employing", "jobseekers",
    "labor", "openings", "rate", "hires", "job", "employment", "jobs","abilities","earnings","employers","employ","employability","employable","employed","employs","expects","expecting",'expected',"expectancy","expectancies","expectations","expectation","hire","hired",
    "hiring","include","included","includes","including","inclusive","individuals","industrial","intelligent","intellectual","intelligences","jobs","jobsecurity","jobseeker","jobseeking","labour","labors","labourmarket","layoff","wages","wage",
    "workplace","workplaces","works","worksite","worksites","workstation","workstations"
]

# Load the candidate word list from CSV (the next cell reads its 'Word' column)
file_path = "E:\\workforce\\labor_list_words.csv"
labor_list_words_df = pd.read_csv(file_path)
# Drop rows that are exact duplicates, keeping the first occurrence
labor_list_words_df = labor_list_words_df.drop_duplicates(keep='first')

In [None]:
# Extract words from the uploaded file.
# Missing values are dropped and every entry is coerced to str: read_csv can
# turn cells such as "NA" or "null" into NaN, and CountVectorizer only accepts
# strings.
words_to_classify = labor_list_words_df['Word'].dropna().astype(str).tolist()
if not words_to_classify:
    raise ValueError("labor_list_words_df['Word'] contains no words to classify")

# Combine both lists for vectorization
combined_words = labor_words + words_to_classify

# Vectorize the words using CountVectorizer
# NOTE(review): with the default word-level analyzer each single word becomes
# one token, so the cosine similarity between two words is 1.0 when they are
# the same token and 0.0 otherwise; only multi-token entries (e.g. hyphenated
# ones) can produce intermediate values. Confirm exact matching is intended -
# fuzzy matching would need a character n-gram analyzer or word embeddings.
vectorizer = CountVectorizer().fit(combined_words)
word_vectors = vectorizer.transform(combined_words)

# Calculate cosine similarity between each word in labor_words and words_to_classify
# (rows: labor_words, columns: words_to_classify)
similarity_matrix = cosine_similarity(word_vectors[:len(labor_words)], word_vectors[len(labor_words):])

# Define the thresholds for classification
exact_match_threshold = 0.8  # High similarity
partial_match_threshold = 0.4 # Moderate similarity

# Best similarity of every candidate word against the whole reference list,
# computed in one vectorised pass instead of one np.max call per word
max_similarities = similarity_matrix.max(axis=0)

# Classify the words: 5 = highly relevant, 3 = generally relevant, 0 = otherwise
classified_words = [
    (
        word,
        5 if similarity >= exact_match_threshold
        else 3 if similarity >= partial_match_threshold
        else 0,
    )
    for word, similarity in zip(words_to_classify, max_similarities)
]

# Convert the results to a DataFrame for better visualization
classified_words_df = pd.DataFrame(classified_words, columns=['Word', 'Score'])

# Analyzing the distribution of scores in the adjusted classification
score_distribution = classified_words_df['Score'].value_counts().sort_index()

# Display the score distribution
score_distribution

`1. Create an indirect word list comprising words not categorized as highly or generally relevant, indicating low relevance or irrelevance, derived from the labor_word_list.`

`2. Vectorize these indirect words and calculate cosine similarity, applying the same logic as above to categorize them into scores of 0 and 1 based on their similarity scores.`

In [None]:
# Reference vocabulary of indirectly related words. The code below compares
# every word that scored 0 in the first pass against this list and raises its
# score to 1 when the similarity is high enough.
indirect_words = [
    "jolts", "products", "manufacturing", "goods", "percent", "estimates", "trends", "united",
    "seasonally", "adjusted", "fill", "level", "market", "durable", "average", "series",
    "nondurable", "total", "persons", "experienced", "discharges", "turnover", "recession", "annual",
    "million", "levels", "demand", "peak", "number", "statistics", "interactive", "categories",
    "values", "ratio", "system", "thousands", "business", "regional", "region", "people",
    "machinery", "equipment", "information", "growth", "released", "cyclical", "services", "component",
    "measures", "nation", "supply", "economies", "decisions", "structure","development","develops","device","devices","devise","dismal","dismiss","dismissal","dismissals","dismissed","dismissing",
    "disrupted","disrupting","disruption","disruptions","disruptive","disrupts","economic","economically","economical","economics","economist","economy","eliminate","eliminated","eliminating","elimination","entrepreneur","entrepreneurial","entrepreneurs","entrepreneurship","estimate","estimated","estimating","estimation","estimators","experience","experiences","experiencing","experiential","experiment","experiments","filled","filling","grow","growing","grown","grows",
    "informative","leverage","leveraged","leverages","leveraging","machine","machines","markets","measured","measurement","measurements",
    "warehouse","warehouses","welfare"
]

# Keep only the words that scored 0 in the first pass; they are the candidates
# for an indirect match
words_to_reclassify = classified_words_df[classified_words_df['Score'] == 0]['Word'].tolist()

# Define the thresholds for reclassification
indirect_match_threshold = 0.8  # High similarity for indirect match
# Words below the indirect threshold keep their score of 0 whether or not they
# clear this lower bound; it is kept for reference only.
no_relevance_threshold = 0.4    # Lower bound for no relevance

# With no zero-scored words there is nothing to compare (and cosine_similarity
# rejects an empty matrix), so the whole step is skipped in that case
if words_to_reclassify:
    # Combine indirect words with words to reclassify for vectorization
    combined_words_for_reclassification = indirect_words + words_to_reclassify

    # Vectorize the words using CountVectorizer
    vectorizer_reclassify = CountVectorizer().fit(combined_words_for_reclassification)
    word_vectors_reclassify = vectorizer_reclassify.transform(combined_words_for_reclassification)

    # Calculate cosine similarity between each word in indirect_words and words_to_reclassify
    # (rows: indirect_words, columns: words_to_reclassify)
    similarity_matrix_reclassify = cosine_similarity(
        word_vectors_reclassify[:len(indirect_words)],
        word_vectors_reclassify[len(indirect_words):],
    )

    # Best similarity of each candidate against the whole indirect list
    max_similarity_reclassify = similarity_matrix_reclassify.max(axis=0)

    # Words similar enough to an indirect word become "Indirect Match" (score 1)
    indirect_matches = [
        word
        for word, similarity in zip(words_to_reclassify, max_similarity_reclassify)
        if similarity >= indirect_match_threshold
    ]

    # Update the scores in the original dataframe with a single assignment
    classified_words_df.loc[classified_words_df['Word'].isin(indirect_matches), 'Score'] = 1

# Display the updated score distribution
updated_score_distribution = classified_words_df['Score'].value_counts().sort_index()
updated_score_distribution

In [None]:
# Load the labor words with their scores into a dictionary.
# Each data row is "<word>,<integer score>"; the first row is a header.
# NOTE(review): no visible cell writes deloitte.csv - presumably it is an
# export of classified_words_df; confirm.
labor_word_dict = {}
with open('deloitte.csv', 'r', encoding='utf-8') as file:
    next(file, None)  # skip the header row (a no-op for an empty file)
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on them
        fields = line.split(',')
        labor_word_dict[fields[0]] = int(fields[1].strip())

# Load synonyms, removing duplicates while preserving file order
with open('synonyms.txt', 'r', encoding='utf-8') as file:
    synonyms = file.read().splitlines()
synonyms = list(dict.fromkeys(synonyms))

# Load the spaCy model with disabled components for efficiency
nlp = spacy.load('en_core_web_sm', disable=["tagger", "ner", "parser"])

def compute_prisk_for_file(filepath, synonyms, labor_word_dict, window=10):
    """Compute the score-weighted labor-risk ratio of a single transcript file.

    Every line is stripped of punctuation and split on whitespace. A labor word
    contributes its score from ``labor_word_dict`` when at least one synonym
    occurs fewer than ``window`` token positions away from it on the same line.
    Lines are scored independently, so proximity never spans a line break.

    Synonym positions and labor-word positions are both taken from the same
    whitespace token list. Earlier, synonym positions were spaCy token indices
    while labor words were indexed through ``str.split``; those two
    tokenizations are not guaranteed to line up (spaCy can, for instance, emit
    extra whitespace tokens), so the distance test compared positions from
    different index spaces.

    This definition shares its name with the Approach 01 function defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        filepath: Path of a UTF-8 encoded text file.
        synonyms: Iterable of synonym strings used as anchors.
        labor_word_dict: Mapping from labor word to its numeric score.
        window: A labor word matches when ``abs(distance) < window``.
            Defaults to 10.

    Returns:
        ``sum of matched scores / total words`` for the file, or 0 when the
        file contains no words.
    """
    # Constant-time synonym lookups, whatever container the caller passed in
    synonym_set = set(synonyms)

    matched_words_score = 0
    total_word_count = 0

    with open(filepath, 'r', encoding='utf-8') as file:
        # Process the file line by line
        for line in file:
            # Clean the text by removing punctuation
            clean_line = re.sub(r'[^\w\s]', '', line)

            # Tokenize the cleaned line
            words = clean_line.split()
            total_word_count += len(words)

            # Positions of synonyms, in the same index space as `words`
            positions = [i for i, word in enumerate(words) if word in synonym_set]
            if not positions:
                # Without an anchor on this line nothing can match
                continue

            # Add the score of every labor word that sits close enough to a synonym
            for i, word in enumerate(words):
                if word in labor_word_dict and any(abs(i - pos) < window for pos in positions):
                    matched_words_score += labor_word_dict[word]

    # Calculate the risk indicator
    if total_word_count == 0:
        return 0
    return matched_words_score / total_word_count

def process_year(year, root, synonyms, labor_word_dict, output_prefix="Lrisk"):
    """Score every ``.txt`` transcript of one year and save the scores as CSV.

    Transcripts are read from ``<root>/<year>`` in sorted filename order so the
    rows of the output are reproducible. Each file is scored with
    ``compute_prisk_for_file`` and the score is echoed to stdout.

    With the default ``output_prefix`` the result goes to ``Lrisk_<year>.csv``,
    the same file name the Approach 01 ``process_year`` writes, so running both
    approaches from one working directory overwrites the earlier results. Pass
    a different ``output_prefix`` to keep both sets of files.

    Args:
        year: Year whose sub-folder of ``root`` is processed.
        root: Directory holding one sub-folder per year.
        synonyms: Synonym strings forwarded to ``compute_prisk_for_file``.
        labor_word_dict: Word-to-score mapping forwarded to
            ``compute_prisk_for_file``.
        output_prefix: Prefix of the CSV file name. Defaults to ``"Lrisk"``.

    Returns:
        The DataFrame (columns ``Filename`` and ``Lrisk``) that was written.
    """
    # Define the path to the transcripts based on the year
    transcripts_path = os.path.join(root, str(year))
    results = []

    # Process each transcript file in a deterministic order
    for filename in sorted(os.listdir(transcripts_path)):
        if filename.endswith(".txt"):
            filepath = os.path.join(transcripts_path, filename)
            Lrisk = compute_prisk_for_file(filepath, synonyms, labor_word_dict)
            results.append((filename, Lrisk))
            print(f"Filename: {filename}")
            print(f"Lrisk: {Lrisk}")
            print("--------")

    # Create a DataFrame from the results and save it as a CSV file
    df = pd.DataFrame(results, columns=['Filename', 'Lrisk'])
    output_path = f"{output_prefix}_{year}.csv"
    df.to_csv(output_path, index=False)
    print(f"Results for {year} saved to {output_path}")
    return df

# Directory that holds one sub-folder of transcripts per year
root = 'E:\\workforce\\'

# Run the Approach 02 (score-weighted) pass for every year from 2001 through 2023
for year in range(2001, 2023 + 1):
    process_year(year, root, synonyms, labor_word_dict)