In [18]:
import os
import pandas as pd

from sklearn.preprocessing import StandardScaler

In [19]:
# Cut-off points of COVID-19
before_date = "2020-01-30"
after_date = "2021-08-23"

In [20]:
# Specify the path to the folder containing the data (relative)
path = os.path.join("..", "data")

# File containing the stress entities
txt_fpath = os.path.join(path, "extracted", "stress_entities.txt")

In [21]:
# Load the TXT file into a Python List object
stress_entities = []

with open(txt_fpath, "r") as file:
    text = file.read()

    text = text.replace("'", "")
    text = text.replace("[", "").replace("]", "")
    
    stress_entities = text.split(",")

In [22]:
# Reading reviews
fpath = os.path.join(path, "new_reviews.csv")
reviews = pd.read_csv(fpath, low_memory=False)

In [23]:
# Extract the target variable, y
# 1 if review includes stress mention otherwise 0
mask = reviews["summary"].str.contains("|".join(stress_entities), case=False)
reviews["has_stress"] = mask

reviews['has_stress'] = reviews['has_stress'].astype(int)
reviews[reviews["has_stress"] == 1][["has_stress", "summary"]].sample(5)

Unnamed: 0,has_stress,summary
3481596,1,Good environment makes one to work stress free
1227925,1,stressful
6799351,1,"Young aggressive sales force, long hours & hig..."
4691638,1,"Stressful, but Great for a First Job"
6134444,1,Flexible Hours with seasonal heavy workload


In [24]:
reviews["review_date_time"] = pd.to_datetime(reviews["review_date_time"])
reviews["year"] = reviews["review_date_time"].dt.year

In [25]:
# Create dummy variables for the cut-off points
reviews["DURING_COVID"] = ((reviews["review_date_time"] >= before_date) & (reviews["review_date_time"] <= after_date)).astype(int)
reviews["POST_COVID"] = (reviews["review_date_time"] > after_date).astype(int)

In [26]:
# Define the 6-month intervals
six_month_intervals = pd.date_range(start=before_date, end=after_date, freq='6M')

for i in range(len(six_month_intervals) - 1):
    start_period = six_month_intervals[i]
    end_period = six_month_intervals[i + 1]
    reviews[f"during_covid_{i+1}_6_months"] = ((reviews["review_date_time"] > start_period) & 
                                               (reviews["review_date_time"] <= end_period)).astype(int)

In [31]:
# One hot encoding
reviews = pd.get_dummies(reviews, columns=["employment_status"], drop_first=True)

In [32]:
# Extract year
reviews["review_date_time"] = pd.to_datetime(reviews["review_date_time"])
reviews["year"] = reviews["review_date_time"].dt.year

In [33]:
reviews.columns

Index(['review_date_time', 'rating_overall', 'rating_ceo',
       'rating_business_outlook', 'rating_work_life_balance',
       'rating_culture_and_values', 'rating_diversity_and_inclusion',
       'rating_senior_leadership', 'rating_recommend_to_friend',
       'rating_career_opportunities', 'rating_compensation_and_benefits',
       'is_current_job', 'length_of_employment', 'job_title', 'location',
       'pros', 'cons', 'summary', 'count_helpful', 'company_id', 'org_uuid',
       'amount_of_funding_rounds_until_now', 'total_funding_until_now',
       'date_founded_glassdoor', 'date_founded_crunchbase', 'has_stress',
       'year', 'DURING_COVID', 'POST_COVID', 'during_covid_1_6_months',
       'during_covid_2_6_months', 'during_covid_3_6_months',
       'employment_status_FREELANCE', 'employment_status_INTERN',
       'employment_status_PART_TIME', 'employment_status_REGULAR',
       'employment_status_RESERVE', 'employment_status_SELF_EMPLOY',
       'employment_status_TEMPORARY', 

In [34]:
labels = ["job_title", "location", "pros", "cons", "summary", "org_uuid", "date_founded_glassdoor", "date_founded_crunchbase"]
reviews.drop(labels=labels, axis=1, inplace=True)

In [35]:
labels = ["rating_ceo", "rating_business_outlook", "rating_culture_and_values", "rating_diversity_and_inclusion", "rating_senior_leadership", "rating_career_opportunities", "rating_compensation_and_benefits", "rating_recommend_to_friend"]
reviews.drop(labels=labels, axis=1, inplace=True)

In [36]:
reviews.columns

Index(['review_date_time', 'rating_overall', 'rating_work_life_balance',
       'is_current_job', 'length_of_employment', 'count_helpful', 'company_id',
       'amount_of_funding_rounds_until_now', 'total_funding_until_now',
       'has_stress', 'year', 'DURING_COVID', 'POST_COVID',
       'during_covid_1_6_months', 'during_covid_2_6_months',
       'during_covid_3_6_months', 'employment_status_FREELANCE',
       'employment_status_INTERN', 'employment_status_PART_TIME',
       'employment_status_REGULAR', 'employment_status_RESERVE',
       'employment_status_SELF_EMPLOY', 'employment_status_TEMPORARY',
       'employment_status_UNKNOWN'],
      dtype='object')

In [37]:
# Scaling the data
scaler = StandardScaler()

# List of columns to scale
cols_to_scale = ["rating_overall", "rating_work_life_balance", "length_of_employment",
                 "count_helpful", "amount_of_funding_rounds_until_now", 
                 "total_funding_until_now"]

# Scale selected columns
reviews[cols_to_scale] = scaler.fit_transform(reviews[cols_to_scale])


In [38]:
reviews.to_csv("../data/stata_v1.csv", index=False)