In [None]:
# Install kagglehub if you need it: just uncomment the next line
# !pip install kagglehub

In [None]:
import kagglehub

# Fetch the Glassdoor Job Reviews dataset from Kaggle; `path` receives
# whatever location kagglehub reports for the downloaded files
dataset_handle = "davidgauthier/glassdoor-job-reviews-2"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

In [None]:
# Convert the csv at the variable "path" to a pd dataframe
import pandas as pd
import os

# Build the full path to the reviews CSV inside the download folder, then load it
reviews_csv_path = os.path.join(path, "all_reviews.csv")
jobs_org_df = pd.read_csv(reviews_csv_path)

In [None]:
# Preview the first five rows of the raw reviews
jobs_org_df.head(n=5)

In [None]:
# Function to extract and format the firm name
def extract_firm_name(link):
    """Return the firm name embedded in a Glassdoor review link.

    The last path segment of a review link has the form
    ``<Firm-Name>-Reviews-E<id>.htm``. The firm name is recovered by
    discarding the final two dash-separated tokens (``Reviews`` and
    ``E<id>.htm``) and joining the remaining tokens with spaces.

    Args:
        link: Relative or absolute review link, e.g.
            ``'Reviews/Calgary-Flames-Reviews-E5247.htm'``.

    Returns:
        The firm name with dashes replaced by spaces. An empty string is
        returned when the last segment has two or fewer dash-separated
        tokens. ``None`` is returned when ``link`` is not a string (for
        example a NaN produced by a missing value in a DataFrame column).
    """
    # Missing values reach this function as NaN/None when it is applied over
    # a column; report them as missing instead of failing on `.split`.
    if not isinstance(link, str):
        return None

    # Keep only the text after the final slash (the file name)
    file_name = link.rsplit("/", 1)[-1]
    # Drop the trailing "Reviews" and "E<id>.htm" tokens, keep the name tokens
    name_tokens = file_name.split("-")[:-2]
    return " ".join(name_tokens)

# Sanity-check the helper against one relative and one absolute review link
relative_link = 'Reviews/Baja-Steel-and-Fence-Reviews-E5462645.htm'
absolute_link = 'https://www.glassdoor.com/Reviews/Calgary-Flames-Reviews-E5247.htm'
print(f"Extract firm from relative path: {extract_firm_name(relative_link)}")
print(f"Extract firm from absolute path: {extract_firm_name(absolute_link)}")


In [None]:
# Derive a "firm_name" column by running every review link through the helper
firm_links = jobs_org_df["firm_link"]
jobs_org_df["firm_name"] = firm_links.apply(extract_firm_name)



In [None]:
# Summarise the firm names extracted from the raw dataset
org_firm_names = jobs_org_df["firm_name"]
# Distinct firm names
print(f"\nUnique firm names: {org_firm_names.unique()}")
# How many distinct firm names there are
print(f"\nNumber of unique firm names: {org_firm_names.nunique()}")
# Review count for each firm
print(f"\nNumber of reviews per firm: {org_firm_names.value_counts()}")
# Overall number of reviews (non-null firm names)
print(f"\nTotal reviews: {org_firm_names.count()}")

In [None]:
# Keep only reviews whose firm has at least 1,000 reviews in the raw dataset;
# rows belonging to smaller firms are discarded
reviews_per_firm = jobs_org_df["firm_name"].value_counts()
has_enough_reviews = jobs_org_df["firm_name"].map(reviews_per_firm) >= 1000
jobs_df = jobs_org_df[has_enough_reviews]

In [None]:
# Summarise the firm names that survived the review-count filter
filtered_firm_names = jobs_df["firm_name"]
# Distinct firm names
print(f"\nUnique firm names: {filtered_firm_names.unique()}")
# How many distinct firm names there are
print(f"\nNumber of unique firm names: {filtered_firm_names.nunique()}")
# Review count for each firm
print(f"\nNumber of reviews per firm: {filtered_firm_names.value_counts()}")
# Overall number of reviews (non-null firm names)
print(f"\nTotal reviews: {filtered_firm_names.count()}")

In [None]:
# Tally the missing (null/NaN) entries in every column
null_counts = jobs_df.isna().sum()
print(f"\nCount of null or NaN entries for each column:\n{null_counts}")


In [None]:
# Remove the "advice" and "index" columns before filtering out rows with nulls.
# NOTE(review): the notebook treats these as almost entirely NULL; confirm
# against the per-column null counts printed by the previous cell.
columns_to_drop = ["advice", "index"]
jobs_df = jobs_df.drop(columns=columns_to_drop)

In [None]:
# Keep only fully populated rows: a null/NaN in any column removes the row
jobs_df = jobs_df.dropna(axis=0, how="any")

# Re-tally the missing entries per column to confirm none remain
remaining_nulls = jobs_df.isna().sum()
print(f"\nCount of null or NaN entries for each column:\n{remaining_nulls}")

In [None]:
# Frequency of every distinct value found in the "status" column
status_counts = jobs_df['status'].value_counts()
print(f"\nCounts of each unique value in the 'status' column:\n{status_counts}")


In [None]:
print("AFTER DATA CLEANING:")
# Summarise the firm names remaining once null rows have been removed
cleaned_firm_names = jobs_df["firm_name"]
# Distinct firm names
print(f"\nUnique firm names: {cleaned_firm_names.unique()}")
# How many distinct firm names there are
print(f"\nNumber of unique firm names: {cleaned_firm_names.nunique()}")
# Review count for each firm
print(f"\nNumber of reviews per firm: {cleaned_firm_names.value_counts()}")
# Overall number of reviews (non-null firm names)
print(f"\nTotal reviews: {cleaned_firm_names.count()}")

In [None]:
# JONATHAN - STOP HERE! UNCOMMENT THE SAVE TO CSV LINE
# Export the cleaned dataset to a CSV file
# jobs_df.to_csv("Resources/cleaned_glassdoor_reviews_large.csv", index=False)

In [None]:
# Write the cleaned dataframe to disk as a series of CSV files holding at most
# `rows_step` rows each
total_rows = jobs_df.shape[0]
rows_step = 250000
start_index = 0
step_number = 0

# to_csv does not create missing parent folders, so make sure the output
# directory exists before the first chunk is written
chunk_dir = "Resources/chunked"
os.makedirs(chunk_dir, exist_ok=True)

# Loop through the dataframe in chunks of 250,000 rows
while start_index < total_rows:
    # The final chunk may be shorter than rows_step
    end_index = min(start_index + rows_step, total_rows)
    chunk_df = jobs_df.iloc[start_index:end_index]

    # Save the chunk to a CSV file; the file name is keyed by the chunk's
    # starting row position, not by the step number
    chunk_df.to_csv(f"{chunk_dir}/cleaned_glassdoor_reviews_large_{start_index}.csv", index=False)

    # Print progress
    print(f"Saved chunk {step_number} from index {start_index} to {end_index}")
    print(f"Chunk {step_number} shape: {chunk_df.shape}")

    # Update the start index for the next chunk
    start_index += rows_step

    # step_number only labels the progress messages above
    step_number += 1
    

In [None]:
# Draw a reproducible random sample of up to 500,000 reviews.
# DataFrame.sample (without replacement) raises ValueError when n exceeds the
# number of rows, so cap the sample size at the size of the cleaned frame.
sample_size = min(500000, len(jobs_df))
jobs_sample_df = jobs_df.sample(n=sample_size, random_state=1)


In [None]:
print("AFTER DATA CLEANING:")
# Summarise the firm names present in the random sample
sample_firm_names = jobs_sample_df["firm_name"]
# Distinct firm names
print(f"\nUnique firm names: {sample_firm_names.unique()}")
# How many distinct firm names there are
print(f"\nNumber of unique firm names: {sample_firm_names.nunique()}")
# Review count for each firm
print(f"\nNumber of reviews per firm: {sample_firm_names.value_counts()}")
# Overall number of reviews (non-null firm names)
print(f"\nTotal reviews: {sample_firm_names.count()}")

In [None]:
# Keep only sampled reviews whose firm has at least 600 reviews in the sample;
# rows belonging to firms with fewer than 600 sampled reviews are discarded
sample_reviews_per_firm = jobs_sample_df["firm_name"].value_counts()
has_enough_sampled_reviews = jobs_sample_df["firm_name"].map(sample_reviews_per_firm) >= 600
jobs_sample_reduced_df = jobs_sample_df[has_enough_sampled_reviews]

In [None]:
print("AFTER DATA CLEANING:")
# Summarise the firm names left in the reduced sample
reduced_firm_names = jobs_sample_reduced_df["firm_name"]
# Distinct firm names
print(f"\nUnique firm names: {reduced_firm_names.unique()}")
# How many distinct firm names there are
print(f"\nNumber of unique firm names: {reduced_firm_names.nunique()}")
# Review count for each firm
print(f"\nNumber of reviews per firm: {reduced_firm_names.value_counts()}")
# Overall number of reviews (non-null firm names)
print(f"\nTotal reviews: {reduced_firm_names.count()}")

In [None]:
# Export the cleaned, reduced sample to a CSV file.
# to_csv does not create missing parent folders, so make sure "Resources"
# exists first (a no-op when the folder is already there).
os.makedirs("Resources", exist_ok=True)
jobs_sample_reduced_df.to_csv("Resources/cleaned_glassdoor_reviews.csv", index=False)