In [58]:
import html
import re

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, Date, ForeignKey

# Print every column of a frame instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

# Raw Glassdoor export; the path is relative to the current working directory.
df = pd.read_csv("glassdoor_dataset/glassdoor.csv")


# The main glassdoor.csv file carries many HTML-related columns that are not
# relevant for analysis, so only the columns listed below are carried forward.
columns_to_keep = [
    "gaTrackerData.jobTitle",
    "gaTrackerData.location",
    "gaTrackerData.locationType",
    "gaTrackerData.sector",
    "header.easyApply",
    "header.employerName",
    "header.jobTitle",
    "header.location",  # Location City (most of times)
    "header.posted",  # Date job was posted
    "header.rating",  # Company rating by employees
    # Misleading column name: it actually indicates the presence of the
    # "2019 Glassdoor Best Place to Work" award on the job posting.
    "header.urgencyLabel",
    "header.payHigh",
    "header.payMed",
    "header.payLow",
    "job.description",
    "job.jobSource",
    "map.country",  # do something with country_names_2_digit_codes
    "map.employerName",
    "map.lat",  # 0 for NaN
    "map.lng",  # 0 for NaN
    "map.location",
    "overview.foundedYear",  # 0 for NaN
    "overview.hq",
    "overview.industry",
    "overview.revenue",
    "overview.sector",
    "overview.size",
    "overview.stock",
    "overview.type",
    "overview.description",
    "overview.mission",
    "overview.competitors",  # foreign key
    "rating.ceo.name",
    "rating.ceoApproval",  # <0 or NaN for missing data
    "rating.recommendToFriend",  # <0 or NaN for missing data
    "rating.starRating",
    "benefits.comments",  # foreign key
    "benefits.highlights",  # foreign key
    "reviews",  # foreign key
    "salary.salaries",  # foreign key
    "wwfu",  # foreign key
]

# All rows, only the selected columns, in the order given above.
filtered_df = df.loc[:, columns_to_keep]


def _replace_with_nan(frame, columns, is_invalid):
    """Overwrite invalid entries of the given columns with NaN.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to clean; each listed column is reassigned on this object.
    columns : list of str
        Names of the columns to clean.
    is_invalid : callable
        Receives one column (a Series) and returns a boolean mask that is
        True wherever the value must be discarded.
    """
    for name in columns:
        values = frame[name]
        # Existing NaN stay NaN: comparisons against NaN evaluate to False,
        # so they are simply left as they are.
        frame[name] = values.mask(is_invalid(values))


# Values below these thresholds are treated as erroneous rather than real.
MIN_PLAUSIBLE_SALARY = 10000
MIN_PLAUSIBLE_FOUNDED_YEAR = 1000


# Replace negative values with NaN
columns_to_replace_neg = [
    "header.rating",
    "rating.starRating",
    "rating.ceoApproval",
    "rating.recommendToFriend",
]
_replace_with_nan(filtered_df, columns_to_replace_neg, lambda values: values < 0)


# Replace zero values with NaN
columns_to_replace_zero = [
    "map.lat",
    "map.lng",
]
_replace_with_nan(filtered_df, columns_to_replace_zero, lambda values: values == 0)


# Replace outlier/erroneous low salaries values with NaN.
# "salary.salaries" is deliberately NOT listed: the column selection marks it
# as a foreign key, so applying a salary threshold to it would blank every
# reference below the threshold and break the link to the salaries data.
# NOTE(review): confirm that the column indeed holds ids rather than amounts.
columns_to_replace_outlier_salaries = [
    "header.payHigh",
    "header.payMed",
    "header.payLow",
]
_replace_with_nan(
    filtered_df,
    columns_to_replace_outlier_salaries,
    lambda values: values < MIN_PLAUSIBLE_SALARY,
)


# Replace outlier/erroneous years values with NaN
columns_to_replace_outlier_years = [
    "overview.foundedYear",
]
_replace_with_nan(
    filtered_df,
    columns_to_replace_outlier_years,
    lambda values: values < MIN_PLAUSIBLE_FOUNDED_YEAR,
)


# Discard rows in which every column is missing, then collapse exact duplicate
# rows. The surviving rows keep their original index labels.
filtered_df = filtered_df.dropna(how="all").drop_duplicates()


def clean_job_description(text):
    """Turn an HTML job description into lowercase, punctuation-free plain text.

    Parameters
    ----------
    text : str or None or NaN
        Raw description, possibly containing HTML tags and entities.

    Returns
    -------
    str or None or NaN
        The cleaned text: tags removed, entities decoded, lowercased,
        punctuation replaced by spaces and whitespace collapsed to single
        spaces. Missing input (None / NaN) is returned unchanged.
    """
    if text is None or pd.isna(text):
        return text

    # Replace tags with a space rather than nothing, otherwise the words on
    # either side of a tag ("<p>foo</p><p>bar</p>") get glued together.
    text = re.sub(r"<[^>]+>", " ", text)

    # Decode entities instead of deleting them: this covers named, decimal and
    # hexadecimal forms, and turns "&nbsp;" into whitespace so that it still
    # separates words. Decoded punctuation is stripped by the step below.
    text = html.unescape(text)

    text = text.lower()

    # Anything that is neither a word character nor whitespace becomes a space.
    text = re.sub(r"[^\w\s]+", " ", text)

    # Collapse runs of whitespace, then trim both ends.
    text = re.sub(r"\s+", " ", text)
    return text.strip()



# Normalise the free-text job descriptions element by element.
filtered_df['job.description'] = filtered_df['job.description'].apply(clean_job_description)


# Share of non-missing values per column, in percent.
non_nan_percentage = (filtered_df.count() / len(filtered_df) * 100).round(2)

print(f"dtypes : \n{filtered_df.dtypes}\n\n")
# Attach the percent sign to every value; appending it after the Series repr
# would put it behind the trailing "dtype: ..." footer instead.
print(f"Non-NaN percentage : \n{non_nan_percentage.map('{:.2f}%'.format).to_string()}")
print("\n\n\n%%%%%%%%%%%%%%%%%%%%%%%%\n\n\n")
print(filtered_df.head(60))
print(filtered_df.columns)
print(f"There are {len(filtered_df.columns)} columns")
print(filtered_df.describe())

# print(filtered_df['job.description'][22])
# print(filtered_df['job.description'][23])


dtypes : 
gaTrackerData.jobTitle         object
gaTrackerData.location         object
gaTrackerData.locationType     object
gaTrackerData.sector           object
header.easyApply                 bool
header.employerName            object
header.jobTitle                object
header.location                object
header.posted                  object
header.rating                 float64
header.urgencyLabel            object
header.payHigh                float64
header.payMed                 float64
header.payLow                 float64
job.description                object
job.jobSource                  object
map.country                    object
map.employerName               object
map.lat                       float64
map.lng                       float64
map.location                   object
overview.foundedYear          float64
overview.hq                    object
overview.industry              object
overview.revenue               object
overview.sector                object
ov

In [62]:
# Label-based lookup: 500 is an index label, not a position, so this raises
# KeyError if that row is no longer in the frame.
print(filtered_df.loc[500, 'job.description'])


free now encourages a strong team spirit and simple company culture our teams organise themselves in established agile working environments people at free now work flexibly and independently every single employee is a part of our success story and so open communication and a fair feedback culture come naturally to us we are passionate problem solvers who listen and understand before we act by working together with each other customers drivers and cities we make things better besides a lot of fun and potential to make a real difference you will have the opportunity to work with the best team in the world we know everybody says so but at free now it s true as of now we re looking for a motivated and experienced senior frontend engineer you will be part of a newly founded team of motivated developers in our development hub in barcelona in close collaboration with the development teams in our headquarters in hamburg you will work on our main products these will be your responsibilities and