In [16]:
import pandas as pd
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, Date, ForeignKey


# Show every column when a DataFrame is printed instead of eliding the middle.
pd.options.display.max_columns = None

# Load the raw Glassdoor job-postings export.
df = pd.read_csv(filepath_or_buffer="glassdoor_dataset/glassdoor.csv")


# Narrow the raw glassdoor.csv frame down to the columns worth analysing.
# Many of the remaining columns in the file are HTML-related elements that
# carry little analytical value.
columns_to_keep = [
    # --- tracker data ---
    "gaTrackerData.jobTitle",
    "gaTrackerData.location",
    "gaTrackerData.locationType",
    "gaTrackerData.sector",
    # --- posting header ---
    "header.easyApply",
    "header.employerName",
    "header.jobTitle",
    "header.location",  # location city (most of the time)
    "header.posted",  # date the job was posted
    "header.rating",  # company rating given by employees
    # Misleading column name: it actually flags the presence of the
    # "2019 Glassdoor Best Place to Work" award on the job posting.
    "header.urgencyLabel",
    "header.payHigh",
    "header.payMed",
    "header.payLow",
    # --- job body ---
    "job.description",
    "job.jobSource",
    # --- map ---
    "map.country",  # TODO: do something with country_names_2_digit_codes
    "map.employerName",
    "map.lat",  # 0 stands in for NaN
    "map.lng",  # 0 stands in for NaN
    "map.location",
    # --- employer overview ---
    "overview.foundedYear",  # 0 stands in for NaN
    "overview.hq",
    "overview.industry",
    "overview.revenue",
    "overview.sector",
    "overview.size",
    "overview.stock",
    "overview.type",
    "overview.description",
    "overview.mission",
    "overview.competitors",  # foreign key
    # --- ratings ---
    "rating.ceo.name",
    "rating.ceoApproval",  # <0 or NaN means missing data
    "rating.recommendToFriend",  # <0 or NaN means missing data
    "rating.starRating",
    # --- references to other tables ---
    "benefits.comments",  # foreign key
    "benefits.highlights",  # foreign key
    "reviews",  # foreign key
    "salary.salaries",  # foreign key
    "wwfu",  # foreign key
]

# All rows, only the selected columns (raises KeyError if one is absent).
filtered_df = df.loc[:, columns_to_keep]


# Replace negative values with NaN: in these columns a negative number is
# treated as a marker for missing data rather than a real measurement.
columns_to_replace_neg = [
    "header.rating",
    "rating.ceoApproval",
    "rating.recommendToFriend",
]

for column in columns_to_replace_neg:
    # Vectorized masking instead of a per-element Python lambda.
    # `NaN < 0` evaluates to False, so values that are already NaN are left
    # untouched, exactly as before.
    filtered_df[column] = filtered_df[column].mask(filtered_df[column] < 0)


# Replace zero values with NaN: in these columns 0 is used as a placeholder
# for a missing value (a latitude/longitude or founding year of exactly 0 is
# not meaningful for this dataset).
columns_to_replace_zero = [
    "map.lat",
    "map.lng",
    "overview.foundedYear",
]

for column in columns_to_replace_zero:
    # Vectorized masking instead of a per-element Python lambda.
    # `NaN == 0` evaluates to False, so existing NaNs pass through unchanged.
    filtered_df[column] = filtered_df[column].mask(filtered_df[column] == 0)


# Quick visual sanity check of the cleaned frame: first 60 rows, the column
# index, the column count, and summary statistics.
preview = filtered_df.head(n=60)
print(preview)

print(filtered_df.columns)
print(f"There are {len(filtered_df.columns)} columns")

summary = filtered_df.describe()
print(summary)

                               gaTrackerData.jobTitle  \
0                  Biogas Project Development Manager   
1   Quality Manager - Boehringer Ingelheim Healthc...   
2      Senior Software Engineer (PHP, Elixir, Python)   
3                                 Senior SQA Engineer   
4                                    Research Manager   
5                             PS Technical Consultant   
6                                 B2B Product Manager   
7   Post-Doctoral Research Visit F/M Microscopic d...   
8   Technical Consultant for Microsoft Infrastruct...   
9                              ETL/Database Developer   
10  Digital Product Manager, Digital Customer Onbo...   
11                                    Program Manager   
12            Key Account Manager - Private Hospitals   
13                                   Big Data Analyst   
14                            Partner Success Manager   
15  Business Analyst IFRS 17 Implementation Paris F/H   
16                             