In [1]:
import pandas as pd
import numpy as np

# Load the compensation CSV that the remaining cells operate on.
df = pd.read_csv("Employee_Compensation_Accurate.csv")

# Check 1: number of missing entries in each column.
missing_counts = df.isna().sum()
print("Missing values:\n", missing_counts)

# Check 2: negative and infinite entries, restricted to numeric columns.
num_cols = df.select_dtypes(include=[np.number]).columns
numeric_part = df[num_cols]
print("\nNegative values:\n", numeric_part.lt(0).sum())
print("\nInfinite values:\n", np.isinf(numeric_part).sum())

# Check 3: descriptive statistics as produced by DataFrame.describe().
print("\nStatistical summary:\n", df.describe())

# Check 4: for every object-dtype column, report its cardinality and
# show up to ten distinct values to eyeball spelling/casing consistency.
text_cols = df.select_dtypes(include='object').columns
for col in text_cols:
    distinct = df[col].unique()
    print(f"\n{col} → {df[col].nunique()} unique values")
    print(distinct[:10])  # first 10 distinct values only

Missing values:
 Year                      0
Organization_Group        0
Department                0
Job_Family                0
Job                       0
Employment_Type           0
Salaries                  0
Overtime                  0
Other_Salaries            0
Total_Salary              0
Retirement                0
Health_and_Dental         0
Other_Benefits            0
Total_Benefits            0
Total_Compensation        0
Union                     0
Total_Pay                 0
Pay_to_Benefit_Ratio    116
Overtime_Ratio            0
Benefits_Percentage       0
Overtime_Percentage       0
dtype: int64

Negative values:
 Year                      0
Salaries                  0
Overtime                  0
Other_Salaries          128
Total_Salary              0
Retirement                0
Health_and_Dental         0
Other_Benefits          490
Total_Benefits            0
Total_Compensation        0
Total_Pay                 0
Pay_to_Benefit_Ratio      0
Overtime_Ratio            0

In [2]:
# Replace negative amounts in the two columns below with their absolute
# value, handling both columns in a single assignment.
# NOTE(review): this flips the sign instead of dropping or zeroing the rows.
# The total/ratio columns are not recomputed here, so they presumably still
# reflect the original signed values — confirm this is intended.
sign_flipped_cols = ["Other_Salaries", "Other_Benefits"]
df[sign_flipped_cols] = df[sign_flipped_cols].abs()

In [4]:
# Treat +/-inf in the ratio as missing, then fill every missing entry with
# the median of the remaining finite values (median() skips NaN by default).
ratio = df["Pay_to_Benefit_Ratio"].replace([np.inf, -np.inf], np.nan)
df["Pay_to_Benefit_Ratio"] = ratio.fillna(ratio.median())

In [5]:
# Columns that are forced to a numeric dtype before the export.
num_cols = [
    "Salaries", "Overtime", "Other_Salaries", "Total_Salary",
    "Retirement", "Health_and_Dental", "Other_Benefits",
    "Total_Benefits", "Total_Compensation",
]

# errors="coerce" turns every unparsable value into NaN without raising.
# Record the NaN counts beforehand so any value lost to coercion is
# reported instead of silently ending up as a blank in the exported file.
nulls_before = df[num_cols].isna().sum()
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")
coerced_counts = df[num_cols].isna().sum() - nulls_before

if coerced_counts.any():
    print("Values coerced to NaN by to_numeric:\n", coerced_counts[coerced_counts > 0])

In [6]:
# Persist the frame to the output CSV; index=False keeps the row index
# from being written as an extra column.
df.to_csv(path_or_buf="Employee_Compensation_Final_For_Tableau.csv", index=False)