## 1. ACCESSING DATA

In [15]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer



# Display configuration: never truncate columns when rendering wide frames.
# The row limit is left at the pandas default; to lift it as well, enable:
#   pd.set_option('display.max_rows', None)
pd.options.display.max_columns = None


In [5]:
# Load the HR dataset from a CSV located in the working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# previously exported index — confirm whether it should be dropped or read with index_col=0.
df = pd.read_csv('hr_raw_data_v1.csv')

In [7]:
# Preview the first row to inspect the available columns and the raw value formats.
df.head(1)


Unnamed: 0,employee_number,gender,birth_year,age,marital_status,dist_home,job_title,department,terminated,year_at_comp,year_current_role,standard_hours,remote,business_travel,over_time,job_level,stock_opt_level,traning_times_last_year,perf_rate,year_last_promotion,year_current_mngr,education_field,education_scale,annual_salary,monthly_income,daily_rate,hourly_rate,monthly_rate,perc_salary_hike,env_sat_rate,job_involvement,job_sat_rate,relationship_sat_rate,work_life_balance,num_comp_worked,tot_working_year
0,1,0,1972,51,,6,resEArch DIREcToR,,No,20,,Full Time,Yes,,No,5,0,5,30,15,15,,3,"195370,00$","16280,83$",2015.722222,,"42330,17$",13,1,3,3,3,30,7,


## 2. DATA CLEANING

In [12]:
# Normalise the free-text categorical columns: title-case the values and
# remove surrounding whitespace. Non-object columns are left untouched.

to_title = ['marital_status', 'job_title',  'department', 'education_field']

for text_col in to_title:
    if df[text_col].dtype == 'object':
        df[text_col] = df[text_col].str.title().str.strip()

In [8]:
# Fix the misspelled "Marreid" category in marital_status.

df["marital_status"] = df["marital_status"].replace({"Marreid": "Married"})

In [13]:
# Columns whose currency-formatted strings (e.g. "195370,00$") are converted to float below.
# NOTE(review): monthly_rate also appears as a "$"-suffixed string in the sample row
# but is not listed here — confirm whether that is intentional.

convert_float = ['annual_salary','monthly_income','daily_rate']
def replace_currency(value):
    """Convert a currency string such as ``"195370,00$"`` to a float.

    The ``$`` sign is removed and the decimal comma is turned into a dot
    before parsing.

    Parameters
    ----------
    value : object
        A single cell value. Only ``str`` values are parsed.

    Returns
    -------
    float or object
        The parsed number, or ``value`` unchanged when it is not a string
        (e.g. NaN or an already numeric cell) or cannot be parsed as a number.
    """
    # Non-string cells (NaN, floats that are already numeric) pass straight through.
    if not isinstance(value, str):
        return value
    try:
        return float(value.replace("$", "").replace(",", "."))
    except ValueError:
        # Not a number once cleaned: keep the original text instead of hiding
        # unrelated failures behind a bare ``except``.
        return value



# Run the currency parser over every value of each listed column.
for currency_col in convert_float:
    df[currency_col] = df[currency_col].map(replace_currency)

In [10]:
# Ages written out as words are swapped for their numeric value.
# NOTE(review): values not listed here keep whatever they were, so the
# column's dtype may still need an explicit numeric conversion — confirm.

age_update = {
    'twenty-four': 24, 'twenty-six': 26,
    'thirty': 30, 'thirty-one': 31, 'thirty-two': 32,
    'thirty-six': 36, 'thirty-seven': 37,
    'forty-seven': 47,
    'fifty-two': 52, 'fifty-five': 55, 'fifty-eight': 58,
}

df['age'] = df['age'].replace(to_replace=age_update)

In [None]:
# Gender update (currently disabled): 0 is male and 1 is female
#gender_map = {0: "M", 1:"F"}
#df['gender'] = df['gender'].map(gender_map)

## 3. NULLS MANAGEMENT

In [20]:
# Impute the related salary columns together with a 3-nearest-neighbours imputer.

salary_base = ['monthly_income', 'daily_rate', 'annual_salary']

# Fit on the salary columns, then write the imputed values back in place.
imputer = KNNImputer(n_neighbors=3).fit(df[salary_base])
df[salary_base] = imputer.transform(df[salary_base])

In [19]:
# Confirmation: count the nulls left in each imputed salary column.
df[salary_base].isna().sum()

monthly_income    0
daily_rate        0
annual_salary     0
dtype: int64

In [None]:
# Department -> job titles that belong to it.
departments_dict = {
    "Research & Development": ["Healthcare Representative", "Laboratory Technician", "Manufacturing Director", "Research Scientist", "Research Director"],
    "Sales": ["Sales Executive", "Sales Representative"],
    "Human Resources": ["Human Resources"],
}

# Series.map looks each value up as a dictionary KEY. Mapping job titles through
# departments_dict directly would only match the title "Human Resources" and
# would return its list of titles, so invert to job title -> department first.
job_to_department = {job: dept for dept, jobs in departments_dict.items() for job in jobs}

# Fill missing departments from the job title; unknown titles stay NaN.
df['department'] = df['department'].fillna(df['job_title'].map(job_to_department))



In [None]:
# Department -> job titles that belong to it.
departments_dict = {
    "Research & Development": ["Healthcare Representative", "Laboratory Technician", "Manufacturing Director", "Research Scientist", "Research Director"],
    "Sales": ["Sales Executive", "Sales Representative"],
    "Human Resources": ["Human Resources"]
}

# Reverse the dictionary to map job titles to departments
job_to_department = {job: dept for dept, jobs in departments_dict.items() for job in jobs}


def map_department(job_title):
    """Return the department a job title belongs to.

    Parameters
    ----------
    job_title : object
        A single job title value.

    Returns
    -------
    str or None
        The department found in ``job_to_department``, or ``None`` when the
        title is not in the mapping (this includes NaN) or is not hashable.
    """
    try:
        return job_to_department[job_title]
    except (KeyError, TypeError):
        # KeyError: unknown title. TypeError: unhashable value.
        # Anything else is unexpected and is allowed to surface.
        return None

# Look up a department for every job title, then use it only where department is missing.
department_from_title = df['job_title'].map(map_department)
df['department'] = df['department'].fillna(department_from_title)



Updated DataFrame:
      employee_number  gender  birth_year age marital_status  dist_home  \
0                   1       0        1972  51            NaN          6   
1                   2       0        1971  52            NaN          1   
2                   3       0        1981  42        Married          4   
3                   4       1        1976  47        Married          2   
4                   5       1        1977  46       Divorced          3   
...               ...     ...         ...  ..            ...        ...   
1673              824       1        1980  43         Single        -26   
1674             1087       1        1976  47        Married         26   
1675              528       0        1994  29            NaN         15   
1676               76       1        1976  47       Divorced          4   
1677              401       1        1991  32         Single          2   

                   job_title              department terminated  year_at_comp  \

In [28]:
# Number of rows whose department is still null.
df['department'].isna().sum()

95

## 4. EXPORTING NEW DATA

In [None]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): this is the same path the notebook reads from at the top, so the
# original input file is overwritten — confirm this is intended, or export under a new name.
df.to_csv("hr_raw_data_v1.csv", index=False)
