#### IMPORTS

In [1]:
import pandas as pd
import numpy as np
import re


#pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv('../files/hr_raw_data.csv', index_col=0)


In [3]:
df.head(3)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,51,No,,2015.722222,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,"16280,83$","42330,17$",7,Y,No,13,30,3,Full Time,0,,5,30.0,20,,15,15,"16280,83$",1972,"195370,00$",,,Yes
1,52,No,,2063.388889,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,"43331,17$",0,,,14,30,1,,1,340.0,5,30.0,33,,11,9,,1971,"199990,00$",,,1
2,42,No,travel_rarely,1984.253968,Research & Development,4,2,Technical Degree,1,3,3,0,,3,5,ManaGER,4,Married,,"41669,33$",1,,No,11,30,4,,0,220.0,3,,22,,11,15,,1981,"192320,00$",ManaGER - Research & Development,,1


#### VARIABLES AND FUNCCIONS

In [8]:
#COLUMN RENAMES:

title_mapping = {"employeenumber": "employee_number",
                "gender": "gender",
                "datebirth": "birth_year",
                "age": "age",
                "maritalstatus": "marital_status",
                "jobrole": "job_title",
                "department": "department",
                "attrition": "terminated",
                "standardhours": "standard_hours",
                "monthlyincome": "monthly_income",
                "remotework": "remote",
                "businesstravel": "business_travel",
                "dailyrate": "daily_rate",
                "distancefromhome": "dist_home",
                "educationfield": "education_field",
                "education": "education_scale",
                "environmentsatisfaction": "env_sat_rate",
                "hourlyrate": "hourly_rate",
                "jobinvolvement": "job_involvement",
                "joblevel": "job_level",
                "jobsatisfaction": "job_sat_rate",
                "monthlyrate": "monthly_rate",
                "numcompaniesworked": "num_comp_worked",
                "overtime": "over_time",
                "percentsalaryhike": "perc_salary_hike",
                "performancerating": "perf_rate",
                "relationshipsatisfaction": "relationship_sat_rate",
                "stockoptionlevel": "stock_opt_level",
                "totalworkingyears": "tot_working_year",
                "trainingtimeslastyear": "traning_times_last_year",
                "worklifebalance": "work_life_balance",
                "yearsatcompany": "year_at_comp",
                "yearsincurrentrole": "year_current_role",
                "yearssincelastpromotion": "year_last_promotion",
                "yearswithcurrmanager": "year_current_mngr",
                "salary": "annual_salary",
                "roledepartament": "role_department"}

#CATEGORIES: 

columns_personal =  ['employee_number', 
                    'gender', 
                    'birth_year', 
                    'age', 
                    'marital_status',
                    'dist_home']

columns_job =   ['job_title',
                 'department',
                'terminated',
                'year_at_comp',
                'year_current_role',
                'standard_hours',
                'remote',
                'business_travel',
                'over_time', 
                'job_level', 
                'stock_opt_level', 
                'traning_times_last_year', 
                'perf_rate',
                'year_last_promotion',
                'year_current_mngr']

columns_education = ['education_field',
                    'education_scale']


columns_income =    ['annual_salary',
                    'monthly_income',
                    'daily_rate',
                    'hourly_rate',
                    'monthly_rate',
                    'perc_salary_hike']

columns_satisfaction =  ['env_sat_rate',
                        'job_involvement',
                        'job_sat_rate',
                        'relationship_sat_rate',
                        'work_life_balance']

columns_emp_bgd =   ['num_comp_worked',
                    'tot_working_year']


#COLUMN REORDER:

new_order_columns = columns_personal+columns_job+columns_education+columns_income+columns_satisfaction+columns_emp_bgd

def reorder_columns(df, list_columns):

    #Reorder columns as per provided list, add all the missing ones at the end.
    
    try:
        df = df[new_order_columns]
        extra_columns = [col for col in df.columns if col not in new_order_columns]
        df = df[ new_order_columns + extra_columns ]
    
    except KeyError as e:
        print(f"KeyError: {e}")
        missing_columns = [col for col in new_order_columns if col not in df.columns]
        print(f"Missing columns: {missing_columns}")

        extra_columns = [col for col in df.columns if col not in new_order_columns]
        df = df[ new_order_columns + extra_columns ]
    
    return df


#DATA ANALYSIS

def data_analysis (dataframe, columns):

    print('📌CATEGORY ANALYSIS:\n')
    display(df[columns_personal].describe().T)
    print('\n')

    for col in columns:
        
        print(f'📌Columna: {col.upper()} \n')
        print('Null counter:', dataframe[col].isnull().sum(), '\n')
        print('These are the unique values:', dataframe[col].unique(),'\n')
        print('These are values counter:', dataframe[col].value_counts(dropna=False),'\n')
        print('-------------')


#### STRUCTURE CLEANING

In [None]:
#Splitting column "roledepartament"
#df[["role_1", "department_1"]]=df["roledepartament"].str.split("-", expand=True).get([0, 1])

KeyError: 'roledepartament'

In [5]:
#RENAME
df = df.rename(columns=title_mapping)

In [6]:
df.head(1)

Unnamed: 0,age,terminated,business_travel,daily_rate,department,dist_home,education_scale,education_field,employeecount,employee_number,env_sat_rate,gender,hourly_rate,job_involvement,job_level,job_title,job_sat_rate,marital_status,monthly_income,monthly_rate,num_comp_worked,over18,over_time,perc_salary_hike,perf_rate,relationship_sat_rate,standard_hours,stock_opt_level,tot_working_year,traning_times_last_year,work_life_balance,year_at_comp,year_current_role,year_last_promotion,year_current_mngr,sameasmonthlyincome,birth_year,annual_salary,role_department,numberchildren,remote
0,51,No,,2015.722222,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,"16280,83$","42330,17$",7,Y,No,13,30,3,Full Time,0,,5,30,20,,15,15,"16280,83$",1972,"195370,00$",,,Yes


In [9]:
#REORDER
df = reorder_columns(df, new_order_columns)

In [13]:
df.head()

Unnamed: 0,employee_number,gender,birth_year,age,marital_status,dist_home,job_title,department,terminated,year_at_comp,year_current_role,standard_hours,remote,business_travel,over_time,job_level,stock_opt_level,traning_times_last_year,perf_rate,year_last_promotion,year_current_mngr,education_field,education_scale,annual_salary,monthly_income,daily_rate,hourly_rate,monthly_rate,perc_salary_hike,env_sat_rate,job_involvement,job_sat_rate,relationship_sat_rate,work_life_balance,num_comp_worked,tot_working_year
0,1,0,1972,51,,6,Research Director,,No,20,,Full Time,Yes,,No,5,0,5,30,15,15,,3,"195370,00$","16280,83$",2015.722222,,"42330,17$",13,1,3,3,3,30.0,7,
1,2,0,1971,52,,1,Manager,,No,33,,,1,,,5,1,5,30,11,9,Life Sciences,4,"199990,00$",,2063.388889,,"43331,17$",14,3,2,3,1,30.0,0,340.0
2,3,0,1981,42,Married,4,Manager,Research & Development,No,22,,,1,travel_rarely,No,5,0,3,30,11,15,Technical Degree,2,"192320,00$",,1984.253968,,"41669,33$",11,3,3,4,4,,1,220.0
3,4,1,1976,47,Married,2,Research Director,,No,20,,Full Time,False,travel_rarely,,4,2,2,30,5,6,Medical,4,"171690,00$","14307,50$",1771.404762,,"37199,50$",19,1,3,3,2,,3,
4,5,1,1977,46,Divorced,3,Sales Executive,,No,19,,,0,,No,4,1,5,30,2,8,Technical Degree,3,,"12783,92$",1582.771346,,"33238,20$",12,1,4,1,4,30.0,2,


In [12]:
# Title and strip values

to_title = ['marital_status', 'job_title', 'department', 'education_field']

df[to_title] = df[to_title].apply(lambda x: x.str.title().str.strip() if x.dtype == 'object' else x)

In [14]:
# Update 0/1 from Genger to M/F

gender_map = {0: "M", 1:"F"}
df['gender'] = df['gender'].map(gender_map)

In [15]:
# Replacing values obj to numeric in age

age_update = {
    'thirty-two': 32,
    'twenty-four': 24,
    'thirty': 30,
    'fifty-eight': 58,
    'fifty-two': 52,
    'twenty-six': 26,
    'fifty-five': 55,
    'thirty-seven': 37,
    'thirty-six': 36,
    'forty-seven': 47,
    'thirty-one': 31}

df['age'] = df['age'].replace(age_update)


In [16]:
convert_float = ['annual_salary','monthly_income','daily_rate']
def replace_currency(value):
        try:
                value = float(value.replace("$", "").replace(",", "."))
                return value
        except:
            return value
#df[convert_float] = df[convert_float].apply(replace_currency)




for col in convert_float:
     df[col]=df[col].apply(replace_currency)

In [18]:
df["marital_status"]=df["marital_status"].replace("Marreid", "Married")

#### DATA ANALYSYS

##### PERSONAL COLUMNS

In [None]:
data_analysis(df, columns_personal)

##### JOB COLUMNS

In [None]:
data_analysis(df, columns_job)

##### INCOME COLUMNS

In [None]:
data_analysis(df, columns_income)

##### EDUCATION COLUMNS

In [None]:
data_analysis(df,columns_education)

##### SATISFACTION COLUMNS

In [None]:
data_analysis(df,columns_satisfaction)

##### EMPLOYEMENT BACKGROUND COLUMNS

In [None]:
data_analysis(df, columns_emp_bgd)

In [None]:
df.head()