In [1]:
import pandas as pd
import numpy as np
from profiler import profiler

# CREATE DATAFRAME - load the raw workbook
df = pd.read_excel("Employee Turnover Dataset.xlsx")

# CONVERT NoneType TO NaN AND REPLACE BLANKS WITH NaN
# Done BEFORE de-duplication: rows that differ only in how a missing cell is
# encoded (None, '' or whitespace-only text vs. NaN) compare equal only once
# every missing marker has been normalised to NaN.
df = df.map(lambda x: np.nan if x is None or (isinstance(x, str) and x.strip() == '') else x)

# REMOVE DUPLICATE ROWS - keeps the first occurrence; the original index labels
# are preserved, so gaps in the index mark the dropped rows
df = df.drop_duplicates()

In [2]:
# AGE COLUMN - Blank out ages outside the accepted range (16 to 90 inclusive)
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
age_out_of_range = (df['Age'] > 90) | (df['Age'] < 16)
df.loc[age_out_of_range, 'Age'] = np.nan

In [3]:
# BUSINESSTRAVEL COLUMN - Only three labels are accepted; anything else
# (including values that are already missing) is stored as NaN
accepted_travel_labels = ['Non-Travel', 'Travel_Frequently', 'Travel_Rarely']
travel_label_ok = df['BusinessTravel'].isin(accepted_travel_labels)
df.loc[~travel_label_ok, 'BusinessTravel'] = np.nan

In [4]:
# DISTANCEFROMHOME COLUMN - Values above 90 are treated as abnormal and set to NaN
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
distance_too_large = df['DistanceFromHome'] > 90
df.loc[distance_too_large, 'DistanceFromHome'] = np.nan

In [5]:
## EMPLOYEECOUNT COLUMN - Should always be 1
def _as_single_employee(count):
    """Return the value unchanged when it already equals 1, otherwise 1."""
    return count if count == 1 else 1


# Element-wise pass: any entry that is not 1 (missing values included) becomes 1
df['EmployeeCount'] = df['EmployeeCount'].map(_as_single_employee)

In [6]:
## MONTHLYINCOME - Negative amounts are replaced by their magnitude
# (the sign is dropped; missing values stay missing)
monthly_income = df['MonthlyIncome']
df['MonthlyIncome'] = monthly_income.abs()

In [7]:
# MONTHLYRATE COLUMN - Values above 5,500,100 are treated as abnormal and set to NaN
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
monthly_rate_too_large = df['MonthlyRate'] > 5500100
df.loc[monthly_rate_too_large, 'MonthlyRate'] = np.nan

In [8]:
## TOTALWORKINGYEARS - A negative number of years is impossible; set it to NaN
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
negative_working_years = df['TotalWorkingYears'] < 0
df.loc[negative_working_years, 'TotalWorkingYears'] = np.nan

In [9]:
## TRAININGTIMESLASTYEAR - Missing entries are recorded as 0
# Per the original note, the column has many NaN values but no zero values.
training_is_missing = df['TrainingTimesLastYear'].isna()
df.loc[training_is_missing, 'TrainingTimesLastYear'] = 0

In [10]:
## YEARSWITHCURRENTMANAGER - Convert outlying data to NaN

def _is_whole_number(value):
    """Return True for integers and for floats holding a whole value (e.g. 5.0).

    A plain ``isinstance(value, int)`` check rejects every element of a
    float64 column, because pandas hands those elements over as floats; that
    would blank the whole column as soon as it contains a single NaN.
    Text, NaN and infinities are rejected.
    """
    if isinstance(value, (int, np.integer)):
        return True
    return isinstance(value, (float, np.floating)) and float(value).is_integer()


# Replace non-integer values (text, fractional numbers) with NaN.
# astype(bool) keeps the mask invertible even when the column is empty.
years_is_whole = df['YearsWithCurrManager'].apply(_is_whole_number).astype(bool)
df.loc[~years_is_whole, 'YearsWithCurrManager'] = np.nan

# Replace values less than 0 or greater than 100 with NaN
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
years_out_of_range = (df['YearsWithCurrManager'] < 0) | (df['YearsWithCurrManager'] > 100)
df.loc[years_out_of_range, 'YearsWithCurrManager'] = np.nan

In [11]:
## CUSTOM PROFILER SCRIPT
# Runs the project's profiler helper on a single column of the cleaned frame.
# Judging by the cell output it reports value types, missing/blank counts,
# unique values and a describe() summary for that column.
column_to_profile = 'Gender'
profiler(df, column_to_profile)

# Optional export of the cleaned data (left disabled):
#df.to_excel('cleaned_data.xlsx', index=False)

Values in 'Gender' have mixed types: str, float.

Number of NaN or Null values in 'Gender': 3
Number of blank space values in 'Gender': 0

Unique values in 'Gender' (sorted):
[nan, 'Female', 'Male']

Are there null values in 'Gender'? Yes

Describe output for 'Gender':
       Gender
count   10021
unique      2
top      Male
freq     5028


Unnamed: 0,Gender
0,Female
1,Male
2,Female
3,Female
4,Female
...,...
10316,Male
10317,Male
10318,Female
10320,Male
