In [3]:
import pandas as pd

# Load the raw HR dataset from CSV into a DataFrame.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path —
# confirm the file location when running elsewhere.
file_path = '/content/HR Data.csv'
data = pd.read_csv(file_path)

# Print the column summary, then show the first rows as the cell's result.
# info() prints to stdout and returns None, so it must not be part of the
# displayed expression: the previous `data.head(), data.info()` tuple was
# rendered as a plain-text "(<head>, None)" instead of a rich table.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

(   Age Attrition     BusinessTravel  DailyRate              Department  \
 0   41       Yes      Travel_Rarely       1102                   Sales   
 1   49        No  Travel_Frequently        279  Research & Development   
 2   37       Yes      Travel_Rarely       1373  Research & Development   
 3   33        No  Travel_Frequently       1392  Research & Development   
 4   27        No      Travel_Rarely        591  Research & Development   
 
    DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
 0                 1          2  Life Sciences              1               1   
 1                 8          1  Life Sciences              1               2   
 2                 2          2          Other              1               4   
 3                 3          4  Life Sciences              1               5   
 4                 2          1        Medical              1               7   
 
    ...  RelationshipSatisfaction StandardHours  StockOption

In [4]:
# Drop three columns from the raw frame; the result is a new frame, so
# `data` itself is left untouched.
columns_to_remove = ['EmployeeCount', 'Over18', 'StandardHours']
data_cleaned = data.drop(columns_to_remove, axis=1)

# Show which columns remain after the drop
list(data_cleaned.columns)


['Age',
 'Attrition',
 'BusinessTravel',
 'DailyRate',
 'Department',
 'DistanceFromHome',
 'Education',
 'EducationField',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'Gender',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'OverTime',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [5]:
# Rename columns to shorter snake-style names.
# Only columns whose name actually changes are listed; any column that is not
# a key of this mapping (Age, Attrition, Department, Education, Gender) keeps
# its current name.
new_column_names = {
    'BusinessTravel': 'Travel',
    'DailyRate': 'Daily_Rate',
    'DistanceFromHome': 'Distance_Home',
    'EducationField': 'Field_of_Education',
    'EmployeeNumber': 'Emp_ID',
    'EnvironmentSatisfaction': 'Env_Satisfaction',
    'HourlyRate': 'Hourly_Rate',
    'JobInvolvement': 'Job_Involvement',
    'JobLevel': 'Job_Level',
    'JobRole': 'Job_Role',
    'JobSatisfaction': 'Job_Satisfaction',
    'MaritalStatus': 'Marital_Status',
    'MonthlyIncome': 'Monthly_Income',
    'MonthlyRate': 'Monthly_Rate',
    'NumCompaniesWorked': 'Num_Companies_Worked',
    'OverTime': 'Overtime',
    'PercentSalaryHike': 'Salary_Hike_Percent',
    'PerformanceRating': 'Performance_Rating',
    'RelationshipSatisfaction': 'Rel_Satisfaction',
    'StockOptionLevel': 'Stock_Options',
    'TotalWorkingYears': 'Total_Work_Years',
    'TrainingTimesLastYear': 'Training_Last_Year',
    'WorkLifeBalance': 'Work_Life_Balance',
    'YearsAtCompany': 'Years_At_Company',
    'YearsInCurrentRole': 'Years_Current_Role',
    'YearsSinceLastPromotion': 'Years_Since_Promotion',
    'YearsWithCurrManager': 'Years_With_Manager',
}

# Rebind instead of using inplace=True: the cell then produces its result
# from its input explicitly rather than mutating a frame shared across cells.
data_cleaned = data_cleaned.rename(columns=new_column_names)

# Display the columns after renaming
data_cleaned.columns.tolist()


['Age',
 'Attrition',
 'Travel',
 'Daily_Rate',
 'Department',
 'Distance_Home',
 'Education',
 'Field_of_Education',
 'Emp_ID',
 'Env_Satisfaction',
 'Gender',
 'Hourly_Rate',
 'Job_Involvement',
 'Job_Level',
 'Job_Role',
 'Job_Satisfaction',
 'Marital_Status',
 'Monthly_Income',
 'Monthly_Rate',
 'Num_Companies_Worked',
 'Overtime',
 'Salary_Hike_Percent',
 'Performance_Rating',
 'Rel_Satisfaction',
 'Stock_Options',
 'Total_Work_Years',
 'Training_Last_Year',
 'Work_Life_Balance',
 'Years_At_Company',
 'Years_Current_Role',
 'Years_Since_Promotion',
 'Years_With_Manager']

In [6]:
# Count rows that are exact repeats of an earlier row (all columns compared)
duplicates = data_cleaned.duplicated().sum()

# Keep only the first occurrence of each row
data_cleaned = data_cleaned.drop_duplicates()

# Report how many duplicates were found and the resulting frame shape
print(f"{duplicates} {data_cleaned.shape}")


0 (1470, 32)


In [7]:
# Text-valued columns whose values are normalised to lowercase.
categorical_columns = [
    'Attrition',
    'Travel',
    'Department',
    'Field_of_Education',
    'Gender',
    'Job_Role',
    'Marital_Status',
    'Overtime',
]

# Lowercase every listed column in one pass via the vectorised .str accessor
data_cleaned[categorical_columns] = data_cleaned[categorical_columns].apply(
    lambda series: series.str.lower()
)


In [8]:
# Count missing values per column and print the resulting Series
nan_values = data_cleaned.isnull().sum()
print(nan_values)


Age                      0
Attrition                0
Travel                   0
Daily_Rate               0
Department               0
Distance_Home            0
Education                0
Field_of_Education       0
Emp_ID                   0
Env_Satisfaction         0
Gender                   0
Hourly_Rate              0
Job_Involvement          0
Job_Level                0
Job_Role                 0
Job_Satisfaction         0
Marital_Status           0
Monthly_Income           0
Monthly_Rate             0
Num_Companies_Worked     0
Overtime                 0
Salary_Hike_Percent      0
Performance_Rating       0
Rel_Satisfaction         0
Stock_Options            0
Total_Work_Years         0
Training_Last_Year       0
Work_Life_Balance        0
Years_At_Company         0
Years_Current_Role       0
Years_Since_Promotion    0
Years_With_Manager       0
dtype: int64


In [9]:
# Write the cleaned frame to CSV in the working directory, omitting the
# row index from the file.
data_cleaned.to_csv(path_or_buf='HR Data_Cleaned.csv', index=False)


In [10]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes.
# One fitted LabelEncoder per column is kept in `label_encoders`, keyed by
# column name, so the codes can be mapped back to the original labels later.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, encoder in label_encoders.items():
    data_cleaned[col] = encoder.fit_transform(data_cleaned[col])


In [11]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardised with StandardScaler
numeric_columns = [
    'Age',
    'Daily_Rate',
    'Distance_Home',
    'Hourly_Rate',
    'Monthly_Income',
    'Monthly_Rate',
    'Num_Companies_Worked',
    'Salary_Hike_Percent',
    'Total_Work_Years',
    'Training_Last_Year',
    'Years_At_Company',
    'Years_Current_Role',
    'Years_Since_Promotion',
    'Years_With_Manager',
]

# Fit the scaler on the listed columns, then overwrite them with the
# scaled values; `scaler` stays available for later reuse.
scaler = StandardScaler()
scaler.fit(data_cleaned[numeric_columns])
data_cleaned[numeric_columns] = scaler.transform(data_cleaned[numeric_columns])


In [12]:
# Z-score based outlier removal on the numeric columns
from scipy.stats import zscore

# Compute per-column z-scores of the numeric columns
z_scores = data_cleaned[numeric_columns].apply(zscore)

# Rows with any |z| at or above this value are treated as outliers
threshold = 3

# Keep only rows whose numeric values are all within the threshold.
# The absolute value is required: comparing the signed z-score against the
# threshold only catches unusually HIGH values and silently keeps rows that
# are unusually LOW (z <= -3).
data_cleaned = data_cleaned[(z_scores.abs() < threshold).all(axis=1)]


In [13]:
from imblearn.over_sampling import SMOTE, SMOTENC

# Oversample the minority class of the target so both classes are balanced.
# 'Attrition' is the target variable; every other column is used as a feature.
# NOTE(review): Emp_ID looks like an identifier rather than a predictive
# feature, yet it is still part of X here — confirm whether it should be
# dropped before resampling/modelling.
X = data_cleaned.drop('Attrition', axis=1)
y = data_cleaned['Attrition']

# Positions (within X) of the label-encoded categorical features. Their
# integer codes are arbitrary labels, so plain SMOTE's interpolation between
# neighbours would invent fractional, meaningless categories. SMOTENC treats
# these columns as categorical and assigns synthetic rows an existing category.
categorical_feature_idx = [
    X.columns.get_loc(col) for col in categorical_columns if col != 'Attrition'
]

# Fixed seed so a fresh Restart & Run All reproduces the same synthetic rows.
smote = SMOTENC(categorical_features=categorical_feature_idx, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine the resampled features and target back into a single dataframe
data_resampled = pd.DataFrame(X_resampled, columns=X.columns)
data_resampled['Attrition'] = y_resampled
