# Attrition 
**Why?** Predicting employee attrition (whether an employee will leave the company) is a common HR analytics problem.

**Use case:** It helps with workforce planning, retention strategies, and reducing turnover costs.

In [1]:
# import dataset
import pandas as pd
# Machine-specific absolute location of the raw HR export
# (the file name really does contain a space before ".csv").
data_path = r"D:\Human Resources\Human Resources .csv"

# Parse the CSV into the frame that every later cell builds on.
data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first three rows.
# NOTE(review): the rendered preview includes personal columns
# (names, e-mail, Password) - clear the output before sharing.
data.head(n=3)


Unnamed: 0,Emp ID,First Name,Middle Initial,Last Name,Gender,E Mail,Father's Name,Mother's Name,Mother's Maiden Name,Date of Birth,...,County,City,State,Zip,Region,User Name,Password,attrition,Job_title,Adjusted_Salary
0,857211,Hermila,J,Suhr,M,hermila.suhr@gmail.com,Todd Suhr,Cathrine Suhr,Hinojosa,9/4/1992,...,Clay,Peach Orchard,AR,72453,South,hjsuhr,oZ%{<6wN!A,0,Data Analyst,124736.1
1,514341,Antonio,Q,Joy,F,antonio.joy@yahoo.com,Clark Joy,Clarisa Joy,Gagliardi,12/24/1989,...,Screven,Rocky Ford,GA,30455,South,aqjoy,7_[%FE;saZ:B,1,Data Analyst,106128.17
2,314598,Sebastian,J,Moores,M,sebastian.moores@ibm.com,Everette Moores,Sol Moores,Paden,9/23/1980,...,Jefferson,Antwerp,NY,13608,Northeast,sjmoores,Z0:_dR*OQhQlF,0,AI Researcher,169345.52


In [2]:
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

# Data Cleaning
# Missing values per column. The result is printed explicitly: a bare
# expression is only rendered when it is the last statement of a cell, so
# without print() this check produced no visible output.
print(data.isnull().sum())

# Column dtypes and non-null counts (info() prints its own report).
data.info()


# Drop non-predictive columns
columns_to_drop = ['Emp ID', 'First Name', 'Middle Initial', 'Last Name',
                   'E Mail', 'Father\'s Name', "Mother's Name", "Mother's Maiden Name",
                   'SSN', 'Phone No. ', 'User Name', 'Password', 'Place Name',
                   'County', 'Date of Birth', 'Time of Birth', 'Date of Joining', 'Adjusted_Salary']

# errors='ignore' skips any listed column that is absent from the frame,
# so the cell keeps working if the export lacks some of them.
filtered_data = data.drop(columns=columns_to_drop, errors='ignore')


# Remove '%' and convert to float in 'Last % Hike'
filtered_data['Last % Hike'] = (
    filtered_data['Last % Hike']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)


# Define categorical and numerical features
# NOTE(review): 'City' and 'Zip' are presumably high-cardinality; one-hot
# encoding them may yield a very wide matrix - confirm this is intended.
# NOTE(review): 'Month Name of Joining', 'Short Month' and 'Month of Joining'
# look like three encodings of the same value - confirm and de-duplicate.
categorical_features = ['Gender', 'City', 'State', 'Zip', 'Region',
                        'Month Name of Joining', 'Short Month', 'Short DOW',
                        'Quarter of Joining', 'Half of Joining', 'DOW of Joining', 'Job_title']


numerical_features = ['Age in Yrs.', 'Weight in Kgs.', 'Age in Company (Years)',
                      'Last % Hike', 'Year of Joining', 'Month of Joining',
                      'Day of Joining']

# One-hot encode the categorical columns and standardise the numeric ones.
# Columns listed in neither group are dropped (ColumnTransformer's default
# remainder='drop'); handle_unknown='ignore' encodes categories unseen during
# fit as all zeros instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 38 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Emp ID                  500000 non-null  int64  
 1   First Name              500000 non-null  object 
 2   Middle Initial          500000 non-null  object 
 3   Last Name               500000 non-null  object 
 4   Gender                  500000 non-null  object 
 5   E Mail                  500000 non-null  object 
 6   Father's Name           500000 non-null  object 
 7   Mother's Name           500000 non-null  object 
 8   Mother's Maiden Name    500000 non-null  object 
 9   Date of Birth           500000 non-null  object 
 10  Time of Birth           500000 non-null  object 
 11  Age in Yrs.             500000 non-null  float64
 12  Weight in Kgs.          500000 non-null  int64  
 13  Date of Joining         500000 non-null  object 
 14  Quarter of Joining  

In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Encode the target column as integer class labels.
le = LabelEncoder()
filtered_data['attrition'] = le.fit_transform(filtered_data['attrition'])

# Separate the feature frame from the target vector.
X = filtered_data.drop('attrition', axis=1)
y = filtered_data['attrition']

# Train-Test Split
# stratify=y keeps the attrition / no-attrition ratio the same in both
# partitions, so the test metrics are not skewed by an unlucky random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Apply preprocessing
# The transformer is fitted on the training rows only; the test rows are
# transformed with those fitted statistics so nothing leaks from the test set.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 3-nearest-neighbour classifier. fit() returns the estimator itself, so
# construction and training are chained into a single statement.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train_processed, y_train)

# Predicted class labels for the held-out rows.
y_pred = model.predict(X_test_processed)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Tick labels for the heatmap, in the sorted class order confusion_matrix uses.
# NOTE(review): assumes class 0 means "no attrition" and 1 means "attrition" - confirm.
labels = ['No Attrition', 'Attrition']

# Per-class precision / recall / F1 as plain text.
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Rows are actual classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot it
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
plt.show()