In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Read the attrition CSV into a DataFrame and show its first rows as a
# quick sanity check of the columns that were loaded.
data = pd.read_csv(filepath_or_buffer="Attrition Data.csv")
preview = data.head()
print(preview)

# Encode categorical columns as integer codes.
#
# Each column gets its OWN fitted LabelEncoder, stored in `encoders`.
# Refitting a single shared encoder per column would discard the mapping of
# every column except the last one, making it impossible to decode the
# integer codes (encoders[col].inverse_transform / .classes_) or to apply the
# same mapping to new data later.
categorical_cols = ["Attrition", "Department", "EducationField", "MaritalStatus"]

encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

# Backward compatibility: `le` previously ended up fitted on the last column
# of `categorical_cols`; keep that binding for any code that still uses it.
le = encoders[categorical_cols[-1]]

print("After Encoding:\n", data.head())

# Columns used as model inputs (independent variables).
feature_columns = [
    "Age",
    "DistanceFromHome",
    "Education",
    "EnvironmentSatisfaction",
    "JobSatisfaction",
    "MonthlyIncome",
    "NumCompaniesWorked",
    "WorkLifeBalance",
    "YearsAtCompany",
    "Department",
    "EducationField",
    "MaritalStatus",
]
# Column to predict (dependent variable → whether an employee left).
target_column = "Attrition"

X = data[feature_columns]
y = data[target_column]

# Train-test split: 80% train / 20% test, reproducible via the fixed seed.
# `stratify=y` keeps the proportion of each Attrition class the same in both
# subsets; without it a random split can leave the test set with a different
# class balance than the training set, which distorts the reported metrics.
# NOTE(review): stratification requires at least two rows per class in `y` —
# expected for attrition data, but confirm against the actual CSV.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 1. Decision Tree Classifier
# Fit an entropy-criterion tree on the training split (seeded so the result is
# reproducible), then score its predictions on the held-out test split.
dt = DecisionTreeClassifier(criterion="entropy", random_state=42).fit(
    X_train, y_train
)
y_dt_pred = dt.predict(X_test)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_dt_pred)
print("Decision Tree Accuracy:", accuracy_dt)

# 2. Linear Regression
# Included only as a point of comparison: the target is an encoded category,
# so the model's continuous predictions are scored with regression metrics
# (MSE and R2) rather than accuracy.
lrr = LinearRegression().fit(X_train, y_train)
y_lrr_pred = lrr.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=y_lrr_pred)
print("Linear Regression MSE:", mse)

r2 = r2_score(y_true=y_test, y_pred=y_lrr_pred)
print("Linear Regression R2:", r2)