# 🧠 IBM HR Analytics Project
**Goal:** Predict and understand employee attrition using visual storytelling, hypothesis testing, and machine learning.

**Dataset:** `WA_Fn-UseC_-HR-Employee-Attrition.csv`

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, chi2_contingency
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Apply the seaborn grid theme before any chart is drawn.
sns.set(style="whitegrid")

# Load the raw HR records; the CSV is read relative to the working directory.
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

# Numeric twin of the Yes/No target (Yes -> 1, No -> 0). Any value that is
# not a key of the mapping becomes NaN.
attrition_codes = {'Yes': 1, 'No': 0}
df['Attrition_n'] = df['Attrition'].map(attrition_codes)


### 📊 Attrition Count

In [None]:
# Bar chart of the number of rows in each Attrition class.
# Passing `palette` without `hue` is deprecated in seaborn 0.13+, so the x
# variable is also assigned to `hue`; the legend is suppressed because it
# would only repeat the x-axis tick labels.
sns.countplot(data=df, x='Attrition', hue='Attrition', palette='pastel', legend=False)
plt.title('Attrition Count')
plt.show()

### 📊 Attrition by Gender

In [None]:
# Grouped bar chart: one group per Gender value, one bar per Attrition value.
gender_ax = sns.countplot(x='Gender', hue='Attrition', palette='Set2', data=df)
gender_ax.set_title('Attrition by Gender')
plt.show()

### 📊 Monthly Income vs Attrition

In [None]:
# Box plot comparing the MonthlyIncome distribution of each Attrition class.
# Passing `palette` without `hue` is deprecated in seaborn 0.13+, so the x
# variable is also assigned to `hue`; the legend is suppressed because it
# would only repeat the x-axis tick labels.
sns.boxplot(data=df, x='Attrition', y='MonthlyIncome', hue='Attrition', palette='coolwarm', legend=False)
plt.title('Monthly Income vs Attrition')
plt.show()

### 📊 Attrition by Job Role

In [None]:
# Horizontal grouped bars: JobRole on the y-axis, one bar per Attrition value.
job_role_ax = sns.countplot(y='JobRole', hue='Attrition', palette='Set3', data=df)
job_role_ax.set_title('Attrition by Job Role')
plt.show()

### 📊 Years at Company KDE by Attrition

In [None]:
# Density of YearsAtCompany, one curve per Attrition class.
# - hue uses the Yes/No column so the legend shows readable labels rather than 0/1.
# - common_norm=False normalises each class separately; with the default
#   (shared normalisation) each curve is scaled by its group size, which makes
#   the shapes of the two distributions hard to compare.
# - cut=0 stops each curve at the observed data range instead of smoothing
#   past it (e.g. into negative tenure).
sns.kdeplot(
    data=df,
    x='YearsAtCompany',
    hue='Attrition',
    fill=True,
    palette='crest',
    common_norm=False,
    cut=0,
)
plt.title('Years at Company KDE')
plt.show()

### 📊 Correlation Heatmap

In [None]:
# Pairwise Pearson correlations between the numeric columns.
numeric_df = df.select_dtypes(include='number')
# A column with a single unique value has zero variance, so its correlations
# are NaN and would render as blank rows/columns; leave such columns out.
numeric_df = numeric_df.loc[:, numeric_df.nunique() > 1]

# The matrix is annotated cell by cell, so it needs a large canvas and a small
# annotation font to stay legible.
plt.figure(figsize=(16, 12))
sns.heatmap(
    numeric_df.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,  # anchor the diverging colormap to the full correlation range
    vmax=1,   # so that 0 sits at the neutral midpoint colour
    annot_kws={'size': 7},
)
plt.title('Correlation Heatmap')
plt.show()

### 🧪 Hypothesis Testing

In [None]:
# T-test: MonthlyIncome
# Compare MonthlyIncome between rows with Attrition == 'Yes' and == 'No'.
# equal_var=False selects Welch's t-test, which does not assume the two groups
# share a variance.
left = df[df['Attrition'] == 'Yes']['MonthlyIncome']
stayed = df[df['Attrition'] == 'No']['MonthlyIncome']
t_stat, p_val = ttest_ind(left, stayed, equal_var=False)
# p-values are printed with significant digits (.3g): a fixed '.4f' shows any
# value below 0.00005 as a misleading "0.0000".
print(f"T-test: MonthlyIncome\nT-statistic = {t_stat:.2f}, p-value = {p_val:.3g}")
print("→ Statistically significant difference." if p_val < 0.05 else "→ No significant difference.")

# Chi-square: OverTime vs Attrition
# Test of independence on the OverTime x Attrition table of counts. Only the
# statistic and p-value are used; the remaining return values are discarded.
contingency_table = pd.crosstab(df['OverTime'], df['Attrition'])
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"Chi-square test: OverTime vs Attrition\nChi2 = {chi2:.2f}, p-value = {p:.3g}")
print("→ Significant association." if p < 0.05 else "→ No significant association.")


### 🤖 Machine Learning Model - Random Forest

In [None]:
# Encode categorical variables
# Every object-dtype column is replaced by integer codes. The same encoder is
# refit for each column, so after the loop `le` only holds the classes of the
# last column processed.
df_enc = df.copy()
le = LabelEncoder()
for col in df_enc.select_dtypes(include='object').columns:
    df_enc[col] = le.fit_transform(df_enc[col])

# Features: everything except the target (in both encodings) and four columns
# excluded by name — presumably identifier / constant columns; verify against
# the dataset.
X = df_enc.drop(['Attrition', 'Attrition_n', 'EmployeeNumber', 'Over18', 'StandardHours', 'EmployeeCount'], axis=1)
y = df_enc['Attrition_n']

# Split and scale
# Split BEFORE scaling and stratify on the target so the train and test sets
# keep the same class ratio.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, so no statistics from the test
# rows leak into preprocessing; the test rows are transformed with the
# training mean and standard deviation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full feature matrix in the same scaled space; kept because the export step
# later in the notebook reads `X_scaled`.
X_scaled = scaler.transform(X)

# Model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Probability of the positive class (label 1). ROC-AUC is a ranking metric and
# needs continuous scores; hard 0/1 predictions reduce the curve to one point.
y_proba = model.predict_proba(X_test)[:, 1]

# Evaluation
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))


### 📤 Export for Power BI

In [None]:
# Export processed data
# Rebuild a labelled table from the scaled feature matrix (column names taken
# from X), append the 0/1 target as 'Attrition', and write it to CSV without
# the row index.
processed_df = pd.DataFrame(X_scaled, columns=X.columns).assign(Attrition=y.values)
processed_df.to_csv(path_or_buf="processed_hr_attrition.csv", index=False)
print("✔️ Exported: processed_hr_attrition.csv")


### 📌 Key Insights & Summary
- OverTime and Attrition are significantly associated (chi-square test significant); note that significance alone does not measure how strong the relationship is.
- Employees who left had significantly lower MonthlyIncome (T-test significant).
- Most attrition occurs in early years (0–3 YearsAtCompany).
- Random Forest shows strong classification power for predicting attrition.
- The processed data is exported to `processed_hr_attrition.csv` for advanced dashboard visualization in Power BI.