# ***DATA CLEANING***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a blanket "ignore" filter: every warning category is silenced from
# this point on.
warnings.filterwarnings(action='ignore')

In [None]:
# Load the HR employee-attrition CSV into the working DataFrame.
df = pd.read_csv(
    '/content/sample_data/WA_Fn-UseC_-HR-Employee-Attrition.csv'
)

In [None]:
# BASIC UNDERSTANDING OF DATA
# Preview the first five rows.
df.head(n=5)

In [None]:
# Preview the last five rows.
df.tail(n=5)

In [None]:
# Five randomly chosen rows; the fixed seed makes the draw reproducible.
df.sample(n=5, random_state=42)

In [None]:
# STRUCTURE
# A notebook cell only auto-displays the value of its final expression, so the
# shape has to be printed explicitly or it is computed and thrown away.
print(df.shape)
# info() writes its own report (dtypes, non-null counts, memory usage).
df.info()
# Last expression of the cell: displayed automatically.
df.columns.tolist()


In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Partition the column names by dtype: 64-bit integer/float columns on one
# side, object/category columns on the other.
num_cols = df.select_dtypes(
    include=['int64', 'float64'],
).columns
cat_cols = df.select_dtypes(
    include=['object', 'category'],
).columns

In [None]:
# CHECKING MISSING VALUES
# Only the final expression of a cell is displayed automatically, so the two
# summaries are printed explicitly.

# Number of missing values per column.
print(df.isna().sum())
# Percentage of missing values per column, worst column first.
print((df.isna().sum() / len(df) * 100).sort_values(ascending=False))
# Rows that contain at least one missing value.
df[df.isna().any(axis=1)]


In [None]:
# DUPLICATES CHECKING

# Count of fully duplicated rows; printed because a bare expression that is
# not the last one in the cell is never shown.
print(df.duplicated().sum())
# The duplicated rows themselves (every occurrence after the first).
df[df.duplicated()]



In [None]:
# STATS
# Intermediate results are printed explicitly: a notebook cell only displays
# the value of its final expression.

# Basic statistics
print(df.describe())

# Same statistics restricted to num_cols, transposed so each column is a row
print(df[num_cols].describe().T)

# Median age (last expression of the cell: displayed automatically)
df['Age'].median()


In [None]:
# Intermediate results are printed explicitly: a notebook cell only displays
# the value of its final expression.

# Number of distinct values in each categorical column
print(df[cat_cols].nunique())

# Distinct values of specific columns
print(df['Attrition'].unique())
print(df['JobRole'].unique())

# Value counts: absolute counts of each Attrition value...
print(df['Attrition'].value_counts())
# ...and the same as percentages of all rows (displayed as the cell output).
df['Attrition'].value_counts(normalize=True) * 100


In [None]:
# Write the DataFrame to CSV in the current working directory, omitting the
# index column.
# NOTE(review): none of the visible cells above modify df, so this file appears
# to hold the same data as the raw input - confirm whether cleaning steps are
# missing.
df.to_csv(
    'cleaned_HR_EXMPLOYEE_ATTRITION.csv',
    index=False,
)

# ***EXPLORATORY DATA ANALYSIS***

In [None]:
# Reload the CSV written at the end of the data-cleaning section. The file was
# saved with a relative path, so it is read back with the same relative path;
# a hard-coded '/content/...' prefix only resolves when the working directory
# happens to be /content.
df = pd.read_csv('cleaned_HR_EXMPLOYEE_ATTRITION.csv')
df.head()

In [None]:
# Summary statistics for all columns (numeric and non-numeric alike),
# transposed so that each DataFrame column becomes one row of the table.
(
    df
    .describe(include="all")
    .T
)


In [None]:
# Every column with a numeric dtype.
num_cols = df.select_dtypes(include=[np.number]).columns

# One 30-bin histogram per numeric column, all drawn on a single figure.
df[num_cols].hist(
    bins=30,
    edgecolor='black',
    figsize=(15, 12),
)
plt.suptitle("Numerical Feature Distributions", size=18)
plt.show()


In [None]:
cat_cols = df.select_dtypes(include=["object", "category"]).columns

# Lay the count plots out on a grid three panels wide. Ceiling division gives
# exactly enough rows, with no empty trailing row when the number of
# categorical columns is a multiple of three.
panels_per_row = 3
n_rows = (len(cat_cols) + panels_per_row - 1) // panels_per_row

plt.figure(figsize=(15, 12))
for i, col in enumerate(cat_cols, 1):
    # Subplot indices are 1-based, hence enumerate(..., 1).
    plt.subplot(n_rows, panels_per_row, i)
    sns.countplot(data=df, x=col)
    plt.title(col)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
def plot_numeric_vs_attrition(df, col):
    """Show a box plot of one column grouped by the ``Attrition`` column.

    Args:
        df: DataFrame containing ``col`` and an ``Attrition`` column.
        col: Name of the column plotted on the y-axis.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x="Attrition", y=col, data=df, ax=ax)
    ax.set_title(f"{col} vs Attrition")
    ax.set_xlabel("Attrition")
    ax.set_ylabel(col)
    fig.tight_layout()
    plt.show()


# One figure per numeric column.
for col in num_cols:
    plot_numeric_vs_attrition(df, col)


In [None]:
def plot_categorical_vs_attrition(df, col):
    """Show a count plot of one column, with bars split by ``Attrition``.

    Every non-empty bar is labelled with its count.

    Args:
        df: DataFrame containing ``col`` and an ``Attrition`` column.
        col: Name of the column plotted on the x-axis.
    """
    plt.figure(figsize=(8, 4))
    # countplot returns the Axes it drew on; keep it instead of calling
    # plt.gca() again for every bar.
    ax = sns.countplot(data=df, x=col, hue="Attrition")
    plt.title(f"{col} vs Attrition")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.xticks(rotation=45)

    for bar in ax.patches:
        height = bar.get_height()
        # Skip rectangles with NaN or zero height: they are empty
        # category/hue combinations (and, in some seaborn versions, zero-size
        # legend proxy patches) and would otherwise get a stray "nan"/"0"
        # label. `not height > 0` is also True for NaN.
        if not height > 0:
            continue
        # Heights are whole-number counts stored as floats; label them as
        # integers ("237", not "237.0").
        ax.annotate(f'{int(height)}',
                    (bar.get_x() + bar.get_width() / 2., height),
                    ha='center', va='bottom', fontsize=9)
    plt.tight_layout()
    plt.show()


# One figure per categorical column.
for col in cat_cols:
    plot_categorical_vs_attrition(df, col)


In [None]:
# NOTE(review): this cell redefines a function of the same name that already
# appears earlier in the notebook and re-draws the same box plots - confirm
# whether the repetition is intentional.
def plot_numeric_vs_attrition(df, col):
    """Show a box plot of one column grouped by the ``Attrition`` column.

    Args:
        df: DataFrame containing ``col`` and an ``Attrition`` column.
        col: Name of the column plotted on the y-axis.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(y=col, x="Attrition", data=df, ax=ax)
    ax.set_ylabel(col)
    ax.set_xlabel("Attrition")
    ax.set_title(f"{col} vs Attrition")
    fig.tight_layout()
    plt.show()


# One figure per numeric column.
for col in num_cols:
    plot_numeric_vs_attrition(df, col)


In [None]:
# Pairwise correlation matrix of the int64/float64 columns.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
corr = df[numeric_cols].corr()

# Draw the matrix as an un-annotated heatmap.
plt.figure(figsize=(14, 10))
sns.heatmap(corr, annot=False, cmap="coolwarm")
plt.title("Correlation Heatmap of Numerical Features", fontsize=16)
plt.show()


In [None]:
# Figure 1: number of rows per job role, bars split by gender.
plt.figure(figsize=(14, 6))
sns.countplot(x="JobRole", hue="Gender", palette="Set2", data=df)
plt.xticks(rotation=45)
plt.title("Job Role Distribution by Gender", fontsize=15)
plt.show()

# Figure 2: number of rows per job role, bars split by attrition.
plt.figure(figsize=(14, 6))
sns.countplot(x="JobRole", hue="Attrition", palette="Set1", data=df)
plt.xticks(rotation=45)
plt.title("Attrition Count by JobRole", fontsize=15)
plt.show()


In [None]:
# Contingency table: departments as rows, attrition values as columns.
dept_attrition = pd.crosstab(index=df['Department'], columns=df['Attrition'])

# Stacked bar chart: one bar per department, one segment per attrition value.
dept_attrition.plot(
    kind='bar',
    stacked=True,
    colormap='Set3',
    figsize=(10, 6),
)
plt.xlabel("Department")
plt.ylabel("Count")
plt.title("Attrition Distribution by Department", fontsize=15)
plt.show()


In [None]:
# Box plot of MonthlyIncome for each job role, with separate boxes per
# attrition value.
plt.figure(figsize=(16, 7))
sns.boxplot(
    x="JobRole",
    y="MonthlyIncome",
    hue="Attrition",
    data=df,
)
plt.xticks(rotation=45)
plt.title("Monthly Income Distribution by JobRole and Attrition", fontsize=15)
plt.show()


In [None]:
# Row-normalised contingency table: within each job role, the share of each
# attrition value, scaled to a percentage.
pivot = 100 * pd.crosstab(df['JobRole'], df['Attrition'], normalize='index')

# Annotated heatmap of the percentages, one decimal place per cell.
plt.figure(figsize=(12, 6))
sns.heatmap(pivot, cmap="YlOrRd", annot=True, fmt=".1f")
plt.ylabel("Job Role")
plt.title("Attrition Rate (%) by Job Role", fontsize=16)
plt.show()
