In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn grid style plus figure and font sizes.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
})

# Read the CSV into `df`; the bare name on the last line renders it as the cell output.
df = pd.read_csv('content/dataset.csv')
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from matplotlib.colors import LinearSegmentedColormap
from math import pi

# ==========================================
# 0. Setup Theme & Data Generation
# ==========================================

def apply_professional_theme():
    """
    Reset matplotlib to its default style, then apply a white/black/red look.

    Side effect: (re)binds the module-level colour names C_BG, C_HIGHLIGHT
    and C_BASE, which code executed after this call reads.
    """
    global C_BG, C_HIGHLIGHT, C_BASE

    plt.style.use('default')

    # High-contrast palette: white background, red for attrition/risk,
    # black for the baseline/retention series.
    C_BG, C_HIGHLIGHT, C_BASE = "#ffffff", "#D90429", "#000000"

    # Apply all rcParams overrides in a single update.
    plt.rcParams.update({
        'figure.facecolor': C_BG,
        'axes.facecolor': C_BG,
        'axes.edgecolor': C_BASE,
        'grid.color': C_BASE,
        'grid.alpha': 0.1,
        'text.color': C_BASE,
        'axes.labelcolor': C_BASE,
        'xtick.color': C_BASE,
        'ytick.color': C_BASE,
        'font.family': 'sans-serif',
    })

    sns.set_context("paper", font_scale=1.2)



# The theme call must come first: it defines the C_* colour globals used below.
apply_professional_theme()

df = pd.read_csv("content/dataset.csv")

# LabelEncoder numbers classes in sorted order, so 'No' -> 0 and 'Yes' -> 1.
le = LabelEncoder()
df['Attrition_Num'] = le.fit(df['Attrition']).transform(df['Attrition'])

# Colour per Attrition value, shared by the hue-based plots.
attrition_palette = dict(Yes=C_HIGHLIGHT, No=C_BASE)

# ==========================================
# PLOTTING FUNCTIONS (With Save Logic)
# ==========================================

def save_plot(filename, out_dir="images_old"):
    """
    Save the current matplotlib figure as a 300-dpi image and close it.

    Parameters
    ----------
    filename : str
        File name (e.g. ``'pca_biplot.png'``) written inside ``out_dir``.
    out_dir : str, optional
        Target directory, ``"images_old"`` by default. It is created when
        missing, because ``plt.savefig`` does not create directories and
        would otherwise raise ``FileNotFoundError`` on a fresh checkout.
    """
    import os  # function-scope import keeps the helper self-contained

    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, filename), dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved: {filename}")

def plot_feature_importance():
    """
    Plot the ten largest random-forest feature importances for attrition.

    Reads the module-level ``df``, fits a ``RandomForestClassifier`` on its
    numeric columns against ``Attrition_Num`` and saves the bar chart as
    ``feature_importance.png`` through ``save_plot``.
    """
    # Columns that must not be treated as predictors:
    #   - 'Attrition_Num' is the target itself;
    #   - 'Unnamed: 0' (the index column stored in the CSV) and 'EmployeeNumber'
    #     are per-row identifiers; impurity-based importances favour such
    #     high-cardinality columns and would rank them as "drivers";
    #   - 'EmployeeCount' and 'StandardHours' look constant in the data preview
    #     (1 and 80), so they carry no information.
    # errors='ignore' keeps this working if any of them is absent.
    non_features = ['Attrition_Num', 'Unnamed: 0', 'EmployeeNumber',
                    'EmployeeCount', 'StandardHours']
    df_num = df.select_dtypes(include=[np.number]).drop(non_features, axis=1, errors='ignore')
    X = df_num.fillna(0)
    y = df['Attrition_Num']

    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)

    importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf.feature_importances_
    }).sort_values('Importance', ascending=False).head(10)

    fig, ax = plt.subplots(figsize=(10, 6))
    # Newer seaborn deprecates `palette` without `hue`; mapping hue to the same
    # column gives the same per-bar colours. dodge=False keeps full-width bars.
    sns.barplot(data=importances, x='Importance', y='Feature',
                hue='Feature', palette='Reds_r', dodge=False, ax=ax)
    # The hue legend would only repeat the y tick labels, so drop it if drawn.
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    ax.set_title('Feature Importance (Random Forest Analysis)')
    ax.set_xlabel('Mean Decrease in Impurity')
    save_plot('feature_importance.png')

def plot_pca_biplot():
    """
    Draw a two-component PCA biplot of five numeric features, coloured by attrition.

    Reads the module-level ``df``, standardises the selected features, projects
    them onto two principal components, overlays one loading arrow per feature
    and saves the figure as ``pca_biplot.png`` through ``save_plot``.
    """
    features = ['Age', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'PercentSalaryHike']

    # Keep the Attrition label with its own row while dropping incomplete rows.
    # Attaching df['Attrition'] to a freshly indexed frame afterwards would
    # shift labels onto the wrong points as soon as dropna() removes a row.
    complete = df[features + ['Attrition']].dropna(subset=features)
    x = StandardScaler().fit_transform(complete[features])

    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(x)
    pca_df = pd.DataFrame(data=principalComponents, columns=['PC1', 'PC2'],
                          index=complete.index)
    pca_df['Attrition'] = complete['Attrition']

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Attrition',
                    palette=attrition_palette, alpha=0.7, s=60)

    # Rows of `coeff` are features, columns are the loadings on PC1/PC2.
    coeff = np.transpose(pca.components_[0:2, :])
    for i in range(coeff.shape[0]):
        # Arrows are stretched by 4 (labels by 4.5) so they are visible at the
        # scale of the scattered scores.
        plt.arrow(0, 0, coeff[i,0]*4, coeff[i,1]*4, color=C_BASE, alpha=0.5, width=0.02)
        plt.text(coeff[i,0]*4.5, coeff[i,1]*4.5, features[i], color=C_HIGHLIGHT, ha='center')

    plt.title('PCA Biplot: Risk Clusters vs. Stability')
    save_plot('pca_biplot.png')

def plot_survival_curve():
    """
    Plot product-limit retention curves for the whole company and per department.

    Uses the module-level ``df`` ('YearsAtCompany', 'Attrition', 'Department')
    and saves the figure as ``survival_curve.png`` through ``save_plot``.
    """
    def get_km(data):
        """Return (sorted distinct tenures, running retention probability per tenure)."""
        years = data['YearsAtCompany']
        departed = data['Attrition'] == 'Yes'
        times = np.sort(years.unique())

        probs = []
        survival = 1.0
        for t in times:
            # Everyone with tenure >= t is still at risk at time t.
            at_risk = int((years >= t).sum())
            if at_risk > 0:
                left = int(((years == t) & departed).sum())
                survival *= (1 - left / at_risk)
            probs.append(survival)
        return times, probs

    plt.figure(figsize=(10, 6))

    # Company-wide curve in the base colour.
    all_t, all_s = get_km(df)
    plt.step(all_t, all_s, where='post', label='Aggregate Retention', linewidth=3, color=C_BASE)

    # One dashed curve per department, cycling through three red shades.
    colors = [C_HIGHLIGHT, '#E01E37', '#A4161A']
    for idx, dept in enumerate(df['Department'].unique()):
        dept_t, dept_s = get_km(df[df['Department'] == dept])
        plt.step(dept_t, dept_s, where='post', label=dept, linestyle='--', color=colors[idx % 3])

    plt.title('Kaplan-Meier Retention Estimates')
    plt.xlabel('Years at Company')
    plt.ylabel('Probability of Retention')
    plt.legend()
    save_plot('survival_curve.png')

def plot_clustermap():
    """
    Draw an annotated, hierarchically clustered correlation heatmap.

    Correlates seven columns of the module-level ``df`` and saves the result
    as ``clustermap.png`` through ``save_plot``.
    """
    drivers = [
        'Age', 'MonthlyIncome', 'JobLevel', 'TotalWorkingYears',
        'YearsAtCompany', 'WorkLifeBalance', 'JobSatisfaction',
    ]
    corr_matrix = df[drivers].corr()

    grid = sns.clustermap(
        corr_matrix,
        annot=True,
        fmt=".2f",
        cmap='rocket_r',
        center=0,
        figsize=(10, 10),
        tree_kws={'colors': C_BASE},
    )
    # y > 1 places the title above the dendrogram.
    grid.fig.suptitle('Correlation Matrix of Key Drivers', y=1.02)
    save_plot('clustermap.png')

def plot_ridgeline():
    """
    Stack income density plots for the six job roles with the highest median income.

    One row per role, split by Attrition; only the first row carries the
    legend. Saved as ``ridgeline.png`` through ``save_plot``.
    """
    median_income = df.groupby('JobRole')['MonthlyIncome'].median()
    roles = median_income.sort_values(ascending=False).index[:6]
    fig, axes = plt.subplots(len(roles), 1, figsize=(8, 10), sharex=True)

    for row, (ax, role) in enumerate(zip(axes, roles)):
        role_rows = df[df['JobRole'] == role]
        sns.kdeplot(data=role_rows, x='MonthlyIncome', hue='Attrition',
                    fill=True, palette=attrition_palette, alpha=0.5,
                    ax=ax, legend=(row == 0))
        # The role name stands in for the y axis; hide ticks and the frame.
        ax.set_ylabel(role, rotation=0, ha='right', fontsize=9)
        for side in ('left', 'top', 'right'):
            ax.spines[side].set_visible(False)
        ax.set_yticks([])

    fig.suptitle('Income Distribution by Job Role', y=0.92)
    save_plot('ridgeline.png')

def plot_bubble_chart():
    """
    Scatter tenure against monthly income, with marker size from Age and colour from Attrition.

    Saved as ``bubble_chart.png`` through ``save_plot``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        data=df,
        x='YearsAtCompany',
        y='MonthlyIncome',
        size='Age',
        hue='Attrition',
        sizes=(20, 400),
        palette=attrition_palette,
        alpha=0.6,
        ax=ax,
    )
    ax.set_title('Multivariate Analysis: Tenure, Income, and Age')
    save_plot('bubble_chart.png')

def plot_feature_interaction():
    """
    Point plot of ``Attrition_Num`` per JobSatisfaction level, one line per OverTime value.

    Saved as ``interaction_plot.png`` through ``save_plot``.
    """
    overtime_palette = {'Yes': C_HIGHLIGHT, 'No': C_BASE}

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.pointplot(data=df, x='JobSatisfaction', y='Attrition_Num', hue='OverTime',
                  palette=overtime_palette, capsize=.1, ax=ax)
    ax.set_title('Interaction Effect: Overtime vs. Satisfaction')
    ax.set_ylabel('Attrition Probability')
    save_plot('interaction_plot.png')

def plot_boxen():
    """
    Boxen plot of MonthlyIncome per JobRole, split by Attrition.

    Saved as ``boxen_plot.png`` through ``save_plot``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxenplot(data=df, x='JobRole', y='MonthlyIncome',
                  hue='Attrition', palette=attrition_palette, ax=ax)
    # Slant the role names so they stay readable.
    plt.xticks(rotation=45, ha='right')
    ax.set_title('Compensation Variance Analysis')
    save_plot('boxen_plot.png')

def plot_radar():
    """
    Radar chart comparing mean survey scores of retained and departed employees.

    Averages four columns of the module-level ``df`` per Attrition value and
    saves the polar plot as ``radar_chart.png`` through ``save_plot``.
    """
    features = ['JobSatisfaction', 'EnvironmentSatisfaction', 'WorkLifeBalance', 'JobInvolvement']
    data = df.groupby('Attrition')[features].mean().reset_index()

    # One evenly spaced spoke per feature; the first angle is repeated so the
    # outline closes on itself.
    spoke_count = len(features)
    angles = [k / float(spoke_count) * 2 * pi for k in range(spoke_count)]
    angles.append(angles[0])

    def closed_profile(label):
        """Mean scores for one Attrition value, with the first value repeated at the end."""
        scores = data.loc[data['Attrition'] == label, features].to_numpy().ravel().tolist()
        return scores + scores[:1]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    plt.xticks(angles[:-1], features, color=C_BASE)

    # Stayed first, then left, so the drawing order matches the legend order.
    for attrition_value, legend_label, colour in (
        ('No', 'Retained', C_BASE),
        ('Yes', 'Departed', C_HIGHLIGHT),
    ):
        profile = closed_profile(attrition_value)
        ax.plot(angles, profile, linewidth=2, linestyle='solid', label=legend_label, color=colour)
        ax.fill(angles, profile, colour, alpha=0.1)

    plt.title('Psychometric Profile Comparison')
    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    save_plot('radar_chart.png')

def plot_dual_axis():
    """
    Plot headcount (bars) and attrition rate in percent (line) per age group.

    Side effect: adds or overwrites the 'AgeGroup' column on the module-level
    ``df``. The figure is saved as ``dual_axis.png`` through ``save_plot``.
    """
    # pd.cut builds right-closed intervals and leaves the lowest edge open by
    # default, so (18, 30] would silently drop employees aged exactly 18.
    # include_lowest=True closes that edge. The last bin is left open-ended so
    # it matches its '50+' label instead of stopping at 60.
    df['AgeGroup'] = pd.cut(df['Age'], bins=[18, 30, 40, 50, float('inf')],
                            labels=['18-30', '31-40', '41-50', '50+'],
                            include_lowest=True)
    agg = df.groupby('AgeGroup', observed=True).agg({'EmployeeNumber': 'count', 'Attrition_Num': 'mean'})

    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Left axis: number of employees in each age group.
    sns.barplot(x=agg.index, y=agg['EmployeeNumber'], ax=ax1, color=C_BASE, alpha=0.3, label='Headcount')
    ax1.set_ylabel('Headcount', color=C_BASE)

    # Right axis: mean of Attrition_Num scaled to a percentage.
    ax2 = ax1.twinx()
    sns.lineplot(x=agg.index, y=agg['Attrition_Num']*100, ax=ax2, color=C_HIGHLIGHT, marker='o', lw=3, label='Attrition Rate')
    ax2.set_ylabel('Attrition Rate (%)', color=C_HIGHLIGHT)
    ax2.tick_params(axis='y', labelcolor=C_HIGHLIGHT)

    plt.title('Demographic Risk Analysis')
    save_plot('dual_axis.png')

if __name__ == "__main__":
    # Render the charts in the original order; each call saves one image.
    for render_chart in (
        plot_feature_importance,
        plot_pca_biplot,
        plot_survival_curve,
        plot_clustermap,
        plot_ridgeline,
        plot_bubble_chart,
        plot_feature_interaction,
        plot_boxen,
        plot_radar,
        plot_dual_axis,
    ):
        render_chart()

Saved: feature_importance.png
Saved: pca_biplot.png
Saved: survival_curve.png
Saved: clustermap.png
Saved: ridgeline.png
Saved: bubble_chart.png
Saved: interaction_plot.png
Saved: boxen_plot.png
Saved: radar_chart.png
Saved: dual_axis.png


In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from math import pi

# ==========================================
# 1. Setup Professional Theme
# ==========================================
def apply_theme():
    """
    Reset matplotlib to its default style and apply the high-contrast slide theme.

    Side effect: (re)binds the module-level colour names C_BG, C_HIGHLIGHT,
    C_BASE and C_SEC used by the plotting code.
    """
    global C_BG, C_HIGHLIGHT, C_BASE, C_SEC

    plt.style.use('default')

    # White background, red for attrition/risk, black for structure and text,
    # dark grey for secondary series.
    C_BG, C_HIGHLIGHT, C_BASE, C_SEC = "#ffffff", "#D90429", "#000000", "#4A4E69"

    plt.rcParams.update({
        'figure.facecolor': C_BG,
        'axes.facecolor': C_BG,
        'axes.edgecolor': C_BASE,
        'text.color': C_BASE,
        'axes.labelcolor': C_BASE,
        'xtick.color': C_BASE,
        'ytick.color': C_BASE,
        'font.family': 'sans-serif',
    })
    sns.set_context("paper", font_scale=1.4)

# The theme call must come first: it defines the C_* colour globals used below.
apply_theme()
attrition_palette = {'Yes': C_HIGHLIGHT, 'No': C_BASE}

# Load & Prep Data
try:
    df = pd.read_csv('content/dataset.csv')
except FileNotFoundError:
    # Fall back to the alternative file name only when the primary file is
    # missing. A bare `except:` would also hide parse errors, permission
    # problems and KeyboardInterrupt behind a silent switch of data source.
    df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

# LabelEncoder numbers classes in sorted order, so 'No' -> 0 and 'Yes' -> 1.
le = LabelEncoder()
df['Attrition_Num'] = le.fit_transform(df['Attrition'])

# ==========================================
# 2. Plotting Functions
# ==========================================

def save_slide_plot(filename, out_dir="images"):
    """
    Save the current matplotlib figure as a 300-dpi image and close it.

    Parameters
    ----------
    filename : str
        File name (e.g. ``'slide_9_radar.png'``) written inside ``out_dir``.
    out_dir : str, optional
        Target directory, ``"images"`` by default. It is created when
        missing, because ``plt.savefig`` does not create directories and
        would otherwise raise ``FileNotFoundError`` on a fresh checkout.
    """
    import os  # function-scope import keeps the helper self-contained

    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, filename), dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Generated: {filename}")

# --- Slide 4: Feature Importance ---
def plot_slide_4():
    """
    Slide 4: bar chart of the ten largest random-forest feature importances.

    Reads the module-level ``df``, fits a ``RandomForestClassifier`` on its
    numeric columns against ``Attrition_Num`` and saves the chart as
    ``slide_4_feature_importance.png`` through ``save_slide_plot``.
    """
    # Besides the target and the two columns already excluded here, also drop
    # the per-row identifiers: 'Unnamed: 0' (the index column stored in the CSV)
    # and 'EmployeeNumber'. Impurity-based importances favour such
    # high-cardinality columns, so they would show up as attrition "drivers".
    # errors='ignore' keeps this working if any of them is absent.
    non_features = ['Attrition_Num', 'EmployeeCount', 'StandardHours',
                    'Unnamed: 0', 'EmployeeNumber']
    df_num = df.select_dtypes(include=[np.number]).drop(non_features, axis=1, errors='ignore')
    X = df_num.fillna(0)
    y = df['Attrition_Num']
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    importances = (
        pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
        .sort_values('Importance', ascending=False)
        .head(10)
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    # Newer seaborn deprecates `palette` without `hue`; mapping hue to the same
    # column gives the same per-bar colours. dodge=False keeps full-width bars.
    sns.barplot(data=importances, x='Importance', y='Feature',
                hue='Feature', palette='Reds_r', dodge=False, ax=ax)
    # The hue legend would only repeat the y tick labels, so drop it if drawn.
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    ax.set_title('Top 10 Drivers of Attrition (Random Forest)', fontweight='bold')
    ax.set_xlabel('Mean Decrease in Impurity')
    save_slide_plot('slide_4_feature_importance.png')

# --- Slide 5: Age Risk (Dual Axis) ---
def plot_slide_5():
    """
    Slide 5: headcount (bars) and attrition rate in percent (line) per age group.

    Side effect: adds or overwrites the 'AgeGroup' column on the module-level
    ``df``. Saved as ``slide_5_age_risk.png`` through ``save_slide_plot``.
    """
    # pd.cut builds right-closed intervals and leaves the lowest edge open by
    # default, so (18, 30] would silently drop employees aged exactly 18.
    # include_lowest=True closes that edge. The last bin is left open-ended so
    # it matches its '50+' label instead of stopping at 60.
    df['AgeGroup'] = pd.cut(df['Age'], bins=[18, 30, 40, 50, float('inf')],
                            labels=['18-30', '31-40', '41-50', '50+'],
                            include_lowest=True)
    agg = df.groupby('AgeGroup', observed=True).agg({'EmployeeNumber': 'count', 'Attrition_Num': 'mean'})

    fig, ax1 = plt.subplots(figsize=(12, 6))
    # Left axis: number of employees in each age group.
    sns.barplot(x=agg.index, y=agg['EmployeeNumber'], ax=ax1, color='lightgrey', alpha=0.5, label='Headcount')
    ax1.set_ylabel('Headcount')

    # Right axis: mean of Attrition_Num scaled to a percentage.
    ax2 = ax1.twinx()
    sns.lineplot(x=agg.index, y=agg['Attrition_Num']*100, ax=ax2, color=C_HIGHLIGHT, marker='o', lw=4, label='Attrition Rate')
    ax2.set_ylabel('Attrition Rate (%)', color=C_HIGHLIGHT, fontweight='bold')
    ax2.tick_params(axis='y', labelcolor=C_HIGHLIGHT)

    plt.title('The "Danger Zone": Attrition Rate vs. Headcount', fontweight='bold')
    save_slide_plot('slide_5_age_risk.png')

# --- Slide 6: Income Distribution ---
def plot_slide_6():
    """
    Slide 6: stacked income density plots for the six roles with the highest median income.

    One row per role, split by Attrition; the first row carries the legend.
    Saved as ``slide_6_income_dist.png`` through ``save_slide_plot``.
    """
    roles = df.groupby('JobRole')['MonthlyIncome'].median().sort_values(ascending=False).index[:6]
    fig, axes = plt.subplots(len(roles), 1, figsize=(10, 8), sharex=True)

    for i, role in enumerate(roles):
        ax = axes[i]
        sns.kdeplot(data=df[df['JobRole']==role], x='MonthlyIncome', hue='Attrition',
                    fill=True, palette=attrition_palette, alpha=0.6, ax=ax, legend=(i==0))
        # The role name stands in for the y axis; hide ticks and the frame.
        ax.set_ylabel(role, rotation=0, ha='right', fontsize=10)
        ax.set_yticks([])
        for side in ('left', 'top', 'right'):
            ax.spines[side].set_visible(False)
        # Seaborn builds the hue legend from its own handles. Calling
        # ax.legend(...) without handles would replace it with an empty
        # legend, so reposition the existing one instead.
        if i == 0 and ax.get_legend() is not None:
            sns.move_legend(ax, loc='upper right', title='Attrition')

    fig.suptitle('Income Inequality: Sales vs. Management', y=0.95, fontweight='bold')
    save_slide_plot('slide_6_income_dist.png')

# --- Slide 7: Overtime Interaction ---
def plot_slide_7():
    """
    Slide 7: point plot of ``Attrition_Num`` per JobSatisfaction level, one line per OverTime value.

    The y axis is fixed to the 0-0.5 range. Saved as
    ``slide_7_interaction.png`` through ``save_slide_plot``.
    """
    overtime_palette = {'Yes': C_HIGHLIGHT, 'No': C_BASE}

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.pointplot(data=df, x='JobSatisfaction', y='Attrition_Num', hue='OverTime',
                  palette=overtime_palette, markers=['o', 's'], scale=1.5, ax=ax)
    ax.set_title('The "Overtime Trap": Satisfaction vs. Burnout', fontweight='bold')
    ax.set_ylabel('Attrition Probability')
    ax.set_ylim(0, 0.5)
    save_slide_plot('slide_7_interaction.png')

# --- Slide 8: Departmental Survival (UPDATED) ---
def plot_slide_8_v2():
    """
    Slide 8: product-limit retention curves for the company and three departments.

    Uses the module-level ``df`` ('YearsAtCompany', 'Attrition', 'Department'),
    marks year 5 with a dashed vertical line and saves the figure as
    ``slide_8_survival_v2.png`` through ``save_slide_plot``.
    """
    def get_km(data):
        """Return (sorted distinct tenures, running retention probability per tenure)."""
        years = data['YearsAtCompany']
        departed = data['Attrition'] == 'Yes'
        times = np.sort(years.unique())

        probs = []
        survival = 1.0
        for t in times:
            # Everyone with tenure >= t is still at risk at time t.
            at_risk = int((years >= t).sum())
            if at_risk > 0:
                left = int(((years == t) & departed).sum())
                survival *= (1 - left / at_risk)
            probs.append(survival)
        return times, probs

    plt.figure(figsize=(12, 6))

    # Company-wide curve, drawn thick and faint as a backdrop.
    company_t, company_s = get_km(df)
    plt.step(company_t, company_s, where='post', label='Company Avg', linewidth=4, color='black', alpha=0.2)

    # Per-department look; 'style' is optional and falls back to a solid line.
    dept_styles = {
        'Sales': {'color': C_HIGHLIGHT, 'lw': 3, 'label': 'Sales (High Risk)'},
        'Research & Development': {'color': C_SEC, 'lw': 3, 'label': 'R&D (Stable)'},
        'Human Resources': {'color': '#999999', 'lw': 2, 'label': 'HR', 'style': ':'}
    }
    for dept, style in dept_styles.items():
        subset = df[df['Department'] == dept]
        if len(subset) == 0:
            # Department not present in the data: nothing to draw.
            continue
        dept_t, dept_s = get_km(subset)
        plt.step(dept_t, dept_s, where='post', label=style['label'],
                 linewidth=style['lw'], color=style['color'],
                 linestyle=style.get('style', '-'))

    # Dashed marker and caption at year 5.
    plt.axvline(x=5, color=C_HIGHLIGHT, linestyle='--', alpha=0.5)
    plt.text(5.2, 0.65, '5-Year Cliff', color=C_HIGHLIGHT, fontweight='bold')

    plt.title('Retention Trajectories: Sales vs. R&D', fontweight='bold')
    plt.xlabel('Years at Company')
    plt.ylabel('Probability of Retention')
    plt.legend(loc='lower left')
    save_slide_plot('slide_8_survival_v2.png')

# --- Slide 9: Radar Chart ---
def plot_slide_9():
    """
    Slide 9: radar chart comparing mean survey scores of retained and departed employees.

    Averages four columns of the module-level ``df`` per Attrition value and
    saves the polar plot as ``slide_9_radar.png`` through ``save_slide_plot``.
    """
    features = ['JobSatisfaction', 'EnvironmentSatisfaction', 'WorkLifeBalance', 'JobInvolvement']
    data = df.groupby('Attrition')[features].mean().reset_index()

    # One evenly spaced spoke per feature; the first angle is repeated so the
    # outline closes on itself.
    spoke_count = len(features)
    angles = [k / float(spoke_count) * 2 * pi for k in range(spoke_count)]
    angles.append(angles[0])

    def closed_profile(label):
        """Mean scores for one Attrition value, with the first value repeated at the end."""
        scores = data.loc[data['Attrition'] == label, features].to_numpy().ravel().tolist()
        return scores + scores[:1]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    plt.xticks(angles[:-1], features, color=C_BASE, size=10)

    # (Attrition value, legend label, colour, fill opacity); retained is drawn first.
    for attrition_value, legend_label, colour, fill_alpha in (
        ('No', 'Retained', C_BASE, 0.05),
        ('Yes', 'Departed', C_HIGHLIGHT, 0.1),
    ):
        profile = closed_profile(attrition_value)
        ax.plot(angles, profile, linewidth=2, linestyle='solid', color=colour, label=legend_label)
        ax.fill(angles, profile, colour, alpha=fill_alpha)

    plt.title('Psychometric Profile: The "Shrinkage" Effect', y=1.08, fontweight='bold')
    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    save_slide_plot('slide_9_radar.png')

# --- Slide 2: Class Imbalance (NEW DONUT CHART) ---
def plot_slide_2():
    """
    Slide 2: donut chart of retained vs. departed employees.

    Counts the 'Attrition' values of the module-level ``df`` and saves the
    chart as ``slide_2_imbalance.png`` through ``save_slide_plot``.

    Raises
    ------
    KeyError
        If ``df['Attrition']`` contains no 'No' or no 'Yes' rows.
    """
    counts = df['Attrition'].value_counts()
    # Fixed order: [Retention (No), Attrition (Yes)].
    sizes = [counts['No'], counts['Yes']]
    labels = [f'Retention\n({sizes[0]})', f'Attrition\n({sizes[1]})']
    colors = [C_BASE, C_HIGHLIGHT]  # black vs. red

    # Share of leavers, used for the centre caption so it always agrees with
    # the wedge percentages instead of relying on a hard-coded figure.
    attrition_share = sizes[1] / (sizes[0] + sizes[1])

    fig, ax = plt.subplots(figsize=(8, 8))

    # width=0.4 hollows out the pie into a donut.
    wedges, texts, autotexts = ax.pie(sizes, labels=labels, colors=colors,
                                      autopct='%1.1f%%', startangle=45, pctdistance=0.85,
                                      wedgeprops=dict(width=0.4, edgecolor='white'),
                                      textprops={'fontsize': 14, 'weight': 'bold'})

    # Outer labels take the colour of their wedge.
    texts[0].set_color(C_BASE)
    texts[1].set_color(C_HIGHLIGHT)

    # Percentages sit on the wedges, so draw them in white.
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_weight('bold')
        autotext.set_fontsize(16)

    # Centre caption: attrition share rounded to a whole percent.
    ax.text(0, 0, f'{attrition_share:.0%}\nRisk', ha='center', va='center',
            fontsize=24, fontweight='bold', color=C_HIGHLIGHT)

    plt.title('The Dataset: Class Imbalance', fontweight='bold', fontsize=16)
    save_slide_plot('slide_2_imbalance.png')

# Execution
if __name__ == "__main__":
    # Render the slides in the original order; each call saves one image.
    for render_slide in (
        plot_slide_4,
        plot_slide_5,
        plot_slide_6,
        plot_slide_7,
        plot_slide_8_v2,
        plot_slide_9,
        plot_slide_2,
    ):
        render_slide()

Generated: slide_4_feature_importance.png
Generated: slide_5_age_risk.png
Generated: slide_6_income_dist.png
Generated: slide_7_interaction.png
Generated: slide_8_survival_v2.png
Generated: slide_9_radar.png
Generated: slide_2_imbalance.png
