In [None]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
from IPython.display import display
# Default figure size for plots in this notebook; cells that pass their own
# figsize to plt.subplots() override it.
plt.rcParams['figure.figsize'] = (10,5)
def find_column(df, candidates):
    """Return the label of the first column in ``df`` that matches a candidate.

    Matching is case-insensitive and runs in two passes:

    1. exact match - each candidate, in order, is compared with the full
       column name;
    2. substring match - each candidate, in order, is searched for inside
       every column name (columns are scanned in frame order).

    Column labels that are not strings (for example integer labels) are
    compared through ``str(label)`` so they cannot raise; the label is
    returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    candidates : iterable
        Candidate names, highest priority first.

    Returns
    -------
    The matching column label exactly as it appears in ``df.columns``, or
    ``None`` when no candidate matches.
    """
    # Pair every label with its lower-cased text form once, keeping frame order.
    labelled = [(str(col).lower(), col) for col in df.columns]
    # When two labels differ only by case, the later column wins the exact match.
    exact = {lowered: col for lowered, col in labelled}
    wanted = [str(cand).lower() for cand in candidates]

    # Pass 1: exact (case-insensitive) name match.
    for cand in wanted:
        if cand in exact:
            return exact[cand]

    # Pass 2: candidate appears somewhere inside the column name.
    for cand in wanted:
        for lowered, col in labelled:
            if cand in lowered:
                return col
    return None


In [None]:

# Load the first CSV file found in data_dir into `df`.
data_dir = '/mnt/data'
filepath = None
# os.listdir() returns names in arbitrary order; sorting makes "the first CSV"
# the same file on every run.
files = sorted(f for f in os.listdir(data_dir) if f.lower().endswith('.csv'))
if not files:
    raise FileNotFoundError(f'No CSV files found in {data_dir}.')
filepath = os.path.join(data_dir, files[0])
if len(files) > 1:
    # Make the choice visible when it is ambiguous.
    print(f'Multiple CSV files found in {data_dir}; using {files[0]}')
df = pd.read_csv(filepath)
print(df.shape)
display(df.head(10))


In [None]:

# Missing values per column, largest first; only affected columns are shown.
missing_counts = df.isnull().sum().sort_values(ascending=False)
columns_with_gaps = missing_counts[missing_counts > 0]
display(columns_with_gaps)

# Split the columns into numeric ones and everything else.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [name for name in df.columns if name not in num_cols]

if len(columns_with_gaps) > 0:
    # Numeric columns with gaps are filled with the column median.
    for name in num_cols:
        series = df[name]
        if series.isnull().any():
            df[name] = series.fillna(series.median())
    # Remaining columns with gaps are filled with their most frequent value,
    # or the literal 'Unknown' when mode() returns nothing.
    for name in cat_cols:
        series = df[name]
        if not series.isnull().any():
            continue
        modes = series.mode()
        if modes.empty:
            fill = 'Unknown'
        else:
            fill = modes.iloc[0]
        df[name] = series.fillna(fill)

# Total number of missing cells left in the frame.
display(df.isnull().sum().sum())


In [None]:

# Summary statistics for the numeric columns, one row per column, with the
# median joined on as an explicitly named column.
numeric_part = df.select_dtypes(include=[np.number])
num_desc = numeric_part.describe().T
num_median = numeric_part.median().rename('median')
num_stats = num_desc.join(num_median)
summary_fields = ['count','mean','median','std','min','max']
display(num_stats[summary_fields])


In [None]:

# Find the department and salary columns by name, then compare the average
# salary across departments.
dept_candidates = ['department','dept','department_name','dept_name','team']
salary_candidates = ['salary','monthlyincome','monthly_income','MonthlyIncome','annualsalary','annual_salary','pay']
dept_col = find_column(df, dept_candidates)
salary_col = find_column(df, salary_candidates)
print('dept_col:', dept_col)
print('salary_col:', salary_col)
# Stay None unless a ranking can actually be computed below.
highest = lowest = None
if dept_col is None or salary_col is None:
    print('Department or salary column not found. Columns:', list(df.columns))
else:
    salary_numeric = pd.to_numeric(df[salary_col], errors='coerce')
    # Values that were present but could not be parsed as numbers.
    lost = int((salary_numeric.isna() & df[salary_col].notna()).sum())
    if lost == 0:
        # Lossless conversion (e.g. numeric strings): store the numeric dtype
        # so later cells treat the column as numeric.
        df[salary_col] = salary_numeric
    else:
        # errors='coerce' would replace these values with NaN; keep the
        # original column intact (it may be categorical, e.g. low/medium/high)
        # and work on the coerced copy only.
        print(f'{lost} value(s) in {salary_col!r} are not numeric; leaving the column unchanged.')
    dept_salary = salary_numeric.groupby(df[dept_col]).mean().sort_values(ascending=False)
    display(dept_salary.reset_index().rename(columns={salary_col:'avg_salary'}))
    if dept_salary.notna().any():
        highest = dept_salary.idxmax()
        lowest = dept_salary.idxmin()
        print('Highest average salary department:', highest, float(dept_salary.max()))
        print('Lowest average salary department:', lowest, float(dept_salary.min()))
    else:
        # Every department mean is NaN, so there is nothing to rank.
        print('No numeric salary values available; cannot rank departments.')


In [None]:

# Pairwise correlation of the numeric columns, shown as a table and as an
# annotated heat map.
num_df = df.select_dtypes(include=[np.number])
if num_df.shape[1] < 2:
    print('Not enough numerical columns to compute correlations.')
else:
    corr = num_df.corr()
    display(corr)

    fig, ax = plt.subplots(figsize=(10,8))
    matrix = corr.values
    cax = ax.imshow(matrix, interpolation='nearest')

    # One tick per column / row, labelled with the variable name.
    ax.set_xticks(np.arange(len(corr.columns)))
    ax.set_yticks(np.arange(len(corr.index)))
    ax.set_xticklabels(corr.columns, rotation=45, ha='right')
    ax.set_yticklabels(corr.index)

    # Write each coefficient inside its cell (x is the column, y is the row).
    n_rows, n_cols = matrix.shape
    for row in range(n_rows):
        for column in range(n_cols):
            ax.text(column, row, f'{matrix[row, column]:.2f}', ha='center', va='center', fontsize=8)

    fig.colorbar(cax)
    plt.tight_layout()
    plt.show()


In [None]:

# Locate the column that records whether an employee left, first by name and
# then, failing that, by looking for a low-cardinality yes/no style column.
left_candidates = ['left','attrition','is_left','target','left_company','status']
left_col = find_column(df, left_candidates)
if left_col is None:
    binary_tokens = {'0','1','yes','no','true','false','left','stayed','y','n'}
    for col in df.columns:
        uniques = df[col].dropna().unique()
        if len(uniques) > 3:
            continue
        seen = {str(value).lower() for value in uniques}
        if seen & binary_tokens:
            left_col = col
            break
print('left_col:', left_col)

if left_col is None:
    print('Attrition/left column not found.')
else:
    vals = df[left_col].copy()
    if vals.dtype == 'O' or vals.dtype.name=='category':
        # Translate textual labels to 1/0; labels outside the mapping keep
        # their original value.
        label_to_flag = {'yes':1,'no':0,'y':1,'n':0,'true':1,'false':0,'left':1,'stayed':0}
        lowered = vals.astype(str).str.lower()
        vals = lowered.map(label_to_flag).fillna(vals)
    # Parse what can be parsed as a number; anything else is kept as it was.
    vals = pd.to_numeric(vals, errors='coerce').fillna(vals)
    left_flag = (vals==1)

    counts = left_flag.value_counts().rename(index={True:'Left', False:'Stayed'})
    display(counts)

    fig, ax = plt.subplots(figsize=(6,4))
    counts.plot(kind='bar', ax=ax)
    ax.set_ylabel('Number of employees')
    ax.set_title('Employees: Left vs Stayed')
    plt.tight_layout()
    plt.show()


In [None]:

# Attrition rate per department (share of employees flagged as having left),
# shown as a table and a bar chart of the ten highest rates.
if dept_col is None or left_col is None:
    print('Cannot compute department attrition rates because department or left column is missing.')
else:
    vals = df[left_col].copy()
    if vals.dtype == 'O' or vals.dtype.name=='category':
        # Translate textual labels to 1/0; labels outside the mapping keep
        # their original value.
        vals = vals.astype(str).str.lower().map({'yes':1,'no':0,'y':1,'n':0,'true':1,'false':0,'left':1,'stayed':0}).fillna(vals)
    vals = pd.to_numeric(vals, errors='coerce').fillna(vals)
    left_flag = (vals==1).astype(int)
    # Group the flag directly by the department column. No temporary column is
    # written to df, so a failure further down cannot leave df modified and
    # the cell can be re-run safely.
    dept_attr = (
        left_flag.groupby(df[dept_col])
        .agg(['sum','count','mean'])
        .rename(columns={'sum':'num_left','mean':'attrition_rate'})
        .sort_values('attrition_rate', ascending=False)
    )
    display(dept_attr.reset_index().head(10))
    top = dept_attr.head(10)
    fig, ax = plt.subplots(figsize=(10,5))
    ax.bar(top.index.astype(str), top['attrition_rate'])
    # Style the existing tick labels instead of calling set_xticklabels(),
    # which is only meant to be used after tick positions have been fixed.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    # Rates are fractions in [0, 1]; show them as percentages.
    ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
    ax.set_ylabel('Attrition rate (%)')
    ax.set_title('Top departments by attrition rate')
    plt.tight_layout()
    plt.show()


In [None]:
# Progress marker: printed once every cell above has run.
print('EDA Section 1 completed')