In [None]:
# %%
# ===============================================================
# Exploratory data analysis (EDA) for the employee performance
# and productivity dataset: setup, loading, profiling, cleaning.
# ===============================================================

# Render Matplotlib figures inline in the notebook output.
# NOTE: get_ipython is not imported anywhere in this cell; the line relies on
# the IPython/Jupyter kernel providing it and fails in a plain Python run.
get_ipython().run_line_magic('matplotlib', 'inline')


# Select the default Plotly renderer used for figures shown in the notebook.
import plotly.io as pio

pio.renderers.default = 'notebook_connected'

# Core analysis / plotting libraries.
# NOTE(review): px, KMeans and StandardScaler are not used in the cells visible
# here -- presumably they are needed by later cells; confirm before removing.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os

# AutoViz drives the automated profiling step further down.
from autoviz.AutoViz_Class import AutoViz_Class

# Global plotting defaults: seaborn "whitegrid" style and a (12, 6) default
# Matplotlib figure size (width, height).
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# ---------------------------------------------------------------
# 1. Data Loading
# ---------------------------------------------------------------
# Location of the source CSV; the same path is handed to AutoViz later on.
data_path = r"D:\Python\Machine Learning\Datasets\Extended_Employee_Performance_and_Productivity_Data.csv"

# Bail out early when the dataset is absent so nothing downstream runs
# against an undefined frame.
if not os.path.exists(data_path):
    print(f"File not found: {data_path}")
    raise SystemExit("Stopping execution due to missing dataset.")

df = pd.read_csv(data_path)
print("Data loaded successfully from:", data_path)

# Quick confirmation of what was read: dimensions and column names.
print("Dataset shape:", df.shape)
print("Dataset columns:", df.columns.tolist())

# ---------------------------------------------------------------
# 2. Basic Inspection & Enhanced Data Quality Analysis
# ---------------------------------------------------------------
# Each report is emitted as "heading, newline, body" through one print call.
print("\n--- DataFrame Head ---", df.head(), sep="\n")

# DataFrame.info() writes its own report, so only the heading is printed here.
print("\n--- DataFrame Info ---")
df.info()

print("\n--- Summary Statistics ---", df.describe(), sep="\n")

# Per-column count of missing entries.
print("\n--- Missing Values Count ---", df.isna().sum(), sep="\n")

# ---------------------------------------------------------------
# 3. AutoViz for Advanced Data Profiling
# ---------------------------------------------------------------
# Runs AutoViz's automated profiling. The call is given data_path rather than
# the in-memory df, so AutoViz works from the CSV on disk.
print("\n--- AutoViz Report Generation ---")
AV = AutoViz_Class()
# NOTE(review): depVar="" presumably means "no target column" and
# save_plot_dir=None presumably leaves charts unsaved -- confirm against the
# AutoViz documentation.
autoviz_report = AV.AutoViz(data_path, depVar="", verbose=1, chart_format="svg", save_plot_dir=None)
# Printed unconditionally once the call above returns without raising.
print("AutoViz report generated successfully.")

# ---------------------------------------------------------------
# 4. Synthetic Data Generation Without SDV
# ---------------------------------------------------------------
print("\n--- Synthetic Data Generation Without SDV ---")


def generate_synthetic_data(df, sample_size=100, random_state=None):
    """
    Generate synthetic data based on statistical properties of the original dataset.

    Every column is synthesised independently of the others:

    * integer / float columns (any width, including pandas nullable dtypes)
      are drawn from a normal distribution with the column's mean and
      standard deviation, so values are floats and may fall outside the
      observed range;
    * object, string, categorical and boolean columns are drawn uniformly
      from the column's distinct non-missing values (observed frequencies
      are not reproduced);
    * any other dtype (e.g. datetimes), and any column with no usable
      statistics or values, is filled with NaN.

    Args:
    - df (DataFrame): The original DataFrame.
    - sample_size (int): Number of synthetic rows to generate.
    - random_state (int | None): Seed for reproducible output. When None,
      NumPy's global random state is used, so a prior ``np.random.seed``
      call still applies.

    Returns:
    Synthetic DataFrame with the same columns as ``df`` and exactly
    ``sample_size`` rows.
    """
    # Default to the module-level generator to stay compatible with callers
    # that seed NumPy globally; a private generator is used only on request.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    types = pd.api.types

    def _all_missing():
        return np.full(sample_size, np.nan)

    generated = {}
    for column in df.columns:
        series = df[column]
        dtype = series.dtype

        if types.is_integer_dtype(dtype) or types.is_float_dtype(dtype):
            mean, std = series.mean(), series.std()
            # mean/std are missing for empty or all-NaN columns, and std is
            # missing for a single observation; there is nothing to sample from.
            if pd.isna(mean) or pd.isna(std):
                generated[column] = _all_missing()
            else:
                generated[column] = rng.normal(loc=float(mean), scale=float(std), size=sample_size)
        elif (
            types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
            or types.is_bool_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            choices = np.asarray(series.dropna().unique())
            # Sampling from an empty pool raises, so an all-missing column
            # stays missing instead of aborting the whole generation.
            if choices.size == 0:
                generated[column] = _all_missing()
            else:
                generated[column] = rng.choice(choices, size=sample_size)
        else:
            generated[column] = _all_missing()

    # An explicit index guarantees sample_size rows even when no column
    # produced sampled values.
    return pd.DataFrame(generated, index=pd.RangeIndex(sample_size))


# Build a 100-row synthetic frame modelled on the loaded dataset, then show
# its first rows under a heading.
synthetic_df = generate_synthetic_data(df, sample_size=100)

print("Synthetic Data Sample:", synthetic_df.head(), sep="\n")

# ---------------------------------------------------------------
# 5. Data Cleaning
# ---------------------------------------------------------------
print("\n--- Data Cleaning ---")
# Rebind to the de-duplicated frame rather than mutating in place, so
# re-running the cell never depends on a half-modified object.
df = df.drop_duplicates()
print("After removing duplicates, shape:", df.shape)

# Median imputation for every numeric column that has missing values.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        median_val = df[col].median()
        # Assign the filled column back. Calling fillna(..., inplace=True) on
        # df[col] operates on a temporary selection: it warns on pandas 2.x
        # and leaves df unchanged under Copy-on-Write.
        df[col] = df[col].fillna(median_val)
        print(f"Filled missing values in {col} with median: {median_val}")


# Remove outliers using IQR method
def remove_outliers_iqr(dataframe, column, multiplier=1.5):
    """
    Drop the rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``
    (both inclusive), where Q1/Q3 are the 25th/75th percentiles of the column
    and ``IQR = Q3 - Q1``. The number of dropped rows is printed.

    Args:
    - dataframe (DataFrame): Frame to filter; it is not modified.
    - column (str): Name of the numeric column the fences are computed on.
    - multiplier (float): Width of the fences in IQR units (default 1.5).

    Returns:
    A new DataFrame containing only the rows inside the fences. Rows whose
    value in ``column`` is missing are dropped as well, because comparisons
    against NaN evaluate to False, and they are included in the printed count.

    Raises:
    KeyError if ``column`` is not present in ``dataframe``.
    """
    values = dataframe[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence = multiplier * (q3 - q1)
    lower_bound, upper_bound = q1 - fence, q3 + fence

    inside = (values >= lower_bound) & (values <= upper_bound)
    filtered = dataframe[inside]

    removed = dataframe.shape[0] - filtered.shape[0]
    print(f"Removed outliers in {column}: {removed} rows removed.")
    return filtered


# Apply the IQR filter column by column; a column missing from the data is
# reported and skipped rather than treated as an error.
for col in ('Work_Hours_Per_Week', 'Monthly_Salary'):
    if col not in df.columns:
        print(f"Column '{col}' not found; skipping")
        continue
    df = remove_outliers_iqr(df, col)

In [3]:
# Report whether an xgboost distribution is installed in the kernel's
# environment. importlib.metadata is the standard-library replacement for the
# deprecated pkg_resources API.
from importlib import metadata

try:
    metadata.version('xgboost')
except metadata.PackageNotFoundError:
    # Only "distribution not found" counts as not installed; any other
    # failure is allowed to surface instead of being swallowed.
    print("xgboost is NOT installed")
else:
    print("xgboost is installed")

xgboost is installed
