# **Data Preprocessing**

## **Libraries Importation**

In [3]:

import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Library to split data
from sklearn.model_selection import train_test_split

# libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 100)


# Libraries different ensemble classifiers
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    StackingClassifier,
)

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# Libraries to get different metric scores
from sklearn import metrics
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# To tune different models
from sklearn.model_selection import GridSearchCV

## **Data Ingestion**

In [24]:
import os

def ingest_data(file_path: str):

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found at path: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()

    try:
        if ext == '.csv':
            df = pd.read_csv(file_path)
        elif ext in ['.xls', '.xlsx']:
            df = pd.read_excel(file_path)
        elif ext == '.json':
            df = pd.read_json(file_path)
        elif ext == '.parquet':
            df = pd.read_parquet(file_path)
        elif ext == '.pkl':
            df = pd.read_pickle(file_path)
        else:
            raise ValueError(f"Unsupported file format: {ext}")

        print(f"Successfully loaded data from '{file_path}'")
        print(f" Shape: {df.shape[0]} rows × {df.shape[1]} columns\n")
        return df

    except Exception as e:
        print(f"Error reading file: {e}")
        return None


df = ingest_data("C:\\Users\\Oshea\\Documents\\module_3\\examsim\\data_output\\cleaned_data.csv")

df.head(5)

Successfully loaded data from 'C:\Users\Oshea\Documents\module_3\examsim\data_output\cleaned_data.csv'
 Shape: 25480 rows × 11 columns



Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,asia,high school,n,n,14513,2007,west,592.2029,hour,y,0
1,asia,master's,y,n,2412,2002,northeast,83425.65,year,y,1
2,asia,bachelor's,n,y,44444,2008,west,122996.86,year,y,0
3,asia,bachelor's,n,n,98,1897,west,83434.03,year,y,0
4,africa,master's,y,n,1082,2005,south,149907.39,year,y,1


In [15]:
df.describe()

Unnamed: 0,no_of_employees,yr_of_estab,prevailing_wage,case_status
count,25480.0,25480.0,25480.0,25480.0
mean,5667.089207,1979.409929,74455.814592,0.667896
std,22877.917453,42.366929,52815.942327,0.470977
min,11.0,1800.0,2.1367,0.0
25%,1022.0,1976.0,34015.48,0.0
50%,2109.0,1997.0,70308.21,1.0
75%,3504.0,2005.0,107735.5125,1.0
max,602069.0,2016.0,319210.27,1.0


In [7]:
def fix_negative_employees(df, col='no_of_employees'):
 
    negative_count = (df[col] < 0).sum()
    print(f" Found {negative_count} negative values in '{col}'. Converting to absolute values...")
    
    df[col] = df[col].abs()
    
    print(" Negative values handled successfully.\n")
    return df


In [8]:
df = fix_negative_employees(df, col='no_of_employees')


 Found 33 negative values in 'no_of_employees'. Converting to absolute values...
 Negative values handled successfully.



In [10]:
(df['no_of_employees'] < 0).sum()


np.int64(0)

## **Handling Outliers**

In [None]:

def handle_outliers(df, target_col='case_status'):
   
    df_new = df.copy()
    
    # Select numeric columns excluding the target column
    numeric_cols = df_new.select_dtypes(include=np.number).columns.tolist()
    if target_col in numeric_cols:
        numeric_cols.remove(target_col)

    print(f"\nHandling outliers for {len(numeric_cols)} numeric columns (excluding '{target_col}'):\n")

    for col in numeric_cols:
        Q1 = df_new[col].quantile(0.25)
        Q3 = df_new[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Count outliers before capping
        outliers_below = (df_new[col] < lower_bound).sum()
        outliers_above = (df_new[col] > upper_bound).sum()
        total_outliers = outliers_below + outliers_above

        print(f"Column: '{col}'")
        print(f"   Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
        print(f"   Lower Bound: {lower_bound:.2f}, Upper Bound: {upper_bound:.2f}")
        print(f"   Outliers below: {outliers_below}, above: {outliers_above} (total: {total_outliers})")

        # Cap values
        df_new[col] = np.where(df_new[col] < lower_bound, lower_bound,
                                 np.where(df_new[col] > upper_bound, upper_bound, df_new[col]))
        
        print(f"  Outliers capped successfully.\n")

    print(" Outlier handling complete!\n")
    
    return df_new


In [None]:
df_new = handle_outliers(df, target_col='case_status')



Handling outliers for 3 numeric columns (excluding 'case_status'):

Column: 'no_of_employees'
   Q1: 1022.00, Q3: 3504.00, IQR: 2482.00
   Lower Bound: -2701.00, Upper Bound: 7227.00
   Outliers below: 0, above: 1556 (total: 1556)
  Outliers capped successfully.

Column: 'yr_of_estab'
   Q1: 1976.00, Q3: 2005.00, IQR: 29.00
   Lower Bound: 1932.50, Upper Bound: 2048.50
   Outliers below: 3260, above: 0 (total: 3260)
  Outliers capped successfully.

Column: 'prevailing_wage'
   Q1: 34015.48, Q3: 107735.51, IQR: 73720.03
   Lower Bound: -76564.57, Upper Bound: 218315.56
   Outliers below: 0, above: 427 (total: 427)
  Outliers capped successfully.

 Outlier handling complete!



## **Feature Engineering**

In [None]:

def feature_engineering(df_new,
                                   wage_col='prevailing_wage',
                                   wage_unit_col='unit_of_wage',
                                   estab_col='yr_of_estab',
                                   current_year=2024):
    

    # Conversion mapping
    conversion_map = {
        'Hour': 2080, 'hour': 2080, 'HOUR': 2080,
        'Week': 52,   'week': 52,   'WEEK': 52,
        'Month': 12,  'month': 12,  'MONTH': 12,
        'Year': 1,    'year': 1,    'YEAR': 1
    }

    print("\n Wage Normalization ...")

    # Apply conversion factor
    df_new['annual_wage'] = df_new[wage_col] * df_new[wage_unit_col].map(conversion_map)

    # Check if any unmatched unit values exist
    unmatched_units = df_new[df_new['annual_wage'].isna()][wage_unit_col].unique()
    if len(unmatched_units) > 0:
        print(f"Warning: Found unmatched wage units: {unmatched_units}")
        print("   Please update conversion_map accordingly.\n")
    else:
        print("Wage normalization completed successfully.\n")

    print(" Creating Company Age Feature...")

    df_new['company_age'] = current_year - df_new[estab_col]

    print(" Company age feature created.\n")
    
    # Drop original columns
    df_new.drop([wage_col, wage_unit_col, estab_col], axis=1, inplace=True)
    print(f" Dropped original columns: {wage_col}, {wage_unit_col}, {estab_col}\n")

    print(" Feature engineering complete!\n")

    return df_new


In [None]:
df_process = feature_engineering(df_new)
df_process.head()



 Wage Normalization ...
Wage normalization completed successfully.

 Creating Company Age Feature...
 Company age feature created.

 Dropped original columns: prevailing_wage, unit_of_wage, yr_of_estab

 Feature engineering complete!



Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,region_of_employment,full_time_position,case_status,annual_wage,company_age
0,asia,high school,n,n,7227.0,west,y,0,1231782.032,17.0
1,asia,master's,y,n,2412.0,northeast,y,1,83425.65,22.0
2,asia,bachelor's,n,y,7227.0,west,y,0,122996.86,16.0
3,asia,bachelor's,n,n,98.0,west,y,0,83434.03,91.5
4,africa,master's,y,n,1082.0,south,y,1,149907.39,19.0


## **Feature Encoding**

In [None]:

def encode_categorical_features(df_new):
   

    print("\nStarting categorical feature encoding...\n")

    # ------------------------------
    # A. Ordinal Encoding (Education)
    # ------------------------------
    education_order = {
        'high school': 1,
        "bachelor's": 2,
        "master's": 3,
        'doctorate': 4
    }

    if 'education_of_employee' in df_new.columns:
        print(" Applying Ordinal Encoding on 'education_of_employee'...")
        df_new['education_of_employee'] = (
            df_new['education_of_employee']
            .str.lower()
            .map(education_order)
        )
        print("Ordinal Encoding complete.\n")

    # ------------------------------
    # B. Binary Encoding (Yes/No Columns)
    # ------------------------------
    binary_cols = ['has_job_experience', 'requires_job_training', 'full_time_position']
    existing_binary_cols = [col for col in binary_cols if col in df_new.columns]

    if existing_binary_cols:
        print(f"Applying Binary Encoding on: {existing_binary_cols}")
        for col in existing_binary_cols:
            df_new[col] = df_new[col].str.lower().map({'y': 1, 'n': 0})
        print("Binary Encoding complete.\n")

    # ------------------------------
    # C. One-Hot Encoding (Nominal)
    # ------------------------------
    nominal_cols = ['continent', 'region_of_employment']
    existing_nominal_cols = [col for col in nominal_cols if col in df_new.columns]

    if existing_nominal_cols:
        print(f"Applying One-Hot Encoding on: {existing_nominal_cols}")
        df_new = pd.get_dummies(df_new, columns=existing_nominal_cols, drop_first=True)
        print("One-Hot Encoding complete.\n")

    print("All categorical features encoded successfully!\n")

    return df_new


In [None]:
df_encoded = encode_categorical_features(df_new)
df_encoded


Starting categorical feature encoding...

 Applying Ordinal Encoding on 'education_of_employee'...
Ordinal Encoding complete.

Applying Binary Encoding on: ['has_job_experience', 'requires_job_training', 'full_time_position']
Binary Encoding complete.

Applying One-Hot Encoding on: ['continent', 'region_of_employment']
One-Hot Encoding complete.

All categorical features encoded successfully!



Unnamed: 0,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,prevailing_wage,unit_of_wage,full_time_position,case_status,continent_asia,continent_europe,continent_north america,continent_oceania,continent_south america,region_of_employment_midwest,region_of_employment_northeast,region_of_employment_south,region_of_employment_west
0,1,0,0,7227.0,2007.0,592.20290,hour,1,0,True,False,False,False,False,False,False,False,True
1,3,1,0,2412.0,2002.0,83425.65000,year,1,1,True,False,False,False,False,False,True,False,False
2,2,0,1,7227.0,2008.0,122996.86000,year,1,0,True,False,False,False,False,False,False,False,True
3,2,0,0,98.0,1932.5,83434.03000,year,1,0,True,False,False,False,False,False,False,False,True
4,3,1,0,1082.0,2005.0,149907.39000,year,1,1,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25475,2,1,1,2601.0,2008.0,77092.57000,year,1,1,True,False,False,False,False,False,False,True,False
25476,1,1,0,3274.0,2006.0,218315.56125,year,1,1,True,False,False,False,False,False,True,False,False
25477,3,1,0,1121.0,1932.5,146298.85000,year,0,1,True,False,False,False,False,False,False,True,False
25478,3,1,1,1918.0,1932.5,86154.77000,year,1,1,True,False,False,False,False,False,False,False,True


## **Feature Creation**

In [None]:

def feature_creation(df_new):
   
    df_new = df_new.copy()
    print("\n Starting Advanced Feature Engineering...\n")

    # ---------------------------------------
    # A1. Employee-to-Company Ratio (Scale & Demand)
    # ---------------------------------------
    if 'no_of_employees' in df_new.columns:
        print(" Creating 'request_ratio' feature...")
        df_new['request_ratio'] = 1 / (df_new['no_of_employees'] + 1e-6)
        print(" request_ratio created.\n")

    # ---------------------------------------
    # A2. Company Age Bins (Non-Linearity)
    # ---------------------------------------
    if 'company_age' in df_new.columns:
        print(" Creating 'company_age_bin' feature...")
        bins = [-1, 5, 20, np.inf]
        labels = ['Startup', 'Established', 'Legacy']
        df_new['company_age_bin'] = pd.cut(df_new['company_age'], bins=bins, labels=labels)
        print(" company_age_bin created.\n")

    # ---------------------------------------
    # A3. Wage Quartile (Relative Market Position)
    # ---------------------------------------
    if 'annual_wage' in df_new.columns:
        print(" Creating 'wage_quartile' feature...")
        try:
            df_new['wage_quartile'] = pd.qcut(df_new['annual_wage'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
        except ValueError:
            print(" Not enough unique wage values for quartile bins — skipping.")
        print(" wage_quartile created.\n")

    # ---------------------------------------
    # B1. Readiness Score (Sum of Experience & Training)
    # ---------------------------------------
    if {'has_job_experience', 'requires_job_training'}.issubset(df_new.columns):
        print(" Creating 'readiness_score' feature...")
        df_new['readiness_score'] = df_new['has_job_experience'] + df_new['requires_job_training']
        print(" readiness_score created.\n")

        # ---------------------------------------
        # B2. Training Investment Flag
        # ---------------------------------------
        print(" Creating 'high_investment_flag' feature...")
        df_new['high_investment_flag'] = np.where(
            (df_new['has_job_experience'] == 0) & (df_new['requires_job_training'] == 1),
            1,
            0
        )
        print(" high_investment_flag created.\n")

        # ---------------------------------------
        # B3. Expertise Level (Categorical)
        # ---------------------------------------
        print(" Creating 'expertise_level' categorical feature...")
        expertise_map = {
            0: 'Novice',
            1: 'Standard',
            2: 'Specialist'
        }
        df_new['expertise_level'] = df_new['readiness_score'].map(expertise_map)
        print(" expertise_level created.\n")

    # ---------------------------------------
    # C. One-Hot Encoding for new categorical bins
    # ---------------------------------------
    cat_cols = ['company_age_bin', 'wage_quartile', 'expertise_level']
    existing_cat_cols = [col for col in cat_cols if col in df_new.columns]

    if existing_cat_cols:
        print(f" Applying One-Hot Encoding to: {existing_cat_cols}")
        df_new = pd.get_dummies(df_new, columns=existing_cat_cols, drop_first=True)
        print(" One-Hot Encoding complete.\n")

    print(" Advanced Feature Engineering Completed Successfully!\n")
    return df_new


In [31]:
df_engineered = feature_creation(df_encoded)
df_engineered.head(5)


 Starting Advanced Feature Engineering...

 Creating 'request_ratio' feature...
 request_ratio created.

 Creating 'readiness_score' feature...
 readiness_score created.

 Creating 'high_investment_flag' feature...
 high_investment_flag created.

 Creating 'expertise_level' categorical feature...
 expertise_level created.

 Applying One-Hot Encoding to: ['expertise_level']
 One-Hot Encoding complete.

 Advanced Feature Engineering Completed Successfully!



Unnamed: 0,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,prevailing_wage,unit_of_wage,full_time_position,case_status,continent_asia,continent_europe,continent_north america,continent_oceania,continent_south america,region_of_employment_midwest,region_of_employment_northeast,region_of_employment_south,region_of_employment_west,request_ratio,readiness_score,high_investment_flag,expertise_level_Specialist,expertise_level_Standard
0,1,0,0,7227.0,2007.0,592.2029,hour,1,0,True,False,False,False,False,False,False,False,True,0.000138,0,0,False,False
1,3,1,0,2412.0,2002.0,83425.65,year,1,1,True,False,False,False,False,False,True,False,False,0.000415,1,0,False,True
2,2,0,1,7227.0,2008.0,122996.86,year,1,0,True,False,False,False,False,False,False,False,True,0.000138,1,1,False,True
3,2,0,0,98.0,1932.5,83434.03,year,1,0,True,False,False,False,False,False,False,False,True,0.010204,0,0,False,False
4,3,1,0,1082.0,2005.0,149907.39,year,1,1,False,False,False,False,False,False,False,True,False,0.000924,1,0,False,True


## **Feature Transformation and Scaling**

In [34]:
from sklearn.preprocessing import StandardScaler

def transform_and_scale_features(df_new):

    print("\Starting feature transformation and scaling...\n")

    # Identify numerical columns
    numeric_cols = ['no_of_employees', 'company_age', 'annual_wage']
    existing_numeric_cols = [col for col in numeric_cols if col in df_new.columns]

    # ------------------------------
    # A. Log Transformation
    # ------------------------------
    log_transform_cols = [col for col in ['no_of_employees', 'annual_wage'] if col in df_new.columns]
    for col in log_transform_cols:
        print(f"Applying log transformation to '{col}' to reduce skewness...")
        df_new[col] = np.log1p(df_new[col].clip(lower=0))  # clip to avoid negatives
    print("Log transformation complete.\n")

    # ------------------------------
    # B. Standard Scaling
    # ------------------------------
    if existing_numeric_cols:
        print(f"Applying StandardScaler to columns: {existing_numeric_cols}")
        scaler = StandardScaler()
        df_new[existing_numeric_cols] = scaler.fit_transform(df_new[existing_numeric_cols])
        print("Standard scaling complete.\n")

    print(" Numerical features transformed and standardized successfully!\n")

    return df_new

In [37]:
df_transformed = transform_and_scale_features(df_engineered)
df_transformed.head()

\Starting feature transformation and scaling...

Applying log transformation to 'no_of_employees' to reduce skewness...
Log transformation complete.

Applying StandardScaler to columns: ['no_of_employees']
Standard scaling complete.

 Numerical features transformed and standardized successfully!



Unnamed: 0,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,prevailing_wage,unit_of_wage,full_time_position,case_status,continent_asia,continent_europe,continent_north america,continent_oceania,continent_south america,region_of_employment_midwest,region_of_employment_northeast,region_of_employment_south,region_of_employment_west,request_ratio,readiness_score,high_investment_flag,expertise_level_Specialist,expertise_level_Standard
0,1,0,0,2.002815,2007.0,592.2029,hour,1,0,True,False,False,False,False,False,False,False,True,0.000138,0,0,False,False
1,3,1,0,0.048154,2002.0,83425.65,year,1,1,True,False,False,False,False,False,True,False,False,0.000415,1,0,False,True
2,2,0,1,2.002815,2008.0,122996.86,year,1,0,True,False,False,False,False,False,False,False,True,0.000138,1,1,False,True
3,2,0,0,-0.940584,1932.5,83434.03,year,1,0,True,False,False,False,False,False,False,False,True,0.010204,0,0,False,False
4,3,1,0,-0.940584,2005.0,149907.39,year,1,1,False,False,False,False,False,False,False,True,False,0.000924,1,0,False,True


## **Data Saving**

In [38]:
import os

def save_data_copy(df: pd.DataFrame, file_name: str = "cleaned_data_PP.csv", folder: str = "data_output"):
   
    # Ensure folder exists
    os.makedirs(folder, exist_ok=True)

    # Construct full path
    file_path = os.path.join(folder, file_name)

    # Save based on file extension
    ext = os.path.splitext(file_name)[1].lower()

    try:
        if ext == ".csv":
            df.to_csv(file_path, index=False)
        elif ext in [".xlsx", ".xls"]:
            df.to_excel(file_path, index=False)
        elif ext == ".parquet":
            df.to_parquet(file_path, index=False)
        else:
            raise ValueError("Unsupported file format. Use .csv, .xlsx, or .parquet")

        print(f"Data saved successfully at: {file_path}")
        return file_path

    except Exception as e:
        print(f"Failed to save data: {e}")
        return None


In [39]:
save_data_copy(df_transformed)

Data saved successfully at: data_output\cleaned_data_PP.csv


'data_output\\cleaned_data_PP.csv'