# 03 Feature Engineering

## a. Key Considerations from Exploratory Data Analysis

In [1]:
import os

# Move up to the project root so the relative "data/..." paths used by the
# later cells resolve. The guard makes the cell safe to re-run: without it,
# every extra execution would climb one directory further.
# NOTE(review): assumes the notebook's own folder contains no "data" directory.
if not os.path.isdir("data"):
    os.chdir("..")
print("Current working dir:", os.getcwd())

Current working dir: C:\Users\Window\Desktop\Everything_Data_Mentorship\mentorship_ds_project


## b. Imports 

In [2]:
import pandas as pd


## c. Loading Data 

In [3]:
# Load the cleaned dataset; the path is relative to the current working directory.
df = pd.read_csv("data/interim/cleaned_df.csv")

In [4]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,timestamp,age_range,gender,country,referral_source,years_experience,track_applied,weekly_commitment_hours,main_aim,motivation,skill_level,aptitude_test_completed,total_score,graduated
0,2024-12-01 23:50:47,18-24 years,Male,Kenya,Word of mouth,Less than six months,Data science,less than 6 hours,Upskill,to enter into the data analysis career,Beginner,Yes,58.67,No
1,2024-12-03 09:35:19,25-34 years,Male,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,To grow and improve my skills in data science ...,Elementary,Yes,70.0,No
2,2024-12-03 19:16:49,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,I’m motivated to join Everything Data to enhan...,Intermediate,Yes,64.33,Yes
3,2024-12-03 12:52:36,18-24 years,Female,Kenya,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,I'd like to upskill and Join the Data Community,Intermediate,Yes,75.0,No
4,2024-12-03 18:12:27,18-24 years,Male,Kenya,WhatsApp,Less than six months,Data science,7-14 hours,Upskill,I aim to join the mentorship program to enhanc...,Beginner,Yes,59.0,No


## d. Dropping Unnecessary features

Due to class imbalance in the country column, I opted to drop the feature from the dataset.

Due to feature irrelevance, I dropped the timestamp column too. The free-text motivation column is dropped in the same step.

In [5]:
# Remove the columns that will not be used as model features.
unused_columns = ["timestamp", "country", "motivation"]
df = df.drop(columns=unused_columns)

In [6]:
# Check remaining columns, dtypes and non-null counts after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115 entries, 0 to 114
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age_range                115 non-null    object 
 1   gender                   115 non-null    object 
 2   referral_source          115 non-null    object 
 3   years_experience         115 non-null    object 
 4   track_applied            115 non-null    object 
 5   weekly_commitment_hours  115 non-null    object 
 6   main_aim                 115 non-null    object 
 7   skill_level              114 non-null    object 
 8   aptitude_test_completed  115 non-null    object 
 9   total_score              115 non-null    float64
 10  graduated                115 non-null    object 
dtypes: float64(1), object(10)
memory usage: 10.0+ KB


## e. Handle Missing Data

In [7]:
# Fill missing skill_level entries with the most frequent value (the mode).
# NOTE(review): the mode is computed on the full dataset, i.e. before the
# train/test split performed later in this notebook.
most_common_skill = df['skill_level'].mode().iloc[0]
df['skill_level'] = df['skill_level'].fillna(most_common_skill)

In [8]:
# Inspect the skill_level distribution after imputation.
df['skill_level'].value_counts()

skill_level
Elementary      57
Beginner        42
Intermediate    16
Name: count, dtype: int64

In [9]:
# Confirm that no column has missing values left.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115 entries, 0 to 114
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age_range                115 non-null    object 
 1   gender                   115 non-null    object 
 2   referral_source          115 non-null    object 
 3   years_experience         115 non-null    object 
 4   track_applied            115 non-null    object 
 5   weekly_commitment_hours  115 non-null    object 
 6   main_aim                 115 non-null    object 
 7   skill_level              115 non-null    object 
 8   aptitude_test_completed  115 non-null    object 
 9   total_score              115 non-null    float64
 10  graduated                115 non-null    object 
dtypes: float64(1), object(10)
memory usage: 10.0+ KB


## f. Correct data Types

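I converted the ordinal columns (age_range, years_experience, weekly_commitment_hours and skill_level) to ordered category dtype. The nominal columns stay as object dtype and are one-hot encoded later.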

In [10]:
# I created a function to wrap it all and ease the conversion
def set_ordered_categories(df):
    """
    Convert the ordinal survey columns of ``df`` to ordered categorical dtype.

    The columns ``years_experience``, ``weekly_commitment_hours``,
    ``skill_level`` and ``age_range`` are converted. Long-form ``skill_level``
    answers are first mapped to their short labels ("Beginner", "Elementary",
    "Intermediate").

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns listed above. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the four columns are ordered categoricals.
        Values that are not in a column's category list become NaN, which is
        how ``pd.Categorical`` treats values outside ``categories``.
    """
    # Work on a copy so the caller's frame is left untouched, as documented.
    df = df.copy()

    # Map skill levels to shorter labels
    skill_map = {
        "Beginner - I have NO learning or work experience in data analysis/ data science": "Beginner",
        "Elementary - I have theoretical understanding of basic data analysis/ data science concepts": "Elementary",
        "Intermediate - I have theoretical knowledge and experience in data analysis/ data science": "Intermediate"
    }
    df["skill_level"] = df["skill_level"].replace(skill_map)

    # Category lists, each written from lowest to highest level
    ordered_levels = {
        "years_experience": [
            "Less than six months",
            "6 months - 1 year",
            "1-3 years",
            "4-6 years"
        ],
        "weekly_commitment_hours": ["less than 6 hours", "7-14 hours", "more than 14 hours"],
        "skill_level": ["Beginner", "Elementary", "Intermediate"],
        "age_range": ["18-24 years", "25-34 years", "35-44 years", "45-54 years"],
    }

    # Apply transformations
    for column, levels in ordered_levels.items():
        df[column] = pd.Categorical(df[column], categories=levels, ordered=True)

    return df

In [11]:
# Rebind df to the frame returned by the helper defined in the previous cell.
df = set_ordered_categories(df)

In [12]:
# dropna=False makes any value that fell outside the declared categories show up as NaN.
df['skill_level'].value_counts(dropna=False)

skill_level
Elementary      57
Beginner        42
Intermediate    16
Name: count, dtype: int64

In [13]:
# Verify the dtypes after the categorical conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115 entries, 0 to 114
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   age_range                115 non-null    category
 1   gender                   115 non-null    object  
 2   referral_source          115 non-null    object  
 3   years_experience         115 non-null    category
 4   track_applied            115 non-null    object  
 5   weekly_commitment_hours  115 non-null    category
 6   main_aim                 115 non-null    object  
 7   skill_level              115 non-null    category
 8   aptitude_test_completed  115 non-null    object  
 9   total_score              115 non-null    float64 
 10  graduated                115 non-null    object  
dtypes: category(4), float64(1), object(6)
memory usage: 7.5+ KB


## g. Binary-encode the target
The target is the graduated column; I map No to 0 and Yes to 1.

In [14]:
# Binary-encode the target column: "No" -> 0, "Yes" -> 1
target_encoding = {'No': 0, 'Yes': 1}
df['graduated'] = df['graduated'].map(target_encoding)
df.head()

Unnamed: 0,age_range,gender,referral_source,years_experience,track_applied,weekly_commitment_hours,main_aim,skill_level,aptitude_test_completed,total_score,graduated
0,18-24 years,Male,Word of mouth,Less than six months,Data science,less than 6 hours,Upskill,Beginner,Yes,58.67,0
1,25-34 years,Male,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,Elementary,Yes,70.0,0
2,18-24 years,Female,WhatsApp,6 months - 1 year,Data science,more than 14 hours,Upskill,Intermediate,Yes,64.33,1
3,18-24 years,Female,WhatsApp,6 months - 1 year,Data science,7-14 hours,Upskill,Intermediate,Yes,75.0,0
4,18-24 years,Male,WhatsApp,Less than six months,Data science,7-14 hours,Upskill,Beginner,Yes,59.0,0


In [15]:
# Check the dtype and non-null count of graduated after the mapping.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115 entries, 0 to 114
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   age_range                115 non-null    category
 1   gender                   115 non-null    object  
 2   referral_source          115 non-null    object  
 3   years_experience         115 non-null    category
 4   track_applied            115 non-null    object  
 5   weekly_commitment_hours  115 non-null    category
 6   main_aim                 115 non-null    object  
 7   skill_level              115 non-null    category
 8   aptitude_test_completed  115 non-null    object  
 9   total_score              115 non-null    float64 
 10  graduated                115 non-null    int64   
dtypes: category(4), float64(1), int64(1), object(5)
memory usage: 7.5+ KB


In [16]:
# Look at the main_aim levels and their frequencies (NaN included) before encoding.
df['main_aim'].value_counts(dropna=False)

main_aim
Upskill         74
Learn afresh    23
Portfolio       15
Networking       2
Other            1
Name: count, dtype: int64

## h. Perform a train-test split

In [17]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors
y = df['graduated']
X = df.drop(columns='graduated')

# 80/20 split, stratified on the target, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## i. Encode Categorical Variables

Considerations

Ordinal Encoding for ordered categorical data. These columns are age_range, years_experience, weekly_commitment_hours and skill_level

For nominal features, we'll apply one-hot encoding. These columns are: gender, referral_source, track_applied, main_aim and aptitude_test_completed.



In [18]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer


# Column groups: ordered categoricals vs. unordered (nominal) categoricals
ordinal_cols = ['age_range', 'years_experience', 'weekly_commitment_hours', 'skill_level']
nominal_cols = ['gender', 'referral_source', 'track_applied', 'main_aim', 'aptitude_test_completed']

# Reuse the category order already stored on each categorical column so the
# ordinal codes follow the intended low-to-high ranking.
ordinal_categories = []
for col in ordinal_cols:
    ordinal_categories.append(list(X_train[col].cat.categories))

ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
# NOTE(review): with drop='first' and handle_unknown='ignore', a category not
# seen during fit is encoded as all zeros, the same encoding as the dropped
# baseline level. Confirm this is acceptable for the rare levels.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')

# Assemble the ColumnTransformer; columns not listed are passed through untouched
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', ordinal_encoder, ordinal_cols),
        ('onehot', onehot_encoder, nominal_cols),
    ],
    remainder='passthrough',
)

# Learn the encoding from the training split only, then apply it to both splits
X_train_final = preprocessor.fit_transform(X_train)
X_test_final = preprocessor.transform(X_test)

# Optional: rebuild the output column names for inspection.
# Order: ordinal columns, one-hot columns, then the passed-through total_score.
fitted_onehot = preprocessor.named_transformers_['onehot']
oh_names = fitted_onehot.get_feature_names_out(nominal_cols)
feature_names = [*ordinal_cols, *oh_names, 'total_score']

In [19]:
# Display the one-hot feature names produced by the fitted encoder.
oh_names

array(['gender_Male', 'referral_source_Instagram',
       'referral_source_LinkedIn', 'referral_source_Twitter',
       'referral_source_WhatsApp', 'referral_source_Word of mouth',
       'referral_source_through a geeks for geeks webinar',
       'track_applied_Data science', 'main_aim_Networking',
       'main_aim_Other', 'main_aim_Portfolio', 'main_aim_Upskill',
       'aptitude_test_completed_Yes'], dtype=object)

In [20]:
# Display the full list of output feature names assembled above.
feature_names

['age_range',
 'years_experience',
 'weekly_commitment_hours',
 'skill_level',
 'gender_Male',
 'referral_source_Instagram',
 'referral_source_LinkedIn',
 'referral_source_Twitter',
 'referral_source_WhatsApp',
 'referral_source_Word of mouth',
 'referral_source_through a geeks for geeks webinar',
 'track_applied_Data science',
 'main_aim_Networking',
 'main_aim_Other',
 'main_aim_Portfolio',
 'main_aim_Upskill',
 'aptitude_test_completed_Yes',
 'total_score']

## j. Save the fitted preprocessor and the data splits for use in modeling

In [22]:
import os

import joblib

# Create the output folders first: joblib.dump and DataFrame.to_csv do not
# create missing parent directories and would raise on a fresh checkout.
os.makedirs('artifacts', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

# Save the fitted preprocessor
joblib.dump(preprocessor, 'artifacts/preprocessor.joblib')

# Saving the splits (un-encoded; the saved preprocessor is applied at modeling time).
# Note: CSV does not keep the category dtype of the ordinal columns.
X_train.to_csv('data/processed/X_train.csv', index=False)
X_test.to_csv('data/processed/X_test.csv', index=False)
y_train.to_csv('data/processed/y_train.csv', index=False)
y_test.to_csv('data/processed/y_test.csv', index=False)