# Education & Career Success Analysis using Iterative Soft-thresholding for LASSO

Loading the Data from CSV



In [9]:
import pandas as pd
# The dataset originates from Kaggle (download command kept below for reference, disabled).
# The notebook instead reads a CSV hosted on GitHub — presumably a copy of that dataset; verify.
#!wget https://www.kaggle.com/datasets/adilshamim8/education-and-career-success?resource=download  -O dataset.zip
url = 'https://raw.githubusercontent.com/AlanZhong35/Math535_Project_CareerData/refs/heads/master/education_career_success.csv'
df = pd.read_csv(url)
# Show the first 10 rows to verify the data loaded correctly
df.head(10)

Unnamed: 0,Student_ID,Age,Gender,High_School_GPA,SAT_Score,University_Ranking,University_GPA,Field_of_Study,Internships_Completed,Projects_Completed,Certifications,Soft_Skills_Score,Networking_Score,Job_Offers,Starting_Salary,Career_Satisfaction,Years_to_Promotion,Current_Job_Level,Work_Life_Balance,Entrepreneurship
0,S00001,24,Male,3.58,1052,291,3.96,Arts,3,7,2,9,8,5,27200.0,4,5,Entry,7,No
1,S00002,21,Other,2.52,1211,112,3.63,Law,4,7,3,8,1,4,25000.0,1,1,Mid,7,No
2,S00003,28,Female,3.42,1193,715,2.63,Medicine,4,8,1,1,9,0,42400.0,9,3,Entry,7,No
3,S00004,25,Male,2.43,1497,170,2.81,Computer Science,3,9,1,10,6,1,57400.0,7,5,Mid,5,No
4,S00005,22,Male,2.08,1012,599,2.48,Engineering,4,6,4,10,9,4,47600.0,9,5,Entry,2,No
5,S00006,24,Male,2.4,1600,631,3.78,Law,2,3,2,2,2,1,68400.0,9,2,Entry,8,Yes
6,S00007,27,Male,2.36,1011,610,3.83,Computer Science,0,1,3,3,3,2,55500.0,7,4,Mid,3,No
7,S00008,20,Male,2.68,1074,240,2.84,Computer Science,1,5,5,5,1,2,38000.0,2,3,Entry,3,No
8,S00009,24,Male,2.84,1201,337,3.31,Business,2,3,0,5,5,2,68900.0,2,2,Entry,2,No
9,S00010,28,Male,3.02,1415,138,2.33,Computer Science,1,5,3,10,2,0,58900.0,4,2,Senior,2,No


Data Preprocessing

In [10]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Name of the column to predict. Swap to the commented-out line to model job offers instead.
#target_col = "Job_Offers"
target_col = "Starting_Salary"

In [12]:
# Categorical columns that must be one-hot encoded before regression
categorical_cols = ['Gender', 'Field_of_Study']

# Replace each categorical column with indicator columns. drop_first=True omits one
# level per column to avoid the dummy variable trap.
# NOTE: this rebinds `df`, so re-running this cell without reloading the data raises
# a KeyError (the categorical columns no longer exist after the first run).
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Target variable: whichever column target_col names
y = df[target_col]

# Feature matrix: drop the student identifier and every career-outcome column
# (this list covers both candidate targets, so the target never leaks into X)
drops = [
    "Job_Offers",
    "Student_ID",
    "Entrepreneurship",
    "Current_Job_Level",
    "Career_Satisfaction",
    "Years_to_Promotion",
    "Starting_Salary",
    "Work_Life_Balance",
]
X = df.drop(columns=drops)

Perform Analysis on the Selected Target (Starting Salary)

In [13]:
# Split dataset into train and test sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for Lasso). The statistics are learned from the
# training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Standardize the target with its own scaler. Re-fitting `scaler` on y would overwrite
# the feature statistics, leaving `scaler` unable to transform X afterwards.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1)).flatten()

In [14]:
# ISTA implementation for Lasso regression
def soft_thresholding(x, alpha, lambda_):
    """Shrink every entry of x toward zero by alpha * lambda_.

    Entries whose magnitude is at most alpha * lambda_ become zero; the rest
    keep their sign and have their magnitude reduced by that amount.
    """
    threshold = alpha * lambda_
    shrunk_magnitude = np.maximum(np.abs(x) - threshold, 0)
    return np.sign(x) * shrunk_magnitude

def ista(X, y, lambda_, alpha=1e-3, max_iter=1000, tol=1e-6):
    """Estimate Lasso coefficients with the Iterative Soft-Thresholding Algorithm.

    Each iteration takes a gradient step on the mean squared-error term
    (1 / (2m)) * ||X @ beta - y||^2 and then applies soft-thresholding with
    threshold alpha * lambda_, which handles the L1 penalty lambda_ * ||beta||_1.

    Parameters
    ----------
    X : 2-D array of shape (m, n)
        Feature matrix.
    y : 1-D array of length m
        Target values.
    lambda_ : float
        L1 regularization strength.
    alpha : float, default 1e-3
        Step size of the gradient step. NOTE: standard ISTA theory requires
        alpha <= 1 / L, with L the largest eigenvalue of X.T @ X / m; this is
        not checked here.
    max_iter : int, default 1000
        Maximum number of iterations.
    tol : float, default 1e-6
        Stop once the Euclidean norm of the change in beta falls below this.

    Returns
    -------
    numpy.ndarray of length n
        The most recent coefficient iterate.
    """
    m, n = X.shape
    beta = np.zeros(n)

    for _ in range(max_iter):
        gradient = X.T @ (X @ beta - y) / m
        beta_new = soft_thresholding(beta - alpha * gradient, alpha, lambda_)

        step_norm = np.linalg.norm(beta_new - beta, ord=2)
        # Commit the update before testing convergence so the returned value is
        # the latest iterate rather than the one preceding it.
        beta = beta_new

        if step_norm < tol:
            break

    return beta

In [15]:
# Train ISTA-based Lasso model
# lambda_ is the L1 penalty weight passed to ista; the step size, iteration cap and
# tolerance are left at ista's defaults (alpha=1e-3, max_iter=1000, tol=1e-6).
lambda_ = 0.005  # Adjust this value for regularization
beta_ista = ista(X_train_scaled, y_train_scaled, lambda_)

In [16]:
# Display feature importance
# Label each coefficient with the column of X it was fitted on. Using X.columns (the
# frame that was split and scaled for training) keeps the labels aligned with
# beta_ista even if df is modified later.
feature_importance = pd.Series(beta_ista, index=X.columns)
# Rank by absolute coefficient size; the sign (direction of the effect) is discarded here
sorted_features = feature_importance.abs().sort_values(ascending=False)
print(f"Feature Importance (ISTA Lasso) for predicting {target_col}:\n", sorted_features)

Feature Importance (ISTA Lasso) for predicting Starting_Salary:
 Internships_Completed              0.013612
Field_of_Study_Law                 0.009995
University_Ranking                 0.009057
Certifications                     0.008917
Field_of_Study_Medicine            0.008152
Field_of_Study_Business            0.003515
Soft_Skills_Score                  0.002962
Gender_Male                        0.002493
High_School_GPA                    0.001999
University_GPA                     0.001986
Projects_Completed                 0.000467
Field_of_Study_Computer Science    0.000261
Age                                0.000000
SAT_Score                          0.000000
Networking_Score                   0.000000
Gender_Other                       0.000000
Field_of_Study_Engineering         0.000000
Field_of_Study_Mathematics         0.000000
dtype: float64
